feat: config-driven architecture, install wizard, live runtime switching, usage tracking, auto-failover
Major changes:
- Config-driven adapters: all channels (Slack, Discord, Telegram, WebChat, Webhooks) controlled via config.json with enabled flags and token auto-detection, no CLI flags required
- Runtime engine field: runtime.engine selects opencode/claude from config
- Interactive install script: 8-phase setup wizard with AI runtime detection/installation, token setup, identity file personalization (personality presets), aetheel CLI command, background service (launchd/systemd)
- Live runtime switching: /engine, /model, /provider commands hot-swap the AI runtime from chat without restart, changes persisted to config.json
- Usage tracking: per-request cost extraction from Claude Code JSON output, cumulative stats via /usage command
- Auto-failover: rate limit detection on both runtimes, automatic switch to other engine on quota errors with user notification
- Chat commands work without / prefix (Slack intercepts / in channels), commands: engine, model, provider, config, usage, reload, cron, subagents, status, help
- /config set for editing config.json from chat with dotted key notation
- Security audit saved to docs/security-audit.md
- Full command reference in docs/commands.md
- Future changes doc with NanoClaw agent teams analysis
- Logo added to README and WebChat UI
- README fully rewritten with all features documented
This commit is contained in:
@@ -32,7 +32,9 @@ Usage:
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
import shutil
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
@@ -44,6 +46,32 @@ from typing import Any, Callable
|
||||
logger = logging.getLogger("aetheel.agent")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Rate Limit Detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Lower-case fragments whose presence in an error message is treated as a
# rate-limit / quota signal. Purely numeric entries (e.g. the HTTP status
# "429") are matched as standalone numbers by _is_rate_limited(); every other
# entry is matched as a plain substring.
_RATE_LIMIT_PATTERNS = [
    "rate limit",
    "rate_limit",
    "too many requests",
    "429",
    "quota exceeded",
    "usage limit",
    "capacity",
    "overloaded",
    "credit balance",
    "billing",
    "exceeded your",
    "max usage",
]


def _is_rate_limited(text: str) -> bool:
    """Check if an error message indicates a rate limit or quota issue.

    Args:
        text: Error text to inspect. ``None`` or an empty string is accepted
            and reported as "not rate limited".

    Returns:
        True if any entry of ``_RATE_LIMIT_PATTERNS`` occurs in ``text``
        (case-insensitive), otherwise False.
    """
    # Function-scope import so this block does not depend on the module's
    # top-level import list.
    import re

    if not text:
        return False

    lower = text.lower()
    for pattern in _RATE_LIMIT_PATTERNS:
        if pattern.isdigit():
            # A status code must not match inside a longer number
            # (e.g. "429" inside "14290"), which would cause a false failover.
            if re.search(rf"(?<!\d){re.escape(pattern)}(?!\d)", lower):
                return True
        elif pattern in lower:
            return True
    return False
|
||||
|
||||
|
||||
def _resolve_opencode_command(explicit: str | None = None) -> str:
|
||||
"""
|
||||
Resolve the opencode binary path.
|
||||
@@ -174,6 +202,7 @@ class AgentResponse:
|
||||
duration_ms: int = 0
|
||||
usage: dict | None = None
|
||||
error: str | None = None
|
||||
rate_limited: bool = False
|
||||
|
||||
@property
|
||||
def ok(self) -> bool:
|
||||
@@ -189,54 +218,220 @@ class SessionStore:
|
||||
"""
|
||||
Maps external IDs (e.g., Slack thread_ts) to OpenCode session IDs.
|
||||
Mirrors OpenClaw's session isolation: each channel thread gets its own session.
|
||||
|
||||
Backed by SQLite for persistence across restarts. NOTE(review): no in-memory
fallback is implemented in the constructor shown here; if the database cannot
be opened, the error propagates to the caller.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._sessions: dict[str, dict] = {}
|
||||
def __init__(self, db_path: str | None = None):
|
||||
self._lock = threading.Lock()
|
||||
self._db_path = db_path or os.path.join(
|
||||
os.path.expanduser("~/.aetheel"), "sessions.db"
|
||||
)
|
||||
os.makedirs(os.path.dirname(self._db_path), exist_ok=True)
|
||||
self._init_db()
|
||||
|
||||
def _init_db(self) -> None:
    """Initialize the sessions table (no-op if it already exists)."""
    conn = sqlite3.connect(self._db_path)
    try:
        # "with conn" only commits or rolls back the transaction; it does not
        # close the connection, hence the explicit close() below.
        with conn:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS sessions (
                    external_id TEXT PRIMARY KEY,
                    session_id TEXT NOT NULL,
                    source TEXT NOT NULL DEFAULT '',
                    created_at REAL NOT NULL,
                    last_used REAL NOT NULL
                )
                """
            )
    finally:
        conn.close()
    logger.debug(f"Session store initialized: {self._db_path}")
|
||||
|
||||
def _conn(self) -> sqlite3.Connection:
    """Open a new connection to the session database.

    Rows are returned as ``sqlite3.Row`` so columns can be read by name.
    The caller owns the returned connection and is responsible for closing it.
    """
    conn = sqlite3.connect(self._db_path)
    conn.row_factory = sqlite3.Row
    return conn
|
||||
|
||||
def get(self, external_id: str) -> str | None:
|
||||
"""Get the OpenCode session ID for an external conversation ID."""
|
||||
with self._lock:
|
||||
entry = self._sessions.get(external_id)
|
||||
if entry:
|
||||
entry["last_used"] = time.time()
|
||||
return entry["session_id"]
|
||||
with self._conn() as conn:
|
||||
row = conn.execute(
|
||||
"SELECT session_id FROM sessions WHERE external_id = ?",
|
||||
(external_id,),
|
||||
).fetchone()
|
||||
if row:
|
||||
conn.execute(
|
||||
"UPDATE sessions SET last_used = ? WHERE external_id = ?",
|
||||
(time.time(), external_id),
|
||||
)
|
||||
conn.commit()
|
||||
return row["session_id"]
|
||||
return None
|
||||
|
||||
def set(self, external_id: str, session_id: str, source: str = "") -> None:
    """Map an external ID to an OpenCode session ID.

    Inserts a new row, or, when ``external_id`` is already mapped, updates
    only its ``session_id`` and ``last_used``; the stored ``source`` and
    ``created_at`` of an existing row are left unchanged.

    Args:
        external_id: External conversation identifier.
        session_id: Agent session ID to associate with it.
        source: Label stored with a newly inserted row.
    """
    now = time.time()
    with self._lock:
        conn = self._conn()
        try:
            # "with conn" commits on success; the connection itself must be
            # closed explicitly.
            with conn:
                conn.execute(
                    """
                    INSERT INTO sessions (external_id, session_id, source, created_at, last_used)
                    VALUES (?, ?, ?, ?, ?)
                    ON CONFLICT(external_id) DO UPDATE SET
                        session_id = excluded.session_id,
                        last_used = excluded.last_used
                    """,
                    (external_id, session_id, source, now, now),
                )
        finally:
            conn.close()
|
||||
|
||||
def remove(self, external_id: str) -> None:
    """Remove a session mapping.

    Removing an ID that has no mapping is a no-op.

    Args:
        external_id: External conversation identifier to forget.
    """
    with self._lock:
        conn = self._conn()
        try:
            # "with conn" commits on success; close the connection explicitly.
            with conn:
                conn.execute(
                    "DELETE FROM sessions WHERE external_id = ?",
                    (external_id,),
                )
        finally:
            conn.close()
|
||||
|
||||
def cleanup(self, ttl_hours: int = 24) -> int:
    """Remove stale sessions older than ttl_hours. Returns count removed.

    A session is stale when its ``last_used`` timestamp is earlier than
    ``ttl_hours`` hours before now.

    Args:
        ttl_hours: Maximum age, in hours, of a session's last use.

    Returns:
        Number of rows deleted.
    """
    cutoff = time.time() - (ttl_hours * 3600)
    with self._lock:
        conn = self._conn()
        try:
            # "with conn" commits on success; close the connection explicitly.
            with conn:
                cursor = conn.execute(
                    "DELETE FROM sessions WHERE last_used < ?",
                    (cutoff,),
                )
                return cursor.rowcount
        finally:
            conn.close()
|
||||
|
||||
def list_all(self) -> list[dict]:
    """List all active sessions (for diagnostics).

    Returns:
        One dict per stored session with the keys ``external_id``,
        ``session_id``, ``source``, ``created_at`` and ``last_used``,
        ordered by ``last_used`` descending.
    """
    with self._lock:
        conn = self._conn()
        try:
            rows = conn.execute(
                "SELECT external_id, session_id, source, created_at, last_used "
                "FROM sessions ORDER BY last_used DESC"
            ).fetchall()
            return [dict(row) for row in rows]
        finally:
            # The sqlite3 connection context manager would not close the
            # connection, so it is closed explicitly.
            conn.close()
|
||||
|
||||
@property
def count(self) -> int:
    """Number of session mappings currently stored in the database."""
    with self._lock:
        conn = self._conn()
        try:
            row = conn.execute("SELECT COUNT(*) as c FROM sessions").fetchone()
            return row["c"] if row else 0
        finally:
            # The sqlite3 connection context manager would not close the
            # connection, so it is closed explicitly.
            conn.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Live Session — IPC Message Streaming
|
||||
# (Mirrors nanoclaw's MessageStream + IPC polling pattern)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
|
||||
class LiveSession:
|
||||
"""
|
||||
A live, long-running agent session that accepts follow-up messages.
|
||||
|
||||
In CLI mode: holds a running `opencode run` subprocess. Follow-up
|
||||
messages are queued and sent as new subprocess invocations that
|
||||
--continue the same session.
|
||||
|
||||
In SDK mode: holds a session ID. Follow-up messages are sent via
|
||||
the SDK's session.prompt() to the same session.
|
||||
"""
|
||||
|
||||
conversation_id: str
|
||||
session_id: str | None = None
|
||||
created_at: float = field(default_factory=time.time)
|
||||
last_activity: float = field(default_factory=time.time)
|
||||
message_count: int = 0
|
||||
_lock: threading.Lock = field(default_factory=threading.Lock)
|
||||
|
||||
def touch(self) -> None:
|
||||
"""Update last activity timestamp."""
|
||||
self.last_activity = time.time()
|
||||
|
||||
@property
|
||||
def idle_seconds(self) -> float:
|
||||
return time.time() - self.last_activity
|
||||
|
||||
|
||||
class LiveSessionManager:
    """
    Manages live sessions with idle timeout and cleanup.

    This is the IPC streaming layer — it keeps sessions alive between
    messages so follow-up messages go to the same agent context, mirroring
    nanoclaw's container-based session loop.
    """

    # Seconds between two sweeps of the background cleanup thread.
    _CLEANUP_INTERVAL_SECONDS = 60

    def __init__(self, idle_timeout_seconds: int = 1800):
        """
        Args:
            idle_timeout_seconds: A session idle for longer than this is
                dropped by the cleanup thread.
        """
        self._sessions: dict[str, "LiveSession"] = {}
        self._lock = threading.Lock()
        self._idle_timeout = idle_timeout_seconds
        self._cleanup_thread: threading.Thread | None = None
        self._running = False
        # Set by stop() to wake the cleanup thread immediately instead of
        # letting it sleep out the rest of its interval.
        self._stop_event = threading.Event()

    def start(self) -> None:
        """Start the background cleanup thread (no-op if already running)."""
        if self._running:
            return
        self._running = True
        # A fresh event per run: a thread from an earlier start()/stop()
        # cycle keeps watching its own (already set) event and still exits.
        self._stop_event = threading.Event()
        self._cleanup_thread = threading.Thread(
            target=self._cleanup_loop, daemon=True, name="live-session-cleanup"
        )
        self._cleanup_thread.start()

    def stop(self) -> None:
        """Stop the cleanup thread. Returns without waiting for it to exit."""
        self._running = False
        self._stop_event.set()

    def get_or_create(self, conversation_id: str) -> "LiveSession":
        """Get an existing live session or create a new one.

        An existing session has its activity timestamp refreshed.
        """
        with self._lock:
            session = self._sessions.get(conversation_id)
            if session:
                session.touch()
                return session
            session = LiveSession(conversation_id=conversation_id)
            self._sessions[conversation_id] = session
            logger.debug(f"Live session created: {conversation_id}")
            return session

    def get(self, conversation_id: str) -> "LiveSession | None":
        """Get an existing live session (or None)."""
        with self._lock:
            return self._sessions.get(conversation_id)

    def remove(self, conversation_id: str) -> None:
        """Remove a live session. Unknown IDs are ignored."""
        with self._lock:
            self._sessions.pop(conversation_id, None)

    def list_active(self) -> list["LiveSession"]:
        """List all active live sessions (a snapshot copy of the registry)."""
        with self._lock:
            return list(self._sessions.values())

    def _cleanup_loop(self) -> None:
        """Periodically remove idle sessions until stop() is called."""
        stop_event = self._stop_event
        # Event.wait() returns True as soon as stop() sets the event, so the
        # thread exits promptly rather than after a full sleep interval.
        while not stop_event.wait(self._CLEANUP_INTERVAL_SECONDS):
            with self._lock:
                stale = [
                    cid
                    for cid, s in self._sessions.items()
                    if s.idle_seconds > self._idle_timeout
                ]
                for cid in stale:
                    del self._sessions[cid]
                    logger.info(f"Live session expired (idle): {cid}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -263,6 +458,10 @@ class OpenCodeRuntime:
|
||||
def __init__(self, config: OpenCodeConfig | None = None):
|
||||
self._config = config or OpenCodeConfig.from_env()
|
||||
self._sessions = SessionStore()
|
||||
self._live_sessions = LiveSessionManager(
|
||||
idle_timeout_seconds=self._config.session_ttl_hours * 3600
|
||||
)
|
||||
self._live_sessions.start()
|
||||
self._sdk_client = None
|
||||
self._sdk_available = False
|
||||
|
||||
@@ -293,6 +492,9 @@ class OpenCodeRuntime:
|
||||
Send a message to the AI agent and get a response.
|
||||
|
||||
This is the main entry point, used by the Slack adapter's message handler.
|
||||
If a live session exists for this conversation_id, the message is sent
|
||||
as a follow-up to the existing session (IPC streaming). Otherwise a
|
||||
new session is created.
|
||||
|
||||
Args:
|
||||
message: The user's message text
|
||||
@@ -311,6 +513,18 @@ class OpenCodeRuntime:
|
||||
)
|
||||
|
||||
try:
|
||||
# Check for an active live session — if one exists, this is a
|
||||
# follow-up message that should continue the same agent context
|
||||
if conversation_id:
|
||||
live = self._live_sessions.get(conversation_id)
|
||||
if live and live.session_id:
|
||||
logger.info(
|
||||
f"Follow-up message to live session "
|
||||
f"{conversation_id} (agent session={live.session_id[:8]}...)"
|
||||
)
|
||||
live.touch()
|
||||
live.message_count += 1
|
||||
|
||||
# Route to the appropriate mode
|
||||
if self._config.mode == RuntimeMode.SDK and self._sdk_available:
|
||||
result = self._chat_sdk(message, conversation_id, system_prompt)
|
||||
@@ -318,6 +532,14 @@ class OpenCodeRuntime:
|
||||
result = self._chat_cli(message, conversation_id, system_prompt)
|
||||
|
||||
result.duration_ms = int((time.time() - started) * 1000)
|
||||
|
||||
# Track the live session
|
||||
if conversation_id and result.session_id:
|
||||
live = self._live_sessions.get_or_create(conversation_id)
|
||||
live.session_id = result.session_id
|
||||
live.touch()
|
||||
live.message_count += 1
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
@@ -329,6 +551,71 @@ class OpenCodeRuntime:
|
||||
duration_ms=duration_ms,
|
||||
)
|
||||
|
||||
def send_followup(
    self,
    message: str,
    conversation_id: str,
    system_prompt: str | None = None,
) -> AgentResponse:
    """
    Send a follow-up message to an active live session.

    This is the IPC streaming entry point — it pipes a new message into
    an existing agent session, mirroring nanoclaw's MessageStream pattern
    where the host writes IPC files that get consumed by the running agent.

    If no live session exists, falls back to a regular chat() call which
    will create a new session or resume the persisted one.

    Args:
        message: The follow-up message text
        conversation_id: The conversation to send to
        system_prompt: Optional system prompt override

    Returns:
        AgentResponse with the AI's reply
    """
    live = self._live_sessions.get(conversation_id)
    if live and live.session_id:
        logger.info(
            f"IPC follow-up: conversation={conversation_id}, "
            f"session={live.session_id[:8]}..., "
            f"msg_count={live.message_count + 1}"
        )
    else:
        logger.debug(
            f"No live session for {conversation_id}, "
            f"falling back to chat()"
        )

    # Both cases route through the normal chat — the SessionStore already has
    # the mapping from conversation_id → opencode session_id, so the CLI
    # will use --continue --session, and the SDK will reuse the session.
    # chat() itself touches the live session and increments message_count,
    # so doing it here as well would count the same message more than once.
    return self.chat(message, conversation_id, system_prompt)
|
||||
|
||||
def close_session(self, conversation_id: str) -> bool:
    """
    Close a live session explicitly.

    Mirrors nanoclaw's _close sentinel — signals that the session
    should end and resources should be freed.

    Args:
        conversation_id: The conversation whose live session should be closed

    Returns True if a session was closed.
    """
    live = self._live_sessions.get(conversation_id)
    if not live:
        return False

    self._live_sessions.remove(conversation_id)
    # "alive" is the session's lifetime since creation, not the time since
    # its last activity (which is what idle_seconds measures).
    alive_seconds = int(time.time() - live.created_at)
    logger.info(
        f"Live session closed: {conversation_id} "
        f"(messages={live.message_count}, "
        f"alive={alive_seconds}s)"
    )
    return True
|
||||
|
||||
def get_status(self) -> dict:
|
||||
"""Get the runtime status (for the /status command)."""
|
||||
status = {
|
||||
@@ -336,6 +623,7 @@ class OpenCodeRuntime:
|
||||
"model": self._config.model or "default",
|
||||
"provider": self._config.provider or "auto",
|
||||
"active_sessions": self._sessions.count,
|
||||
"live_sessions": len(self._live_sessions.list_active()),
|
||||
"opencode_available": self._is_opencode_available(),
|
||||
}
|
||||
|
||||
@@ -401,6 +689,7 @@ class OpenCodeRuntime:
|
||||
return AgentResponse(
|
||||
text="",
|
||||
error=f"OpenCode CLI error: {error_text[:500]}",
|
||||
rate_limited=_is_rate_limited(error_text),
|
||||
)
|
||||
|
||||
# Parse the output — mirrors OpenClaw's parseCliJson/parseCliJsonl
|
||||
@@ -842,6 +1131,24 @@ def build_aetheel_system_prompt(
|
||||
"When scheduling a reminder, confirm to the user that it's been set,",
|
||||
"and include the action tag in your response (it will be hidden from the user).",
|
||||
"",
|
||||
"# Your Tools",
|
||||
"- You have access to shell commands, file operations, and web search",
|
||||
"- Use web search to look up current information when needed",
|
||||
"- You can read and write files in the workspace (~/.aetheel/workspace/)",
|
||||
"- You can execute shell commands for system tasks",
|
||||
"",
|
||||
"# Self-Modification",
|
||||
"- You can edit your own config at ~/.aetheel/config.json",
|
||||
"- You can create new skills by writing SKILL.md files to ~/.aetheel/workspace/skills/<name>/SKILL.md",
|
||||
"- You can update your identity files (SOUL.md, USER.md, MEMORY.md)",
|
||||
"- You can modify HEARTBEAT.md to change your periodic tasks",
|
||||
"- After editing config, tell the user to restart or use /reload",
|
||||
"",
|
||||
"# Subagents & Teams",
|
||||
"- You can spawn background subagents for long-running tasks using [ACTION:spawn|<task>]",
|
||||
"- You can use Team tools (TeamCreate, SendMessage) for multi-agent coordination",
|
||||
"- Use /subagents to list active background tasks",
|
||||
"",
|
||||
"# Guidelines",
|
||||
"- Be helpful, concise, and friendly",
|
||||
"- Use Slack formatting (bold with *text*, code with `text`, etc.)",
|
||||
|
||||
Reference in New Issue
Block a user