feat: config-driven architecture, install wizard, live runtime switching, usage tracking, auto-failover

Major changes:
- Config-driven adapters: all channels (Slack, Discord, Telegram, WebChat, Webhooks) are controlled via config.json with enabled flags and token auto-detection; no CLI flags required
- Runtime engine field: runtime.engine selects opencode/claude from config
- Interactive install script: 8-phase setup wizard with AI runtime detection/installation, token setup, identity file personalization (personality presets), aetheel CLI command, background service (launchd/systemd)
- Live runtime switching: /engine, /model, /provider commands hot-swap the AI runtime from chat without restart; changes are persisted to config.json
- Usage tracking: per-request cost extraction from Claude Code JSON output, cumulative stats via /usage command
- Auto-failover: rate limit detection on both runtimes, automatic switch to the other engine on quota errors with user notification
- Chat commands work without the / prefix (Slack intercepts / in channels); available commands: engine, model, provider, config, usage, reload, cron, subagents, status, help
- /config set for editing config.json from chat with dotted key notation
- Security audit saved to docs/security-audit.md
- Full command reference in docs/commands.md
- Future changes doc with NanoClaw agent teams analysis
- Logo added to README and WebChat UI
- README fully rewritten with all features documented
2026-02-18 01:07:12 -05:00
parent 41b2f9a593
commit 6d73f74e0b
41 changed files with 11363 additions and 437 deletions
--- a/agent/opencode_runtime.py
+++ b/agent/opencode_runtime.py
@@ -32,7 +32,9 @@ Usage:
 import json
 import logging
 import os
+import queue
 import shutil
+import sqlite3
 import subprocess
 import threading
 import time
@@ -44,6 +46,32 @@ from typing import Any, Callable
 logger = logging.getLogger("aetheel.agent")


+# ---------------------------------------------------------------------------
+# Rate Limit Detection
+# ---------------------------------------------------------------------------
+
+# Lower-case fragments that mark an error message as a rate-limit / quota
+# problem. Purely numeric entries (HTTP status codes) are matched as
+# standalone numbers; everything else is matched as a plain substring.
+_RATE_LIMIT_PATTERNS = [
+    "rate limit",
+    "rate_limit",
+    "too many requests",
+    "429",
+    "quota exceeded",
+    "usage limit",
+    "capacity",
+    "overloaded",
+    "credit balance",
+    "billing",
+    "exceeded your",
+    "max usage",
+]
+
+
+def _is_rate_limited(text: str) -> bool:
+    """
+    Check if an error message indicates a rate limit or quota issue.
+
+    Matching is case-insensitive. Numeric patterns such as ``429`` only
+    match when they are not embedded in a longer number, so a message like
+    "timed out after 14290 ms" is not mistaken for an HTTP 429.
+
+    Args:
+        text: The error text to inspect. Empty or ``None`` is never a match.
+
+    Returns:
+        True if any known rate-limit pattern occurs in ``text``.
+    """
+    import re  # local import: keeps this block independent of file-level imports
+
+    if not text:
+        return False
+    lower = text.lower()
+    for pattern in _RATE_LIMIT_PATTERNS:
+        if pattern.isdigit():
+            # Status codes must stand alone — reject digits on either side.
+            if re.search(rf"(?<!\d){pattern}(?!\d)", lower):
+                return True
+        elif pattern in lower:
+            return True
+    return False
+
+
 def _resolve_opencode_command(explicit: str | None = None) -> str:
    """
    Resolve the opencode binary path.
@@ -174,6 +202,7 @@ class AgentResponse:
    duration_ms: int = 0
    usage: dict | None = None
    error: str | None = None
+    rate_limited: bool = False

    @property
    def ok(self) -> bool:
@@ -189,54 +218,220 @@ class SessionStore:
    """
    Maps external IDs (e.g., Slack thread_ts) to OpenCode session IDs.
    Mirrors OpenClaw's session isolation: each channel thread gets its own session.
+
+    Backed by SQLite for persistence across restarts. There is no in-memory
+    fallback: if the database cannot be opened, construction raises.
    """

-    def __init__(self):
-        self._sessions: dict[str, dict] = {}
+    def __init__(self, db_path: str | None = None):
+        """
+        Open (creating if necessary) the SQLite-backed session store.
+
+        Args:
+            db_path: Path of the SQLite database file. Defaults to
+                ``~/.aetheel/sessions.db``.
+
+        Raises:
+            OSError: if the parent directory cannot be created.
+            sqlite3.Error: if the database cannot be opened or initialized.
+        """
        self._lock = threading.Lock()
+        self._db_path = db_path or os.path.join(
+            os.path.expanduser("~/.aetheel"), "sessions.db"
+        )
+        # A bare filename ("sessions.db") has an empty dirname, and
+        # os.makedirs("") raises FileNotFoundError — only create the parent
+        # directory when the path actually names one.
+        parent_dir = os.path.dirname(self._db_path)
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+        self._init_db()
+
+    def _init_db(self) -> None:
+        """
+        Create the ``sessions`` table if it does not exist yet.
+
+        Raises:
+            sqlite3.Error: if the database file cannot be opened or written.
+        """
+        # sqlite3's connection context manager only commits or rolls back the
+        # transaction; it never closes the connection, so close it explicitly.
+        conn = sqlite3.connect(self._db_path)
+        try:
+            conn.execute(
+                """
+                CREATE TABLE IF NOT EXISTS sessions (
+                    external_id  TEXT PRIMARY KEY,
+                    session_id   TEXT NOT NULL,
+                    source       TEXT NOT NULL DEFAULT '',
+                    created_at   REAL NOT NULL,
+                    last_used    REAL NOT NULL
+                )
+                """
+            )
+            conn.commit()
+        finally:
+            conn.close()
+        logger.debug(f"Session store initialized: {self._db_path}")
+
+    def _conn(self) -> sqlite3.Connection:
+        """Open a fresh connection to the store whose rows are ``sqlite3.Row``
+        objects (addressable by column name). The caller owns the connection."""
+        connection = sqlite3.connect(self._db_path)
+        connection.row_factory = sqlite3.Row
+        return connection

    def get(self, external_id: str) -> str | None:
        """Get the OpenCode session ID for an external conversation ID."""
        with self._lock:
-            entry = self._sessions.get(external_id)
-            if entry:
-                entry["last_used"] = time.time()
-                return entry["session_id"]
+            # sqlite3's connection context manager only scopes a transaction;
+            # it never closes the connection, so close it explicitly.
+            conn = self._conn()
+            try:
+                row = conn.execute(
+                    "SELECT session_id FROM sessions WHERE external_id = ?",
+                    (external_id,),
+                ).fetchone()
+                if row:
+                    # A successful lookup counts as use: refresh last_used so
+                    # cleanup() does not expire an active mapping.
+                    conn.execute(
+                        "UPDATE sessions SET last_used = ? WHERE external_id = ?",
+                        (time.time(), external_id),
+                    )
+                    conn.commit()
+                    return row["session_id"]
+            finally:
+                conn.close()
            return None

-    def set(self, external_id: str, session_id: str) -> None:
+    def set(self, external_id: str, session_id: str, source: str = "") -> None:
        """Map an external ID to an OpenCode session ID."""
+        # Upsert: a new external_id inserts a full row; an existing one only
+        # gets its session_id and last_used replaced (source and created_at
+        # keep their original values).
+        now = time.time()
        with self._lock:
-            self._sessions[external_id] = {
-                "session_id": session_id,
-                "created": time.time(),
-                "last_used": time.time(),
-            }
+            # sqlite3's connection context manager only scopes a transaction;
+            # it never closes the connection, so close it explicitly.
+            conn = self._conn()
+            try:
+                conn.execute(
+                    """
+                    INSERT INTO sessions (external_id, session_id, source, created_at, last_used)
+                    VALUES (?, ?, ?, ?, ?)
+                    ON CONFLICT(external_id) DO UPDATE SET
+                        session_id = excluded.session_id,
+                        last_used  = excluded.last_used
+                    """,
+                    (external_id, session_id, source, now, now),
+                )
+                conn.commit()
+            finally:
+                conn.close()

    def remove(self, external_id: str) -> None:
-        """Remove -a session mapping."""
+        """Remove a session mapping. Unknown IDs are ignored."""
        with self._lock:
-            self._sessions.pop(external_id, None)
+            # sqlite3's connection context manager only scopes a transaction;
+            # it never closes the connection, so close it explicitly.
+            conn = self._conn()
+            try:
+                conn.execute(
+                    "DELETE FROM sessions WHERE external_id = ?",
+                    (external_id,),
+                )
+                conn.commit()
+            finally:
+                conn.close()

    def cleanup(self, ttl_hours: int = 24) -> int:
        """Remove stale sessions older than ttl_hours. Returns count removed."""
        cutoff = time.time() - (ttl_hours * 3600)
-        removed = 0
        with self._lock:
-            stale = [
-                k
-                for k, v in self._sessions.items()
-                if v["last_used"] < cutoff
-            ]
-            for k in stale:
-                del self._sessions[k]
-                removed += 1
-        return removed
+            # sqlite3's connection context manager only scopes a transaction;
+            # it never closes the connection, so close it explicitly.
+            conn = self._conn()
+            try:
+                # Staleness is judged by last_used, not created_at.
+                cursor = conn.execute(
+                    "DELETE FROM sessions WHERE last_used < ?",
+                    (cutoff,),
+                )
+                conn.commit()
+                return cursor.rowcount
+            finally:
+                conn.close()
+
+    def list_all(self) -> list[dict]:
+        """
+        List all stored sessions (for diagnostics), most recently used first.
+
+        Returns:
+            One dict per row with the keys ``external_id``, ``session_id``,
+            ``source``, ``created_at`` and ``last_used``.
+        """
+        with self._lock:
+            # sqlite3's connection context manager only scopes a transaction;
+            # it never closes the connection, so close it explicitly.
+            conn = self._conn()
+            try:
+                rows = conn.execute(
+                    "SELECT external_id, session_id, source, created_at, last_used "
+                    "FROM sessions ORDER BY last_used DESC"
+                ).fetchall()
+                return [dict(row) for row in rows]
+            finally:
+                conn.close()

    @property
    def count(self) -> int:
        with self._lock:
-            return len(self._sessions)
+            # Number of rows currently in the sessions table.
+            # sqlite3's connection context manager only scopes a transaction;
+            # it never closes the connection, so close it explicitly.
+            conn = self._conn()
+            try:
+                row = conn.execute("SELECT COUNT(*) as c FROM sessions").fetchone()
+                return row["c"] if row else 0
+            finally:
+                conn.close()
+
+
+# ---------------------------------------------------------------------------
+# Live Session — IPC Message Streaming
+# (Mirrors nanoclaw's MessageStream + IPC polling pattern)
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class LiveSession:
+    """
+    Bookkeeping record for one conversation's live agent session.
+
+    The record stores the conversation ID, the agent session ID once it is
+    known, creation/last-activity timestamps and a message counter. It holds
+    no subprocess or SDK handle itself; follow-up messages are routed by
+    whoever owns the record using ``session_id``.
+    """
+
+    conversation_id: str
+    # Agent-side session ID; None until the first response supplies one.
+    session_id: str | None = None
+    created_at: float = field(default_factory=time.time)
+    last_activity: float = field(default_factory=time.time)
+    message_count: int = 0
+    # Per-session lock. Kept out of the generated __repr__/__eq__: a lock is
+    # not part of the session's value, and lock objects only compare equal
+    # by identity, which would make two otherwise identical records unequal.
+    _lock: threading.Lock = field(
+        default_factory=threading.Lock, repr=False, compare=False
+    )
+
+    def touch(self) -> None:
+        """Update last activity timestamp to the current time."""
+        self.last_activity = time.time()
+
+    @property
+    def idle_seconds(self) -> float:
+        """Seconds elapsed since the last recorded activity."""
+        return time.time() - self.last_activity
+
+
+class LiveSessionManager:
+    """
+    Manages live sessions with idle timeout and cleanup.
+
+    Sessions are kept in memory, keyed by conversation ID, so follow-up
+    messages can be associated with the same agent context. A daemon thread
+    started by ``start()`` periodically drops sessions that have been idle
+    longer than the configured timeout.
+    """
+
+    def __init__(
+        self,
+        idle_timeout_seconds: int = 1800,
+        cleanup_interval_seconds: float = 60.0,
+    ):
+        """
+        Args:
+            idle_timeout_seconds: Idle time after which a session is expired.
+            cleanup_interval_seconds: Pause between two expiry sweeps.
+        """
+        self._sessions: dict[str, LiveSession] = {}
+        self._lock = threading.Lock()
+        self._idle_timeout = idle_timeout_seconds
+        self._cleanup_interval = cleanup_interval_seconds
+        self._cleanup_thread: threading.Thread | None = None
+        self._stop_event = threading.Event()
+        self._running = False
+
+    def start(self) -> None:
+        """Start the background cleanup thread (no-op if already running)."""
+        with self._lock:
+            if self._running:
+                return
+            self._running = True
+            # Each thread gets its own stop event, so a thread from an
+            # earlier start()/stop() cycle can never be revived by a restart.
+            self._stop_event = threading.Event()
+            self._cleanup_thread = threading.Thread(
+                target=self._cleanup_loop,
+                args=(self._stop_event,),
+                daemon=True,
+                name="live-session-cleanup",
+            )
+            self._cleanup_thread.start()
+
+    def stop(self) -> None:
+        """Stop the cleanup thread; it is woken immediately, not after a sleep."""
+        with self._lock:
+            self._running = False
+            self._stop_event.set()
+
+    def get_or_create(self, conversation_id: str) -> LiveSession:
+        """Get an existing live session (refreshing its activity) or create one."""
+        with self._lock:
+            session = self._sessions.get(conversation_id)
+            if session:
+                session.touch()
+                return session
+            session = LiveSession(conversation_id=conversation_id)
+            self._sessions[conversation_id] = session
+            logger.debug(f"Live session created: {conversation_id}")
+            return session
+
+    def get(self, conversation_id: str) -> LiveSession | None:
+        """Get an existing live session (or None). Does not refresh activity."""
+        with self._lock:
+            return self._sessions.get(conversation_id)
+
+    def remove(self, conversation_id: str) -> None:
+        """Remove a live session. Unknown IDs are ignored."""
+        with self._lock:
+            self._sessions.pop(conversation_id, None)
+
+    def list_active(self) -> list[LiveSession]:
+        """List all active live sessions (a snapshot copy)."""
+        with self._lock:
+            return list(self._sessions.values())
+
+    def _expire_idle(self) -> list[str]:
+        """Drop sessions idle longer than the timeout; return their IDs."""
+        with self._lock:
+            stale = [
+                cid
+                for cid, s in self._sessions.items()
+                if s.idle_seconds > self._idle_timeout
+            ]
+            for cid in stale:
+                del self._sessions[cid]
+        for cid in stale:
+            logger.info(f"Live session expired (idle): {cid}")
+        return stale
+
+    def _cleanup_loop(self, stop_event: threading.Event | None = None) -> None:
+        """Periodically remove idle sessions until the stop event is set."""
+        stop_event = stop_event or self._stop_event
+        # Event.wait() returns True as soon as stop() sets the event, so the
+        # loop ends promptly instead of finishing a full sleep first.
+        while not stop_event.wait(self._cleanup_interval):
+            self._expire_idle()


 # ---------------------------------------------------------------------------
@@ -263,6 +458,10 @@ class OpenCodeRuntime:
    def __init__(self, config: OpenCodeConfig | None = None):
        self._config = config or OpenCodeConfig.from_env()
        self._sessions = SessionStore()
+        self._live_sessions = LiveSessionManager(
+            idle_timeout_seconds=self._config.session_ttl_hours * 3600
+        )
+        self._live_sessions.start()
        self._sdk_client = None
        self._sdk_available = False

@@ -293,6 +492,9 @@ class OpenCodeRuntime:
        Send a message to the AI agent and get a response.

        This is the main entry point, used by the Slack adapter's message handler.
+        If a live session exists for this conversation_id, the message is sent
+        as a follow-up to the existing session (IPC streaming). Otherwise a
+        new session is created.

        Args:
            message: The user's message text
@@ -311,6 +513,18 @@ class OpenCodeRuntime:
            )

        try:
+            # Check for an active live session — if one exists, this is a
+            # follow-up message that should continue the same agent context
+            if conversation_id:
+                live = self._live_sessions.get(conversation_id)
+                if live and live.session_id:
+                    logger.info(
+                        f"Follow-up message to live session "
+                        f"{conversation_id} (agent session={live.session_id[:8]}...)"
+                    )
+                    live.touch()
+                    live.message_count += 1
+
            # Route to the appropriate mode
            if self._config.mode == RuntimeMode.SDK and self._sdk_available:
                result = self._chat_sdk(message, conversation_id, system_prompt)
@@ -318,6 +532,14 @@ class OpenCodeRuntime:
                result = self._chat_cli(message, conversation_id, system_prompt)

            result.duration_ms = int((time.time() - started) * 1000)
+
+            # Track the live session
+            if conversation_id and result.session_id:
+                live = self._live_sessions.get_or_create(conversation_id)
+                live.session_id = result.session_id
+                live.touch()
+                live.message_count += 1
+
            return result

        except Exception as e:
@@ -329,6 +551,71 @@ class OpenCodeRuntime:
                duration_ms=duration_ms,
            )

+    def send_followup(
+        self,
+        message: str,
+        conversation_id: str,
+        system_prompt: str | None = None,
+    ) -> AgentResponse:
+        """
+        Send a follow-up message to an active live session.
+
+        The message is always delivered through chat(); this method only
+        adds logging that distinguishes a follow-up to a known live session
+        from a message that has no live session yet.
+
+        Args:
+            message: The follow-up message text
+            conversation_id: The conversation to send to
+            system_prompt: Optional system prompt override
+
+        Returns:
+            AgentResponse with the AI's reply
+        """
+        live = self._live_sessions.get(conversation_id)
+        if not live or not live.session_id:
+            logger.debug(
+                f"No live session for {conversation_id}, "
+                f"falling back to chat()"
+            )
+            return self.chat(message, conversation_id, system_prompt)
+
+        logger.info(
+            f"IPC follow-up: conversation={conversation_id}, "
+            f"session={live.session_id[:8]}..., "
+            f"msg_count={live.message_count + 1}"
+        )
+
+        # No touch()/message_count bookkeeping here: chat() already touches
+        # the live session and increments message_count for this message, so
+        # doing it again would count every follow-up more than once.
+        return self.chat(message, conversation_id, system_prompt)
+
+    def close_session(self, conversation_id: str) -> bool:
+        """
+        Close a live session explicitly.
+
+        Removes the in-memory live-session record for the conversation and
+        logs its message count and lifetime.
+
+        Args:
+            conversation_id: The conversation whose live session to close.
+
+        Returns:
+            True if a session was closed, False if none existed.
+        """
+        live = self._live_sessions.get(conversation_id)
+        if live is None:
+            return False
+        self._live_sessions.remove(conversation_id)
+        # "alive" is the session's lifetime (since creation), not the time
+        # since its last activity.
+        logger.info(
+            f"Live session closed: {conversation_id} "
+            f"(messages={live.message_count}, "
+            f"alive={int(time.time() - live.created_at)}s)"
+        )
+        return True
+
    def get_status(self) -> dict:
        """Get the runtime status (for the /status command)."""
        status = {
@@ -336,6 +623,7 @@ class OpenCodeRuntime:
            "model": self._config.model or "default",
            "provider": self._config.provider or "auto",
            "active_sessions": self._sessions.count,
+            "live_sessions": len(self._live_sessions.list_active()),
            "opencode_available": self._is_opencode_available(),
        }

@@ -401,6 +689,7 @@ class OpenCodeRuntime:
                return AgentResponse(
                    text="",
                    error=f"OpenCode CLI error: {error_text[:500]}",
+                    rate_limited=_is_rate_limited(error_text),
                )

            # Parse the output — mirrors OpenClaw's parseCliJson/parseCliJsonl
@@ -842,6 +1131,24 @@ def build_aetheel_system_prompt(
        "When scheduling a reminder, confirm to the user that it's been set,",
        "and include the action tag in your response (it will be hidden from the user).",
        "",
+        "# Your Tools",
+        "- You have access to shell commands, file operations, and web search",
+        "- Use web search to look up current information when needed",
+        "- You can read and write files in the workspace (~/.aetheel/workspace/)",
+        "- You can execute shell commands for system tasks",
+        "",
+        "# Self-Modification",
+        "- You can edit your own config at ~/.aetheel/config.json",
+        "- You can create new skills by writing SKILL.md files to ~/.aetheel/workspace/skills/<name>/SKILL.md",
+        "- You can update your identity files (SOUL.md, USER.md, MEMORY.md)",
+        "- You can modify HEARTBEAT.md to change your periodic tasks",
+        "- After editing config, tell the user to restart or use /reload",
+        "",
+        "# Subagents & Teams",
+        "- You can spawn background subagents for long-running tasks using [ACTION:spawn|<task>]",
+        "- You can use Team tools (TeamCreate, SendMessage) for multi-agent coordination",
+        "- Use /subagents to list active background tasks",
+        "",
        "# Guidelines",
        "- Be helpful, concise, and friendly",
        "- Use Slack formatting (bold with *text*, code with `text`, etc.)",