feat: config-driven architecture, install wizard, live runtime switching, usage tracking, auto-failover

Major changes:
- Config-driven adapters: all channels (Slack, Discord, Telegram, WebChat, Webhooks) controlled via config.json with enabled flags and token auto-detection, no CLI flags required
- Runtime engine field: runtime.engine selects opencode/claude from config
- Interactive install script: 8-phase setup wizard with AI runtime detection/installation, token setup, identity file personalization (personality presets), aetheel CLI command, background service (launchd/systemd)
- Live runtime switching: /engine, /model, /provider commands hot-swap the AI runtime from chat without restart, changes persisted to config.json
- Usage tracking: per-request cost extraction from Claude Code JSON output, cumulative stats via /usage command
- Auto-failover: rate limit detection on both runtimes, automatic switch to other engine on quota errors with user notification
- Chat commands work without / prefix (Slack intercepts / in channels), commands: engine, model, provider, config, usage, reload, cron, subagents, status, help
- /config set for editing config.json from chat with dotted key notation
- Security audit saved to docs/security-audit.md
- Full command reference in docs/commands.md
- Future changes doc with NanoClaw agent teams analysis
- Logo added to README and WebChat UI
- README fully rewritten with all features documented
This commit is contained in:
2026-02-18 01:07:12 -05:00
parent 41b2f9a593
commit 6d73f74e0b
41 changed files with 11363 additions and 437 deletions

View File

@@ -32,7 +32,9 @@ Usage:
import json
import logging
import os
import queue
import re
import shutil
import sqlite3
import subprocess
import threading
import time
@@ -44,6 +46,32 @@ from typing import Any, Callable
logger = logging.getLogger("aetheel.agent")
# ---------------------------------------------------------------------------
# Rate Limit Detection
# ---------------------------------------------------------------------------
_RATE_LIMIT_PATTERNS = [
"rate limit",
"rate_limit",
"too many requests",
"429",
"quota exceeded",
"usage limit",
"capacity",
"overloaded",
"credit balance",
"billing",
"exceeded your",
"max usage",
]
def _is_rate_limited(text: str) -> bool:
"""Check if an error message indicates a rate limit or quota issue."""
lower = text.lower()
return any(pattern in lower for pattern in _RATE_LIMIT_PATTERNS)
def _resolve_opencode_command(explicit: str | None = None) -> str:
"""
Resolve the opencode binary path.
@@ -174,6 +202,7 @@ class AgentResponse:
duration_ms: int = 0
usage: dict | None = None
error: str | None = None
rate_limited: bool = False
@property
def ok(self) -> bool:
@@ -189,54 +218,220 @@ class SessionStore:
"""
Maps external IDs (e.g., Slack thread_ts) to OpenCode session IDs.
Mirrors OpenClaw's session isolation: each channel thread gets its own session.
Backed by SQLite for persistence across restarts. The database is opened
when the store is constructed; no in-memory fallback is implemented, so a
database that cannot be opened raises from the constructor.
"""
def __init__(self):
self._sessions: dict[str, dict] = {}
def __init__(self, db_path: str | None = None):
self._lock = threading.Lock()
self._db_path = db_path or os.path.join(
os.path.expanduser("~/.aetheel"), "sessions.db"
)
os.makedirs(os.path.dirname(self._db_path), exist_ok=True)
self._init_db()
def _init_db(self) -> None:
    """Create the ``sessions`` table if it does not exist yet."""
    conn = sqlite3.connect(self._db_path)
    try:
        # "with conn" only commits (or rolls back) the transaction; it does
        # not close the connection, hence the explicit close() below.
        with conn:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS sessions (
                    external_id TEXT PRIMARY KEY,
                    session_id TEXT NOT NULL,
                    source TEXT NOT NULL DEFAULT '',
                    created_at REAL NOT NULL,
                    last_used REAL NOT NULL
                )
                """
            )
    finally:
        conn.close()
    logger.debug(f"Session store initialized: {self._db_path}")
def _conn(self) -> sqlite3.Connection:
conn = sqlite3.connect(self._db_path)
conn.row_factory = sqlite3.Row
return conn
def get(self, external_id: str) -> str | None:
"""Get the OpenCode session ID for an external conversation ID."""
with self._lock:
entry = self._sessions.get(external_id)
if entry:
entry["last_used"] = time.time()
return entry["session_id"]
with self._conn() as conn:
row = conn.execute(
"SELECT session_id FROM sessions WHERE external_id = ?",
(external_id,),
).fetchone()
if row:
conn.execute(
"UPDATE sessions SET last_used = ? WHERE external_id = ?",
(time.time(), external_id),
)
conn.commit()
return row["session_id"]
return None
def set(self, external_id: str, session_id: str) -> None:
def set(self, external_id: str, session_id: str, source: str = "") -> None:
"""Map an external ID to an OpenCode session ID."""
now = time.time()
with self._lock:
self._sessions[external_id] = {
"session_id": session_id,
"created": time.time(),
"last_used": time.time(),
}
with self._conn() as conn:
conn.execute(
"""
INSERT INTO sessions (external_id, session_id, source, created_at, last_used)
VALUES (?, ?, ?, ?, ?)
ON CONFLICT(external_id) DO UPDATE SET
session_id = excluded.session_id,
last_used = excluded.last_used
""",
(external_id, session_id, source, now, now),
)
conn.commit()
def remove(self, external_id: str) -> None:
"""Remove -a session mapping."""
"""Remove a session mapping."""
with self._lock:
self._sessions.pop(external_id, None)
with self._conn() as conn:
conn.execute(
"DELETE FROM sessions WHERE external_id = ?",
(external_id,),
)
conn.commit()
def cleanup(self, ttl_hours: int = 24) -> int:
"""Remove stale sessions older than ttl_hours. Returns count removed."""
cutoff = time.time() - (ttl_hours * 3600)
removed = 0
with self._lock:
stale = [
k
for k, v in self._sessions.items()
if v["last_used"] < cutoff
]
for k in stale:
del self._sessions[k]
removed += 1
return removed
with self._conn() as conn:
cursor = conn.execute(
"DELETE FROM sessions WHERE last_used < ?",
(cutoff,),
)
conn.commit()
return cursor.rowcount
def list_all(self) -> list[dict]:
"""List all active sessions (for diagnostics)."""
with self._lock:
with self._conn() as conn:
rows = conn.execute(
"SELECT external_id, session_id, source, created_at, last_used "
"FROM sessions ORDER BY last_used DESC"
).fetchall()
return [dict(row) for row in rows]
@property
def count(self) -> int:
with self._lock:
return len(self._sessions)
with self._conn() as conn:
row = conn.execute("SELECT COUNT(*) as c FROM sessions").fetchone()
return row["c"] if row else 0
# ---------------------------------------------------------------------------
# Live Session — IPC Message Streaming
# (Mirrors nanoclaw's MessageStream + IPC polling pattern)
# ---------------------------------------------------------------------------
@dataclass
class LiveSession:
"""
A live, long-running agent session that accepts follow-up messages.
In CLI mode: holds a running `opencode run` subprocess. Follow-up
messages are queued and sent as new subprocess invocations that
--continue the same session.
In SDK mode: holds a session ID. Follow-up messages are sent via
the SDK's session.prompt() to the same session.
"""
conversation_id: str
session_id: str | None = None
created_at: float = field(default_factory=time.time)
last_activity: float = field(default_factory=time.time)
message_count: int = 0
_lock: threading.Lock = field(default_factory=threading.Lock)
def touch(self) -> None:
"""Update last activity timestamp."""
self.last_activity = time.time()
@property
def idle_seconds(self) -> float:
return time.time() - self.last_activity
class LiveSessionManager:
"""
Manages live sessions with idle timeout and cleanup.
This is the IPC streaming layer — it keeps sessions alive between
messages so follow-up messages go to the same agent context, mirroring
nanoclaw's container-based session loop.
"""
def __init__(self, idle_timeout_seconds: int = 1800):
self._sessions: dict[str, LiveSession] = {}
self._lock = threading.Lock()
self._idle_timeout = idle_timeout_seconds
self._cleanup_thread: threading.Thread | None = None
self._running = False
def start(self) -> None:
"""Start the background cleanup thread."""
if self._running:
return
self._running = True
self._cleanup_thread = threading.Thread(
target=self._cleanup_loop, daemon=True, name="live-session-cleanup"
)
self._cleanup_thread.start()
def stop(self) -> None:
"""Stop the cleanup thread."""
self._running = False
def get_or_create(self, conversation_id: str) -> LiveSession:
"""Get an existing live session or create a new one."""
with self._lock:
session = self._sessions.get(conversation_id)
if session:
session.touch()
return session
session = LiveSession(conversation_id=conversation_id)
self._sessions[conversation_id] = session
logger.debug(f"Live session created: {conversation_id}")
return session
def get(self, conversation_id: str) -> LiveSession | None:
"""Get an existing live session (or None)."""
with self._lock:
return self._sessions.get(conversation_id)
def remove(self, conversation_id: str) -> None:
"""Remove a live session."""
with self._lock:
self._sessions.pop(conversation_id, None)
def list_active(self) -> list[LiveSession]:
"""List all active live sessions."""
with self._lock:
return list(self._sessions.values())
def _cleanup_loop(self) -> None:
"""Periodically remove idle sessions."""
while self._running:
time.sleep(60)
with self._lock:
stale = [
cid
for cid, s in self._sessions.items()
if s.idle_seconds > self._idle_timeout
]
for cid in stale:
del self._sessions[cid]
logger.info(f"Live session expired (idle): {cid}")
# ---------------------------------------------------------------------------
@@ -263,6 +458,10 @@ class OpenCodeRuntime:
def __init__(self, config: OpenCodeConfig | None = None):
self._config = config or OpenCodeConfig.from_env()
self._sessions = SessionStore()
self._live_sessions = LiveSessionManager(
idle_timeout_seconds=self._config.session_ttl_hours * 3600
)
self._live_sessions.start()
self._sdk_client = None
self._sdk_available = False
@@ -293,6 +492,9 @@ class OpenCodeRuntime:
Send a message to the AI agent and get a response.
This is the main entry point, used by the Slack adapter's message handler.
If a live session exists for this conversation_id, the message is sent
as a follow-up to the existing session (IPC streaming). Otherwise a
new session is created.
Args:
message: The user's message text
@@ -311,6 +513,18 @@ class OpenCodeRuntime:
)
try:
# Check for an active live session — if one exists, this is a
# follow-up message that should continue the same agent context
if conversation_id:
live = self._live_sessions.get(conversation_id)
if live and live.session_id:
logger.info(
f"Follow-up message to live session "
f"{conversation_id} (agent session={live.session_id[:8]}...)"
)
live.touch()
live.message_count += 1
# Route to the appropriate mode
if self._config.mode == RuntimeMode.SDK and self._sdk_available:
result = self._chat_sdk(message, conversation_id, system_prompt)
@@ -318,6 +532,14 @@ class OpenCodeRuntime:
result = self._chat_cli(message, conversation_id, system_prompt)
result.duration_ms = int((time.time() - started) * 1000)
# Track the live session
if conversation_id and result.session_id:
live = self._live_sessions.get_or_create(conversation_id)
live.session_id = result.session_id
live.touch()
live.message_count += 1
return result
except Exception as e:
@@ -329,6 +551,71 @@ class OpenCodeRuntime:
duration_ms=duration_ms,
)
def send_followup(
    self,
    message: str,
    conversation_id: str,
    system_prompt: str | None = None,
) -> AgentResponse:
    """
    Send a follow-up message to an active live session.

    This is the IPC streaming entry point — it pipes a new message into
    an existing agent session, mirroring nanoclaw's MessageStream pattern
    where the host writes IPC files that get consumed by the running agent.

    If no live session exists, falls back to a regular chat() call which
    will create a new session or resume the persisted one.

    Args:
        message: The follow-up message text
        conversation_id: The conversation to send to
        system_prompt: Optional system prompt override

    Returns:
        AgentResponse with the AI's reply
    """
    live = self._live_sessions.get(conversation_id)
    if live and live.session_id:
        logger.info(
            f"IPC follow-up: conversation={conversation_id}, "
            f"session={live.session_id[:8]}..., "
            f"msg_count={live.message_count + 1}"
        )
    else:
        logger.debug(
            f"No live session for {conversation_id}, "
            f"falling back to chat()"
        )

    # Both cases go through chat(). chat() itself touches the live session
    # and increments its message_count, so doing it here as well would
    # count the same follow-up message more than once.
    return self.chat(message, conversation_id, system_prompt)
def close_session(self, conversation_id: str) -> bool:
    """
    Close a live session explicitly.

    Mirrors nanoclaw's _close sentinel — signals that the session
    should end and resources should be freed.

    Args:
        conversation_id: The conversation whose live session to close.

    Returns True if a session was closed.
    """
    live = self._live_sessions.get(conversation_id)
    if not live:
        return False

    self._live_sessions.remove(conversation_id)
    # "alive" is the session's lifetime (time since creation), not the time
    # since its last activity.
    alive_seconds = int(time.time() - live.created_at)
    logger.info(
        f"Live session closed: {conversation_id} "
        f"(messages={live.message_count}, "
        f"alive={alive_seconds}s)"
    )
    return True
def get_status(self) -> dict:
"""Get the runtime status (for the /status command)."""
status = {
@@ -336,6 +623,7 @@ class OpenCodeRuntime:
"model": self._config.model or "default",
"provider": self._config.provider or "auto",
"active_sessions": self._sessions.count,
"live_sessions": len(self._live_sessions.list_active()),
"opencode_available": self._is_opencode_available(),
}
@@ -401,6 +689,7 @@ class OpenCodeRuntime:
return AgentResponse(
text="",
error=f"OpenCode CLI error: {error_text[:500]}",
rate_limited=_is_rate_limited(error_text),
)
# Parse the output — mirrors OpenClaw's parseCliJson/parseCliJsonl
@@ -842,6 +1131,24 @@ def build_aetheel_system_prompt(
"When scheduling a reminder, confirm to the user that it's been set,",
"and include the action tag in your response (it will be hidden from the user).",
"",
"# Your Tools",
"- You have access to shell commands, file operations, and web search",
"- Use web search to look up current information when needed",
"- You can read and write files in the workspace (~/.aetheel/workspace/)",
"- You can execute shell commands for system tasks",
"",
"# Self-Modification",
"- You can edit your own config at ~/.aetheel/config.json",
"- You can create new skills by writing SKILL.md files to ~/.aetheel/workspace/skills/<name>/SKILL.md",
"- You can update your identity files (SOUL.md, USER.md, MEMORY.md)",
"- You can modify HEARTBEAT.md to change your periodic tasks",
"- After editing config, tell the user to restart or use /reload",
"",
"# Subagents & Teams",
"- You can spawn background subagents for long-running tasks using [ACTION:spawn|<task>]",
"- You can use Team tools (TeamCreate, SendMessage) for multi-agent coordination",
"- Use /subagents to list active background tasks",
"",
"# Guidelines",
"- Be helpful, concise, and friendly",
"- Use Slack formatting (bold with *text*, code with `text`, etc.)",