"""
MemoryManager — the main memory system orchestrator.

Port of OpenClaw's MemoryIndexManager (src/memory/manager.ts, 2,300 LOC).
Lifecycle: sync → chunk → embed → store → search

Key features:
  • Incremental sync — only re-indexes changed files (hash-based)
  • Hybrid search — vector (0.7) + BM25 keyword (0.3)
  • File watching — marks the index dirty on workspace changes (via watchdog)
  • Embedding cache — avoids re-computing embeddings for unchanged chunks
  • Session log indexing — indexes daily/ conversation transcripts
"""

import json
import logging
import os
import sqlite3
import threading
import time
import uuid
from pathlib import Path

from memory.embeddings import embed_batch, embed_query, get_embedding_dims
from memory.hybrid import bm25_rank_to_score, build_fts_query, merge_hybrid_results
from memory.internal import (
    build_file_entry,
    chunk_markdown,
    cosine_similarity,
    hash_text,
    list_memory_files,
)
from memory.schema import ensure_schema
from memory.types import (
    MemoryConfig,
    MemorySearchResult,
    MemorySource,
)

logger = logging.getLogger("aetheel.memory")

# Maximum number of characters of chunk text returned as a search snippet.
SNIPPET_MAX_CHARS = 700


class MemoryManager:
    """
    Main memory system — manages the full lifecycle:
    sync → chunk → embed → store → search

    Inspired by OpenClaw's MemoryIndexManager.

    The manager owns a single SQLite connection (opened with
    ``check_same_thread=False``) and can be used as a context manager;
    ``close()`` stops the file watcher and closes the connection.
    """

    def __init__(self, config: MemoryConfig | None = None):
        """
        Resolve paths, create directories, open the database and bootstrap
        the default identity files.

        Args:
            config: Memory configuration. A default ``MemoryConfig()`` is
                used when omitted.
        """
        self._config = config or MemoryConfig()
        self._workspace_dir = str(
            Path(self._config.workspace_dir).expanduser().resolve()
        )
        self._db_path = str(Path(self._config.db_path).expanduser().resolve())
        self._sessions_dir = (
            str(Path(self._config.sessions_dir).expanduser().resolve())
            if self._config.sessions_dir
            else os.path.join(self._workspace_dir, "daily")
        )

        # Ensure directories exist
        os.makedirs(self._workspace_dir, exist_ok=True)
        os.makedirs(self._sessions_dir, exist_ok=True)
        os.makedirs(os.path.dirname(self._db_path), exist_ok=True)

        # Open database and ensure schema
        self._db = sqlite3.connect(self._db_path, check_same_thread=False)
        self._db.row_factory = sqlite3.Row
        schema_result = ensure_schema(self._db, fts_enabled=True)
        self._fts_available = schema_result.get("fts_available", False)

        # State
        self._dirty = True
        self._syncing = False
        self._sync_lock = threading.Lock()
        self._closed = False
        self._watcher = None

        # Create default identity files if they don't exist
        self._ensure_identity_files()

        logger.info(
            f"MemoryManager initialized: workspace={self._workspace_dir}, "
            f"db={self._db_path}, fts={self._fts_available}"
        )

    # ── Identity File Bootstrap ──────────────────────────────────────

    def _ensure_identity_files(self) -> None:
        """Create default SOUL.md, USER.md, MEMORY.md if they don't exist."""
        defaults = {
            "SOUL.md": (
                "# SOUL.md — Who You Are\n\n"
                "_You're not a chatbot. You're becoming someone._\n\n"
                "## Core Truths\n\n"
                "**Be genuinely helpful, not performatively helpful.** "
                "Skip the filler — just help.\n\n"
                "**Have opinions.** You're allowed to disagree, prefer things, "
                "find stuff amusing or boring.\n\n"
                "**Be resourceful before asking.** Try to figure it out first. "
                "Then ask if you're stuck.\n\n"
                "**Earn trust through competence.** Be careful with external actions. "
                "Be bold with internal ones.\n\n"
                "## Boundaries\n\n"
                "- Private things stay private. Period.\n"
                "- When in doubt, ask before acting externally.\n"
                "- Never send half-baked replies.\n\n"
                "## Continuity\n\n"
                "Each session, you wake up fresh. These files _are_ your memory. "
                "Read them. Update them. They're how you persist.\n\n"
                "---\n\n"
                "_This file is yours to evolve. As you learn who you are, update it._\n"
            ),
            "USER.md": (
                "# USER.md — Who I Am\n\n"
                "## About Me\n\n"
                "\n"
                "- **Name:** \n"
                "- **Role:** \n"
                "- **Location:** \n"
                "- **Timezone:** \n\n"
                "## Preferences\n\n"
                "\n"
                "- **Communication style:** \n"
                "- **Response length:** \n"
                "- **Technical level:** \n\n"
                "## Current Focus\n\n"
                "\n\n"
                "## Tools & Services\n\n"
                "\n\n"
                "---\n\n"
                "_Update this file as your preferences evolve._\n"
            ),
            "MEMORY.md": (
                "# MEMORY.md — Long-Term Memory\n\n"
                "## Decisions & Lessons\n\n"
                "\n\n"
                "## Context\n\n"
                "\n\n"
                "## Notes\n\n"
                "\n\n"
                "---\n\n"
                "_This file persists across sessions. "
                "Update it when you learn something important._\n"
            ),
        }

        for filename, content in defaults.items():
            filepath = os.path.join(self._workspace_dir, filename)
            if not os.path.exists(filepath):
                with open(filepath, "w", encoding="utf-8") as f:
                    f.write(content)
                logger.info(f"Created default identity file: {filepath}")

    # ── Search ───────────────────────────────────────────────────────

    async def search(
        self,
        query: str,
        *,
        max_results: int | None = None,
        min_score: float | None = None,
    ) -> list[MemorySearchResult]:
        """
        Search memory using hybrid vector + keyword search.

        Port of OpenClaw's MemoryIndexManager.search().

        Steps:
          1. (Optional) Trigger sync if dirty
          2. Run FTS5 keyword search → BM25 scored
          3. Generate query embedding → vector search
          4. Merge results with the configured vector/text weights
          5. Filter by min_score and return top-N results

        Args:
            query: Free-text query; blank queries return an empty list.
            max_results: Result cap; falls back to the configured value
                when omitted (or falsy).
            min_score: Minimum score; falls back to the configured value
                when ``None``.

        Returns:
            Up to ``max_results`` results whose score is at least
            ``min_score``.
        """
        # Auto-sync if dirty
        if self._config.sync_on_search and self._dirty:
            await self.sync()

        cleaned = query.strip()
        if not cleaned:
            return []

        max_r = max_results or self._config.max_results
        min_s = min_score if min_score is not None else self._config.min_score
        # Over-fetch candidates so merging/filtering still leaves enough hits
        candidates = min(200, max(1, max_r * 3))

        # Keyword search (BM25)
        keyword_results = self._search_keyword(cleaned, candidates)

        # Vector search
        try:
            query_vec = embed_query(cleaned, self._config.embedding_model)
            # An all-zero vector carries no signal; treat it as "no embedding"
            has_vector = any(v != 0 for v in query_vec)
        except Exception as e:
            logger.warning(f"Embedding failed, falling back to keyword-only: {e}")
            query_vec = []
            has_vector = False

        vector_results = (
            self._search_vector(query_vec, candidates) if has_vector else []
        )

        # If no keyword results, return vector-only
        if not keyword_results:
            return [
                r
                for r in self._vector_to_search_results(vector_results)
                if r.score >= min_s
            ][:max_r]

        # Merge hybrid results
        merged = merge_hybrid_results(
            vector=vector_results,
            keyword=keyword_results,
            vector_weight=self._config.vector_weight,
            text_weight=self._config.text_weight,
        )

        return [r for r in merged if r.score >= min_s][:max_r]

    def _search_vector(
        self, query_vec: list[float], limit: int
    ) -> list[dict]:
        """
        Search chunks by vector cosine similarity.

        Uses the embedding stored as JSON in the chunks table; every stored
        chunk is scored against ``query_vec``.

        Returns:
            Up to ``limit`` result dicts sorted by descending
            ``vector_score`` (negative similarities are clamped to 0.0).
        """
        if not query_vec:
            return []

        try:
            rows = self._db.execute(
                "SELECT id, path, start_line, end_line, source, text, embedding "
                "FROM chunks ORDER BY rowid"
            ).fetchall()
        except Exception as e:
            logger.warning(f"Vector search failed: {e}")
            return []

        results = []
        for row in rows:
            try:
                stored_vec = json.loads(row["embedding"])
                if not stored_vec:
                    continue
                score = cosine_similarity(query_vec, stored_vec)
            except (json.JSONDecodeError, TypeError, ValueError):
                # Corrupt or incompatible embedding — skip this chunk only
                continue
            results.append({
                "id": row["id"],
                "path": row["path"],
                "start_line": row["start_line"],
                "end_line": row["end_line"],
                "source": row["source"],
                "snippet": row["text"][:SNIPPET_MAX_CHARS],
                "vector_score": max(0.0, score),
            })

        results.sort(key=lambda r: r["vector_score"], reverse=True)
        return results[:limit]

    def _search_keyword(self, query: str, limit: int) -> list[dict]:
        """
        Search chunks using FTS5 full-text search with BM25 ranking.

        Port of OpenClaw's searchKeyword().

        Returns:
            Up to ``limit`` result dicts carrying a ``text_score``; an empty
            list when FTS is unavailable, the query yields no FTS
            expression, or the FTS query fails.
        """
        if not self._fts_available:
            return []

        fts_query = build_fts_query(query)
        if not fts_query:
            return []

        try:
            rows = self._db.execute(
                "SELECT id, path, start_line, end_line, source, text, "
                "rank AS bm25_rank "
                "FROM chunks_fts "
                "WHERE chunks_fts MATCH ? "
                "ORDER BY rank "
                "LIMIT ?",
                (fts_query, limit),
            ).fetchall()
        except Exception as e:
            logger.debug(f"FTS search failed for query '{fts_query}': {e}")
            return []

        results = []
        for row in rows:
            # FTS5 rank is negative (lower = better), convert to 0-1 score.
            # Only a missing rank falls back to the 999.0 sentinel; a rank
            # of exactly 0.0 is a valid value and must not be penalised.
            raw_rank = row["bm25_rank"]
            bm25_rank = abs(raw_rank) if raw_rank is not None else 999.0
            results.append({
                "id": row["id"],
                "path": row["path"],
                "start_line": row["start_line"],
                "end_line": row["end_line"],
                "source": row["source"],
                "snippet": row["text"][:SNIPPET_MAX_CHARS],
                "text_score": bm25_rank_to_score(bm25_rank),
            })

        return results

    def _vector_to_search_results(
        self, vector_results: list[dict]
    ) -> list[MemorySearchResult]:
        """Convert raw vector results to MemorySearchResult objects."""
        return [
            MemorySearchResult(
                path=r["path"],
                start_line=r["start_line"],
                end_line=r["end_line"],
                score=r["vector_score"],
                snippet=r["snippet"],
                source=MemorySource(r["source"]),
            )
            for r in vector_results
        ]

    # ── Sync ─────────────────────────────────────────────────────────

    async def sync(self, *, force: bool = False) -> dict:
        """
        Synchronize workspace markdown files into the index.

        Port of OpenClaw's MemoryIndexManager.sync().

        Steps:
          1. List all memory files (SOUL.md, USER.md, MEMORY.md, memory/*)
          2. For each file, check if content hash has changed
          3. If changed: chunk → embed → store in DB
          4. Remove stale entries for deleted files
          5. Optionally sync session logs from daily/

        Args:
            force: Re-index every file regardless of its stored hash, and
                do not skip when another sync is already running.

        Returns:
            A summary dict with counts, or ``{"skipped": True}`` when a
            sync is already in progress and ``force`` is false.

        Raises:
            Exception: Whatever the underlying sync raises; the partial
                transaction is rolled back and the index stays dirty.
        """
        if self._syncing and not force:
            logger.debug("Sync already in progress, skipping")
            return {"skipped": True}

        with self._sync_lock:
            self._syncing = True
            # Clear the flag *before* the run: a change flagged while the
            # sync is executing must survive and trigger another sync.
            self._dirty = False
            try:
                return self._run_sync(force=force)
            except Exception:
                # Do not leave a half-written index in the open transaction,
                # and do not pretend the index is up to date.
                try:
                    self._db.rollback()
                except sqlite3.Error as rollback_err:
                    logger.debug(f"Rollback after failed sync failed: {rollback_err}")
                self._dirty = True
                raise
            finally:
                self._syncing = False

    def _run_sync(self, *, force: bool = False) -> dict:
        """
        Execute the actual sync logic and commit the result.

        Returns:
            Stats dict. Note that ``chunks_created`` holds the total number
            of rows in the chunks table after the sync.
        """
        stats = {
            "files_found": 0,
            "files_indexed": 0,
            "files_skipped": 0,
            "chunks_created": 0,
            "stale_removed": 0,
            "sessions_indexed": 0,
        }

        # ── Memory files ──
        if "memory" in self._config.sources:
            files = list_memory_files(self._workspace_dir)
            stats["files_found"] = len(files)

            active_paths: set[str] = set()

            for abs_path in files:
                entry = build_file_entry(abs_path, self._workspace_dir)
                active_paths.add(entry.path)

                # Check if file has changed
                row = self._db.execute(
                    "SELECT hash FROM files WHERE path = ? AND source = ?",
                    (entry.path, MemorySource.MEMORY.value),
                ).fetchone()

                if not force and row and row["hash"] == entry.hash:
                    stats["files_skipped"] += 1
                    continue

                # File is new or changed — re-index it
                self._index_file(entry, MemorySource.MEMORY)
                stats["files_indexed"] += 1

            # Remove stale entries for deleted files
            stale_rows = self._db.execute(
                "SELECT path FROM files WHERE source = ?",
                (MemorySource.MEMORY.value,),
            ).fetchall()
            for stale in stale_rows:
                if stale["path"] not in active_paths:
                    self._remove_file(stale["path"], MemorySource.MEMORY)
                    stats["stale_removed"] += 1

        # ── Session files ──
        if "sessions" in self._config.sources:
            session_count = self._sync_session_files(force=force)
            stats["sessions_indexed"] = session_count

        # Count total chunks
        row = self._db.execute("SELECT COUNT(*) as c FROM chunks").fetchone()
        stats["chunks_created"] = row["c"] if row else 0

        self._db.commit()

        logger.info(
            f"Sync complete: {stats['files_indexed']} indexed, "
            f"{stats['files_skipped']} unchanged, "
            f"{stats['stale_removed']} removed, "
            f"{stats['chunks_created']} total chunks"
        )

        return stats

    def _index_file(self, entry, source: MemorySource) -> None:
        """
        Index a single file: read → chunk → embed → store.

        Port of OpenClaw's indexFile method.

        A file that cannot be read is left untouched (and retried on the
        next sync). A file that is empty or produces no chunks has its old
        chunks removed and its hash recorded, so stale content stops
        matching searches.

        Args:
            entry: File entry produced by ``build_file_entry``; this method
                reads its ``abs_path``, ``path``, ``hash``, ``mtime_ms``
                and ``size`` attributes.
            source: Which source the file belongs to.
        """
        try:
            with open(entry.abs_path, "r", encoding="utf-8") as f:
                content = f.read()
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeDecodeError for non-UTF-8 files
            logger.warning(f"Failed to read {entry.abs_path}: {e}")
            return

        chunks = []
        if content.strip():
            chunks = chunk_markdown(
                content,
                chunk_tokens=self._config.chunk_tokens,
                chunk_overlap=self._config.chunk_overlap,
            )

        if not chunks:
            # The file used to have content (or never did): make sure no
            # chunks from a previous version remain in the index.
            self._remove_file_chunks(entry.path, source)
            self._record_file(entry, source)
            return

        embeddings = self._resolve_embeddings(chunks, entry.path)

        # Remove old chunks for this file
        self._remove_file_chunks(entry.path, source)

        # Insert new chunks
        now = int(time.time())
        for chunk in chunks:
            chunk_id = str(uuid.uuid4())
            embedding = embeddings.get(chunk.hash) or []
            self._db.execute(
                "INSERT INTO chunks "
                "(id, path, source, start_line, end_line, hash, model, text, embedding, updated_at) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                (
                    chunk_id,
                    entry.path,
                    source.value,
                    chunk.start_line,
                    chunk.end_line,
                    chunk.hash,
                    self._config.embedding_model,
                    chunk.text,
                    json.dumps(embedding),
                    now,
                ),
            )

            # Insert into FTS index
            if self._fts_available:
                try:
                    self._db.execute(
                        "INSERT INTO chunks_fts "
                        "(text, id, path, source, model, start_line, end_line) "
                        "VALUES (?, ?, ?, ?, ?, ?, ?)",
                        (
                            chunk.text,
                            chunk_id,
                            entry.path,
                            source.value,
                            self._config.embedding_model,
                            chunk.start_line,
                            chunk.end_line,
                        ),
                    )
                except Exception as e:
                    logger.debug(f"FTS insert failed for chunk {chunk_id}: {e}")

        self._record_file(entry, source)

    def _resolve_embeddings(self, chunks, path: str) -> dict[str, list[float]]:
        """
        Return a ``chunk hash → embedding`` map for ``chunks``.

        Embeddings are taken from the embedding cache when present;
        the remaining chunk texts are embedded in one batch and the
        resulting non-empty vectors are written to the cache. If the batch
        fails, the uncached chunks map to an empty embedding.

        Args:
            chunks: Chunks exposing ``hash`` and ``text`` attributes.
            path: File path, used only in the warning log message.
        """
        model = self._config.embedding_model
        resolved: dict[str, list[float]] = {}
        # hash → text for chunks that still need an embedding. A dict keeps
        # insertion order and embeds identical chunks only once.
        pending: dict[str, str] = {}

        for chunk in chunks:
            if chunk.hash in resolved or chunk.hash in pending:
                continue
            cache_row = self._db.execute(
                "SELECT embedding FROM embedding_cache WHERE model = ? AND hash = ?",
                (model, chunk.hash),
            ).fetchone()
            cached = None
            if cache_row:
                try:
                    cached = json.loads(cache_row["embedding"])
                except (json.JSONDecodeError, TypeError):
                    cached = None  # corrupt cache row — recompute
            if cached:
                resolved[chunk.hash] = cached
            else:
                # Missing, corrupt or empty cache entry: (re-)embed
                pending[chunk.hash] = chunk.text

        if not pending:
            return resolved

        try:
            vectors = embed_batch(list(pending.values()), model)
            now = int(time.time())
            for i, chunk_hash in enumerate(pending):
                vec = vectors[i] if i < len(vectors) else []
                resolved[chunk_hash] = vec
                if not vec:
                    # Never cache a missing embedding — it would be served
                    # as a "hit" on every later sync.
                    continue
                self._db.execute(
                    "INSERT OR REPLACE INTO embedding_cache "
                    "(model, hash, embedding, dims, updated_at) "
                    "VALUES (?, ?, ?, ?, ?)",
                    (model, chunk_hash, json.dumps(vec), len(vec), now),
                )
        except Exception as e:
            logger.warning(f"Embedding batch failed for {path}: {e}")
            # Fall back to empty embeddings
            for chunk_hash in pending:
                resolved[chunk_hash] = []

        return resolved

    def _record_file(self, entry, source: MemorySource) -> None:
        """Insert or replace the files-table row for ``entry``."""
        self._db.execute(
            "INSERT OR REPLACE INTO files (path, source, hash, mtime, size) "
            "VALUES (?, ?, ?, ?, ?)",
            (
                entry.path,
                source.value,
                entry.hash,
                int(entry.mtime_ms),
                entry.size,
            ),
        )

    def _remove_file_chunks(self, path: str, source: MemorySource) -> None:
        """Remove all chunks (and FTS entries) for a given file."""
        # FTS rows are keyed by chunk id, so collect the ids before the
        # chunks themselves are deleted.
        if self._fts_available:
            chunk_ids = self._db.execute(
                "SELECT id FROM chunks WHERE path = ? AND source = ?",
                (path, source.value),
            ).fetchall()
            for row in chunk_ids:
                try:
                    self._db.execute(
                        "DELETE FROM chunks_fts WHERE id = ?", (row["id"],)
                    )
                except Exception as e:
                    # Best effort: a failed FTS delete must not block the
                    # removal of the chunk rows, but should be visible.
                    logger.debug(f"FTS delete failed for chunk {row['id']}: {e}")

        self._db.execute(
            "DELETE FROM chunks WHERE path = ? AND source = ?",
            (path, source.value),
        )

    def _remove_file(self, path: str, source: MemorySource) -> None:
        """Remove a file and all its chunks from the index."""
        self._remove_file_chunks(path, source)
        self._db.execute(
            "DELETE FROM files WHERE path = ? AND source = ?",
            (path, source.value),
        )

    # ── Session Logs ─────────────────────────────────────────────────

    def _sync_session_files(self, *, force: bool = False) -> int:
        """
        Sync session log files from the daily/ directory.

        Only regular, non-symlinked ``*.md`` files directly inside the
        sessions directory are considered. Entries for session files that
        no longer exist are removed from the index.

        Returns the number of session files indexed.
        """
        sessions_dir = Path(self._sessions_dir)
        if not sessions_dir.is_dir():
            return 0

        indexed = 0
        active_paths: set[str] = set()

        for md_file in sorted(sessions_dir.glob("*.md")):
            if md_file.is_symlink() or not md_file.is_file():
                continue

            entry = build_file_entry(str(md_file), self._workspace_dir)
            active_paths.add(entry.path)

            # Check if changed
            row = self._db.execute(
                "SELECT hash FROM files WHERE path = ? AND source = ?",
                (entry.path, MemorySource.SESSIONS.value),
            ).fetchone()

            if not force and row and row["hash"] == entry.hash:
                continue

            self._index_file(entry, MemorySource.SESSIONS)
            indexed += 1

        # Clean stale session entries
        stale_rows = self._db.execute(
            "SELECT path FROM files WHERE source = ?",
            (MemorySource.SESSIONS.value,),
        ).fetchall()
        for stale in stale_rows:
            if stale["path"] not in active_paths:
                self._remove_file(stale["path"], MemorySource.SESSIONS)

        return indexed

    def log_session(
        self,
        content: str,
        *,
        date: str | None = None,
        channel: str = "slack",
    ) -> str:
        """
        Append to today's session log in daily/.

        Args:
            content: The text to log (e.g., a user message or AI response).
            date: Optional date string (YYYY-MM-DD). Defaults to today.
            channel: Channel the conversation came from.

        Returns:
            Path to the session log file.
        """
        if date is None:
            date = time.strftime("%Y-%m-%d")

        log_path = os.path.join(self._sessions_dir, f"{date}.md")

        # Create file with header if it doesn't exist
        if not os.path.exists(log_path):
            header = f"# Session Log — {date}\n\n"
            with open(log_path, "w", encoding="utf-8") as f:
                f.write(header)

        # Append the content
        timestamp = time.strftime("%H:%M:%S")
        with open(log_path, "a", encoding="utf-8") as f:
            f.write(f"\n---\n\n**[{timestamp}] ({channel})**\n\n{content}\n")

        # Mark as dirty for next sync
        self._dirty = True

        return log_path

    # ── Identity File Access ─────────────────────────────────────────

    def read_identity_file(self, name: str) -> str | None:
        """
        Read an identity file (SOUL.md, USER.md, MEMORY.md).

        Returns:
            The file content, or ``None`` when the file does not exist.
        """
        filepath = os.path.join(self._workspace_dir, name)
        if not os.path.isfile(filepath):
            return None
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read()

    def update_identity_file(self, name: str, content: str) -> None:
        """Overwrite an identity file and mark the index as dirty."""
        filepath = os.path.join(self._workspace_dir, name)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)
        self._dirty = True
        logger.info(f"Updated identity file: {name}")

    def read_soul(self) -> str | None:
        """Return the content of SOUL.md, or ``None`` if it is missing."""
        return self.read_identity_file("SOUL.md")

    def read_user(self) -> str | None:
        """Return the content of USER.md, or ``None`` if it is missing."""
        return self.read_identity_file("USER.md")

    def read_long_term_memory(self) -> str | None:
        """Return the content of MEMORY.md, or ``None`` if it is missing."""
        return self.read_identity_file("MEMORY.md")

    def append_to_memory(self, entry: str) -> None:
        """Append a new timestamped entry to MEMORY.md and mark the index dirty."""
        filepath = os.path.join(self._workspace_dir, "MEMORY.md")
        timestamp = time.strftime("%Y-%m-%d %H:%M")
        with open(filepath, "a", encoding="utf-8") as f:
            f.write(f"\n### [{timestamp}]\n\n{entry}\n")
        self._dirty = True
        logger.info("Appended to MEMORY.md")

    # ── File Reading ─────────────────────────────────────────────────

    def _is_memory_path(self, abs_path: str) -> bool:
        """Return True if ``abs_path`` lies inside the workspace or sessions dir."""
        for root in (self._workspace_dir, self._sessions_dir):
            real_root = os.path.realpath(root)
            try:
                if os.path.commonpath([real_root, abs_path]) == real_root:
                    return True
            except ValueError:
                # e.g. paths on different drives — cannot be contained
                continue
        return False

    def read_file(
        self,
        rel_path: str,
        *,
        from_line: int | None = None,
        num_lines: int | None = None,
    ) -> dict:
        """
        Read a memory file by relative path.

        Port of OpenClaw's readFile().

        Args:
            rel_path: Path relative to the workspace (an absolute path is
                accepted as long as it resolves inside the workspace or
                the sessions directory).
            from_line: 1-based first line to return. Defaults to 1.
            num_lines: Number of lines to return. Defaults to the rest of
                the file.

        Returns:
            ``{"text": ..., "path": rel_path}``.

        Raises:
            ValueError: If the path is blank, escapes the memory
                directories (after symlink resolution), or is not a
                ``.md`` file.
            FileNotFoundError: If the file does not exist.
        """
        raw = rel_path.strip()
        if not raw:
            raise ValueError("path required")

        if os.path.isabs(raw):
            abs_path = os.path.realpath(raw)
        else:
            abs_path = os.path.realpath(
                os.path.join(self._workspace_dir, raw)
            )

        # Paths may come from untrusted callers: refuse anything that
        # resolves outside the directories this manager owns.
        if not self._is_memory_path(abs_path):
            raise ValueError("path must be inside the memory workspace")

        if not abs_path.endswith(".md"):
            raise ValueError("Only .md files are supported")

        if not os.path.isfile(abs_path):
            raise FileNotFoundError(f"File not found: {abs_path}")

        with open(abs_path, "r", encoding="utf-8") as f:
            content = f.read()

        if from_line is None and num_lines is None:
            return {"text": content, "path": rel_path}

        lines = content.split("\n")
        start = max(1, from_line or 1)
        count = max(1, num_lines or len(lines))
        sliced = lines[start - 1 : start - 1 + count]

        return {"text": "\n".join(sliced), "path": rel_path}

    # ── Status ───────────────────────────────────────────────────────

    def status(self) -> dict:
        """Get the current status of the memory index."""
        files_row = self._db.execute(
            "SELECT COUNT(*) as c FROM files"
        ).fetchone()
        chunks_row = self._db.execute(
            "SELECT COUNT(*) as c FROM chunks"
        ).fetchone()
        cache_row = self._db.execute(
            "SELECT COUNT(*) as c FROM embedding_cache"
        ).fetchone()

        return {
            "workspace_dir": self._workspace_dir,
            "db_path": self._db_path,
            "sessions_dir": self._sessions_dir,
            "files": files_row["c"] if files_row else 0,
            "chunks": chunks_row["c"] if chunks_row else 0,
            "cached_embeddings": cache_row["c"] if cache_row else 0,
            "fts_available": self._fts_available,
            "dirty": self._dirty,
            "embedding_model": self._config.embedding_model,
            "embedding_dims": get_embedding_dims(self._config.embedding_model),
            "vector_weight": self._config.vector_weight,
            "text_weight": self._config.text_weight,
        }

    # ── File Watching ────────────────────────────────────────────────

    def start_watching(self) -> None:
        """
        Start watching the workspace for file changes.

        Uses watchdog for cross-platform file system events. Any event on a
        ``.md`` file marks the index dirty; re-indexing itself happens on
        the next sync. Does nothing when a watcher is already running,
        when watching is disabled in the config, or when watchdog is not
        installed.
        """
        if self._watcher or not self._config.watch:
            return

        try:
            from watchdog.events import FileSystemEventHandler
            from watchdog.observers import Observer
        except ImportError:
            logger.warning(
                "watchdog not installed — file watching disabled. "
                "Install with: uv add watchdog"
            )
            return

        manager = self

        class MemoryFileHandler(FileSystemEventHandler):
            def on_any_event(self, event):
                if event.is_directory:
                    return
                src = getattr(event, "src_path", "")
                if src.endswith(".md"):
                    manager._dirty = True
                    logger.debug(f"Workspace change detected: {src}")

        observer = Observer()
        handler = MemoryFileHandler()
        observer.schedule(handler, self._workspace_dir, recursive=True)
        observer.start()
        self._watcher = observer
        logger.info(f"File watching started: {self._workspace_dir}")

    def stop_watching(self) -> None:
        """Stop the file watcher, if one is running."""
        if self._watcher:
            self._watcher.stop()
            self._watcher.join()
            self._watcher = None
            logger.info("File watching stopped")

    # ── Lifecycle ────────────────────────────────────────────────────

    def close(self) -> None:
        """Close the memory manager and release resources (idempotent)."""
        if self._closed:
            return
        self._closed = True
        self.stop_watching()
        self._db.close()
        logger.info("MemoryManager closed")

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def __del__(self):
        # __init__ may have raised before `_closed` was assigned; in that
        # case there is nothing this finalizer can safely clean up.
        if getattr(self, "_closed", True):
            return
        try:
            self.close()
        except Exception:
            # Finalizers must never raise (interpreter shutdown, half-built
            # instance, already-closed connection, ...).
            pass