first commit

This commit is contained in:
Tanmay Karande
2026-02-13 23:56:09 -05:00
commit ec8bd80a3d
27 changed files with 6725 additions and 0 deletions

839
memory/manager.py Normal file
View File

@@ -0,0 +1,839 @@
"""
MemoryManager — the main memory system orchestrator.
Port of OpenClaw's MemoryIndexManager (src/memory/manager.ts, 2,300 LOC).
Lifecycle: sync → chunk → embed → store → search
Key features:
• Incremental sync — only re-indexes changed files (hash-based)
• Hybrid search — vector (0.7) + BM25 keyword (0.3)
• File watching — auto re-index on workspace changes (via watchdog)
• Embedding cache — avoids re-computing embeddings for unchanged chunks
• Session log indexing — indexes daily/ conversation transcripts
"""
import json
import logging
import os
import sqlite3
import threading
import time
import uuid
from pathlib import Path
from memory.embeddings import embed_batch, embed_query, get_embedding_dims
from memory.hybrid import bm25_rank_to_score, build_fts_query, merge_hybrid_results
from memory.internal import (
build_file_entry,
chunk_markdown,
hash_text,
list_memory_files,
)
from memory.schema import ensure_schema
from memory.types import (
MemoryConfig,
MemorySearchResult,
MemorySource,
)
# Module-level logger for the memory subsystem.
logger = logging.getLogger("aetheel.memory")
# Hard cap on snippet length (in characters) returned with search results.
SNIPPET_MAX_CHARS = 700
class MemoryManager:
"""
Main memory system — manages the full lifecycle:
sync → chunk → embed → store → search
Inspired by OpenClaw's MemoryIndexManager.
"""
def __init__(self, config: MemoryConfig | None = None):
    """
    Initialize the manager: resolve paths, open the SQLite index,
    ensure the schema, and bootstrap default identity files.

    Args:
        config: Optional MemoryConfig; a default instance is used when omitted.
    """
    self._config = config or MemoryConfig()
    # Resolve configured paths to absolute, user-expanded locations.
    self._workspace_dir = str(
        Path(self._config.workspace_dir).expanduser().resolve()
    )
    self._db_path = str(Path(self._config.db_path).expanduser().resolve())
    # Sessions default to <workspace>/daily when not configured explicitly.
    self._sessions_dir = (
        str(Path(self._config.sessions_dir).expanduser().resolve())
        if self._config.sessions_dir
        else os.path.join(self._workspace_dir, "daily")
    )
    # Ensure directories exist
    os.makedirs(self._workspace_dir, exist_ok=True)
    os.makedirs(self._sessions_dir, exist_ok=True)
    os.makedirs(os.path.dirname(self._db_path), exist_ok=True)
    # Open database and ensure schema.
    # check_same_thread=False allows use from other threads (e.g. the
    # watchdog callback). NOTE(review): assumes callers serialize DB
    # writes (sync holds _sync_lock) — confirm for other write paths.
    self._db = sqlite3.connect(self._db_path, check_same_thread=False)
    self._db.row_factory = sqlite3.Row
    schema_result = ensure_schema(self._db, fts_enabled=True)
    # FTS5 may be missing from some SQLite builds; keyword search is
    # skipped in that case (_search_keyword returns []).
    self._fts_available = schema_result.get("fts_available", False)
    # State
    self._dirty = True        # index needs a sync before it is trusted
    self._syncing = False     # best-effort re-entrancy guard for sync()
    self._sync_lock = threading.Lock()
    self._closed = False
    self._watcher = None      # watchdog Observer, set by start_watching()
    # Create default identity files if they don't exist
    self._ensure_identity_files()
    logger.info(
        f"MemoryManager initialized: workspace={self._workspace_dir}, "
        f"db={self._db_path}, fts={self._fts_available}"
    )
# ── Identity File Bootstrap ──────────────────────────────────────
def _ensure_identity_files(self) -> None:
    """Create default SOUL.md, USER.md, MEMORY.md if they don't exist.

    Existing files are never overwritten — these defaults only seed a
    brand-new workspace.
    """
    defaults = {
        "SOUL.md": (
            "# SOUL.md — Who You Are\n\n"
            "_You're not a chatbot. You're becoming someone._\n\n"
            "## Core Truths\n\n"
            "**Be genuinely helpful, not performatively helpful.** "
            "Skip the filler — just help.\n\n"
            "**Have opinions.** You're allowed to disagree, prefer things, "
            "find stuff amusing or boring.\n\n"
            "**Be resourceful before asking.** Try to figure it out first. "
            "Then ask if you're stuck.\n\n"
            "**Earn trust through competence.** Be careful with external actions. "
            "Be bold with internal ones.\n\n"
            "## Boundaries\n\n"
            "- Private things stay private. Period.\n"
            "- When in doubt, ask before acting externally.\n"
            "- Never send half-baked replies.\n\n"
            "## Continuity\n\n"
            "Each session, you wake up fresh. These files _are_ your memory. "
            "Read them. Update them. They're how you persist.\n\n"
            "---\n\n"
            "_This file is yours to evolve. As you learn who you are, update it._\n"
        ),
        "USER.md": (
            "# USER.md — Who I Am\n\n"
            "## About Me\n\n"
            "<!-- Fill in your details -->\n"
            "- **Name:** \n"
            "- **Role:** \n"
            "- **Location:** \n"
            "- **Timezone:** \n\n"
            "## Preferences\n\n"
            "<!-- How you like to communicate -->\n"
            "- **Communication style:** \n"
            "- **Response length:** \n"
            "- **Technical level:** \n\n"
            "## Current Focus\n\n"
            "<!-- What you're working on -->\n\n"
            "## Tools & Services\n\n"
            "<!-- Services you use regularly -->\n\n"
            "---\n\n"
            "_Update this file as your preferences evolve._\n"
        ),
        "MEMORY.md": (
            "# MEMORY.md — Long-Term Memory\n\n"
            "## Decisions & Lessons\n\n"
            "<!-- Record important decisions and lessons learned -->\n\n"
            "## Context\n\n"
            "<!-- Persistent context that should carry across sessions -->\n\n"
            "## Notes\n\n"
            "<!-- Anything worth remembering -->\n\n"
            "---\n\n"
            "_This file persists across sessions. "
            "Update it when you learn something important._\n"
        ),
    }
    for filename, content in defaults.items():
        filepath = os.path.join(self._workspace_dir, filename)
        # Only seed missing files; user edits are authoritative.
        if not os.path.exists(filepath):
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(content)
            logger.info(f"Created default identity file: {filepath}")
# ── Search ───────────────────────────────────────────────────────
async def search(
    self,
    query: str,
    *,
    max_results: int | None = None,
    min_score: float | None = None,
) -> list[MemorySearchResult]:
    """
    Search memory using hybrid vector + keyword search.
    Port of OpenClaw's MemoryIndexManager.search().

    Args:
        query: Free-text query; blank/whitespace-only queries return [].
        max_results: Cap on returned results (default: config.max_results).
        min_score: Minimum merged score (default: config.min_score).

    Steps:
        1. (Optional) Trigger sync if dirty
        2. Run FTS5 keyword search → BM25 scored
        3. Generate query embedding → vector search
        4. Merge results with weighted scoring (0.7v + 0.3k)
        5. Filter by min_score and return top-N results
    """
    # Auto-sync if dirty
    if self._config.sync_on_search and self._dirty:
        await self.sync()
    cleaned = query.strip()
    if not cleaned:
        return []
    max_r = max_results or self._config.max_results
    min_s = min_score if min_score is not None else self._config.min_score
    # Over-fetch (3x, capped at 200) so the merge step has enough
    # candidates from each ranker before the final cut.
    candidates = min(200, max(1, max_r * 3))
    # Keyword search (BM25)
    keyword_results = self._search_keyword(cleaned, candidates)
    # Vector search — embedding failures degrade to keyword-only search
    try:
        query_vec = embed_query(cleaned, self._config.embedding_model)
        # An all-zero vector means the embedder produced nothing useful.
        has_vector = any(v != 0 for v in query_vec)
    except Exception as e:
        logger.warning(f"Embedding failed, falling back to keyword-only: {e}")
        query_vec = []
        has_vector = False
    vector_results = (
        self._search_vector(query_vec, candidates) if has_vector else []
    )
    # If no keyword results, return vector-only
    if not keyword_results:
        return [
            r for r in self._vector_to_search_results(vector_results)
            if r.score >= min_s
        ][:max_r]
    # Merge hybrid results (weighted combination of both rankers)
    merged = merge_hybrid_results(
        vector=vector_results,
        keyword=keyword_results,
        vector_weight=self._config.vector_weight,
        text_weight=self._config.text_weight,
    )
    return [r for r in merged if r.score >= min_s][:max_r]
def _search_vector(
    self, query_vec: list[float], limit: int
) -> list[dict]:
    """
    Rank all stored chunks by cosine similarity against *query_vec*.

    Embeddings are stored as JSON arrays in the chunks table; rows with
    missing or malformed embeddings are silently skipped. Returns at
    most *limit* dicts, best match first.
    """
    if not query_vec:
        return []
    try:
        rows = self._db.execute(
            "SELECT id, path, start_line, end_line, source, text, embedding "
            "FROM chunks ORDER BY rowid"
        ).fetchall()
    except Exception as e:
        logger.warning(f"Vector search failed: {e}")
        return []
    from memory.internal import cosine_similarity
    scored: list[dict] = []
    for record in rows:
        try:
            embedding = json.loads(record["embedding"])
            if not embedding:
                continue
            similarity = cosine_similarity(query_vec, embedding)
        except (json.JSONDecodeError, TypeError):
            # Unparseable or NULL embedding — skip this chunk.
            continue
        scored.append({
            "id": record["id"],
            "path": record["path"],
            "start_line": record["start_line"],
            "end_line": record["end_line"],
            "source": record["source"],
            "snippet": record["text"][:SNIPPET_MAX_CHARS],
            # Negative similarities are clamped so scores stay in [0, 1].
            "vector_score": max(0.0, similarity),
        })
    scored.sort(key=lambda item: item["vector_score"], reverse=True)
    return scored[:limit]
def _search_keyword(self, query: str, limit: int) -> list[dict]:
    """
    Search chunks using FTS5 full-text search with BM25 ranking.
    Port of OpenClaw's searchKeyword().

    Args:
        query: Raw user query; converted to an FTS5 query string first.
        limit: Maximum number of rows pulled from the FTS index.

    Returns:
        Result dicts carrying a normalized `text_score`; [] when FTS is
        unavailable, the query is empty, or the FTS query errors out.
    """
    if not self._fts_available:
        return []
    fts_query = build_fts_query(query)
    if not fts_query:
        return []
    try:
        rows = self._db.execute(
            "SELECT id, path, start_line, end_line, source, text, "
            "rank AS bm25_rank "
            "FROM chunks_fts "
            "WHERE chunks_fts MATCH ? "
            "ORDER BY rank "
            "LIMIT ?",
            (fts_query, limit),
        ).fetchall()
    except Exception as e:
        # Malformed FTS syntax is an expected occasional failure; fail soft.
        logger.debug(f"FTS search failed for query '{fts_query}': {e}")
        return []
    results = []
    for row in rows:
        # FTS5 rank is negative (lower = better); convert to a 0-1 score.
        # Fix: compare against None explicitly — the old truthiness test
        # (`if row["bm25_rank"]`) also treated a legitimate rank of
        # exactly 0/0.0 as missing and penalized it with the 999.0
        # sentinel reserved for NULL ranks.
        raw_rank = row["bm25_rank"]
        bm25_rank = abs(raw_rank) if raw_rank is not None else 999.0
        text_score = bm25_rank_to_score(bm25_rank)
        snippet = row["text"][:SNIPPET_MAX_CHARS]
        results.append({
            "id": row["id"],
            "path": row["path"],
            "start_line": row["start_line"],
            "end_line": row["end_line"],
            "source": row["source"],
            "snippet": snippet,
            "text_score": text_score,
        })
    return results
def _vector_to_search_results(
    self, vector_results: list[dict]
) -> list[MemorySearchResult]:
    """Convert raw vector-search dicts into MemorySearchResult objects."""
    converted: list[MemorySearchResult] = []
    for item in vector_results:
        converted.append(
            MemorySearchResult(
                path=item["path"],
                start_line=item["start_line"],
                end_line=item["end_line"],
                score=item["vector_score"],
                snippet=item["snippet"],
                source=MemorySource(item["source"]),
            )
        )
    return converted
# ── Sync ─────────────────────────────────────────────────────────
async def sync(self, *, force: bool = False) -> dict:
    """
    Synchronize workspace markdown files into the index.
    Port of OpenClaw's MemoryIndexManager.sync().

    Steps:
        1. List all memory files (SOUL.md, USER.md, MEMORY.md, memory/*)
        2. For each file, check if content hash has changed
        3. If changed: chunk → embed → store in DB
        4. Remove stale entries for deleted files
        5. Optionally sync session logs from daily/

    Args:
        force: Re-index everything and bypass the "already syncing" guard.

    Returns:
        A summary dict with counts, or {"skipped": True} when another
        sync is already in flight.
    """
    # Best-effort re-entrancy guard; _sync_lock below provides the
    # actual mutual exclusion.
    if self._syncing and not force:
        logger.debug("Sync already in progress, skipping")
        return {"skipped": True}
    with self._sync_lock:
        self._syncing = True
        try:
            result = self._run_sync(force=force)
            # Fix: only clear the dirty flag on success. It was previously
            # cleared in `finally`, so a sync that raised would mark the
            # index clean and suppress the retry on the next search.
            self._dirty = False
            return result
        finally:
            self._syncing = False
def _run_sync(self, *, force: bool = False) -> dict:
    """Execute the actual sync logic (called by sync() under _sync_lock).

    Args:
        force: Re-index files even when their content hash is unchanged.

    Returns:
        Counter dict. Note "chunks_created" is the TOTAL chunk count in
        the index after this sync, not only the chunks created this run.
    """
    stats = {
        "files_found": 0,
        "files_indexed": 0,
        "files_skipped": 0,
        "chunks_created": 0,
        "stale_removed": 0,
        "sessions_indexed": 0,
    }
    # ── Memory files ──
    if "memory" in self._config.sources:
        files = list_memory_files(self._workspace_dir)
        stats["files_found"] = len(files)
        active_paths: set[str] = set()
        for abs_path in files:
            entry = build_file_entry(abs_path, self._workspace_dir)
            active_paths.add(entry.path)
            # Check if file has changed (content-hash comparison)
            row = self._db.execute(
                "SELECT hash FROM files WHERE path = ? AND source = ?",
                (entry.path, MemorySource.MEMORY.value),
            ).fetchone()
            if not force and row and row["hash"] == entry.hash:
                stats["files_skipped"] += 1
                continue
            # File is new or changed — re-index it
            self._index_file(entry, MemorySource.MEMORY)
            stats["files_indexed"] += 1
        # Remove stale entries for files deleted from the workspace
        stale_rows = self._db.execute(
            "SELECT path FROM files WHERE source = ?",
            (MemorySource.MEMORY.value,),
        ).fetchall()
        for stale in stale_rows:
            if stale["path"] not in active_paths:
                self._remove_file(stale["path"], MemorySource.MEMORY)
                stats["stale_removed"] += 1
    # ── Session files ──
    if "sessions" in self._config.sources:
        session_count = self._sync_session_files(force=force)
        stats["sessions_indexed"] = session_count
    # Count total chunks now in the index (see docstring caveat)
    row = self._db.execute("SELECT COUNT(*) as c FROM chunks").fetchone()
    stats["chunks_created"] = row["c"] if row else 0
    # Single commit covers all file/chunk/FTS/cache writes above.
    self._db.commit()
    logger.info(
        f"Sync complete: {stats['files_indexed']} indexed, "
        f"{stats['files_skipped']} unchanged, "
        f"{stats['stale_removed']} removed, "
        f"{stats['chunks_created']} total chunks"
    )
    return stats
def _index_file(self, entry, source: MemorySource) -> None:
    """
    Index a single file: read → chunk → embed → store.
    Port of OpenClaw's indexFile method.

    Args:
        entry: File entry (abs_path, path, hash, mtime_ms, size) as
            produced by build_file_entry.
        source: Index partition (memory vs. sessions) the file belongs to.
    """
    try:
        with open(entry.abs_path, "r", encoding="utf-8") as f:
            content = f.read()
    except Exception as e:
        # Unreadable file: log and keep whatever was indexed previously.
        logger.warning(f"Failed to read {entry.abs_path}: {e}")
        return
    # Empty/whitespace-only files produce no chunks — nothing to index.
    if not content.strip():
        return
    # Chunk the content
    chunks = chunk_markdown(
        content,
        chunk_tokens=self._config.chunk_tokens,
        chunk_overlap=self._config.chunk_overlap,
    )
    if not chunks:
        return
    # Check embedding cache and compute new embeddings
    texts_to_embed = []
    chunk_hashes = []
    cached_embeddings: dict[str, list[float]] = {}
    for chunk in chunks:
        # Check cache first (keyed by model + chunk content hash)
        cache_row = self._db.execute(
            "SELECT embedding FROM embedding_cache WHERE model = ? AND hash = ?",
            (self._config.embedding_model, chunk.hash),
        ).fetchone()
        if cache_row:
            cached_embeddings[chunk.hash] = json.loads(cache_row["embedding"])
        else:
            # NOTE(review): duplicate chunk hashes within one file are
            # embedded once per occurrence — harmless, but redundant work.
            texts_to_embed.append(chunk.text)
            chunk_hashes.append(chunk.hash)
    # Batch embed uncached chunks
    new_embeddings: dict[str, list[float]] = {}
    if texts_to_embed:
        try:
            vectors = embed_batch(texts_to_embed, self._config.embedding_model)
            now = int(time.time())
            for i, chunk_hash in enumerate(chunk_hashes):
                # Guard against the embedder returning fewer vectors
                # than requested.
                vec = vectors[i] if i < len(vectors) else []
                new_embeddings[chunk_hash] = vec
                # Store in cache for future syncs
                self._db.execute(
                    "INSERT OR REPLACE INTO embedding_cache "
                    "(model, hash, embedding, dims, updated_at) "
                    "VALUES (?, ?, ?, ?, ?)",
                    (
                        self._config.embedding_model,
                        chunk_hash,
                        json.dumps(vec),
                        len(vec),
                        now,
                    ),
                )
        except Exception as e:
            logger.warning(f"Embedding batch failed for {entry.path}: {e}")
            # Fall back to empty embeddings — chunks remain keyword-
            # searchable via FTS even without vectors.
            for chunk_hash in chunk_hashes:
                new_embeddings[chunk_hash] = []
    # Remove old chunks for this file before inserting replacements
    self._remove_file_chunks(entry.path, source)
    # Insert new chunks
    now = int(time.time())
    for chunk in chunks:
        chunk_id = str(uuid.uuid4())
        embedding = cached_embeddings.get(chunk.hash) or new_embeddings.get(
            chunk.hash, []
        )
        self._db.execute(
            "INSERT INTO chunks "
            "(id, path, source, start_line, end_line, hash, model, text, embedding, updated_at) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (
                chunk_id,
                entry.path,
                source.value,
                chunk.start_line,
                chunk.end_line,
                chunk.hash,
                self._config.embedding_model,
                chunk.text,
                json.dumps(embedding),
                now,
            ),
        )
        # Insert into FTS index (mirror row keyed by the same chunk id)
        if self._fts_available:
            try:
                self._db.execute(
                    "INSERT INTO chunks_fts "
                    "(text, id, path, source, model, start_line, end_line) "
                    "VALUES (?, ?, ?, ?, ?, ?, ?)",
                    (
                        chunk.text,
                        chunk_id,
                        entry.path,
                        source.value,
                        self._config.embedding_model,
                        chunk.start_line,
                        chunk.end_line,
                    ),
                )
            except Exception as e:
                # FTS is best-effort; vector search still covers the chunk.
                logger.debug(f"FTS insert failed for chunk {chunk_id}: {e}")
    # Update files table (hash/mtime/size drive change detection next sync)
    self._db.execute(
        "INSERT OR REPLACE INTO files (path, source, hash, mtime, size) "
        "VALUES (?, ?, ?, ?, ?)",
        (
            entry.path,
            source.value,
            entry.hash,
            int(entry.mtime_ms),
            entry.size,
        ),
    )
def _remove_file_chunks(self, path: str, source: MemorySource) -> None:
    """Delete every chunk (and its FTS mirror row) belonging to one file."""
    if self._fts_available:
        # FTS rows are keyed by chunk id, so collect the ids first.
        id_rows = self._db.execute(
            "SELECT id FROM chunks WHERE path = ? AND source = ?",
            (path, source.value),
        ).fetchall()
        for id_row in id_rows:
            try:
                self._db.execute(
                    "DELETE FROM chunks_fts WHERE id = ?", (id_row["id"],)
                )
            except Exception:
                # Best-effort cleanup: a missing FTS row is not fatal.
                pass
    self._db.execute(
        "DELETE FROM chunks WHERE path = ? AND source = ?",
        (path, source.value),
    )
def _remove_file(self, path: str, source: MemorySource) -> None:
    """Remove a file and all its chunks from the index.

    Deletes the chunk rows (plus FTS mirrors) first, then the file's
    bookkeeping row. No commit here — the caller commits.
    """
    self._remove_file_chunks(path, source)
    self._db.execute(
        "DELETE FROM files WHERE path = ? AND source = ?",
        (path, source.value),
    )
# ── Session Logs ─────────────────────────────────────────────────
def _sync_session_files(self, *, force: bool = False) -> int:
    """
    Sync session log files from the daily/ directory.

    Only top-level *.md regular files are considered; symlinks are
    skipped.

    Args:
        force: Re-index files even when their content hash is unchanged.

    Returns:
        The number of session files (re-)indexed this run.
    """
    sessions_dir = Path(self._sessions_dir)
    if not sessions_dir.is_dir():
        return 0
    indexed = 0
    active_paths: set[str] = set()
    for md_file in sorted(sessions_dir.glob("*.md")):
        # Skip symlinks and anything that is not a regular file.
        if md_file.is_symlink() or not md_file.is_file():
            continue
        entry = build_file_entry(str(md_file), self._workspace_dir)
        active_paths.add(entry.path)
        # Check if changed (content-hash comparison, as in _run_sync)
        row = self._db.execute(
            "SELECT hash FROM files WHERE path = ? AND source = ?",
            (entry.path, MemorySource.SESSIONS.value),
        ).fetchone()
        if not force and row and row["hash"] == entry.hash:
            continue
        self._index_file(entry, MemorySource.SESSIONS)
        indexed += 1
    # Clean stale session entries (logs deleted since the last sync)
    stale_rows = self._db.execute(
        "SELECT path FROM files WHERE source = ?",
        (MemorySource.SESSIONS.value,),
    ).fetchall()
    for stale in stale_rows:
        if stale["path"] not in active_paths:
            self._remove_file(stale["path"], MemorySource.SESSIONS)
    return indexed
def log_session(
self,
content: str,
*,
date: str | None = None,
channel: str = "slack",
) -> str:
"""
Append to today's session log in daily/.
Args:
content: The text to log (e.g., a user message or AI response).
date: Optional date string (YYYY-MM-DD). Defaults to today.
channel: Channel the conversation came from.
Returns:
Path to the session log file.
"""
if date is None:
date = time.strftime("%Y-%m-%d")
log_path = os.path.join(self._sessions_dir, f"{date}.md")
# Create file with header if it doesn't exist
if not os.path.exists(log_path):
header = f"# Session Log — {date}\n\n"
with open(log_path, "w", encoding="utf-8") as f:
f.write(header)
# Append the content
timestamp = time.strftime("%H:%M:%S")
with open(log_path, "a", encoding="utf-8") as f:
f.write(f"\n---\n\n**[{timestamp}] ({channel})**\n\n{content}\n")
# Mark as dirty for next sync
self._dirty = True
return log_path
# ── Identity File Access ─────────────────────────────────────────
def read_identity_file(self, name: str) -> str | None:
"""Read an identity file (SOUL.md, USER.md, MEMORY.md)."""
filepath = os.path.join(self._workspace_dir, name)
if not os.path.isfile(filepath):
return None
with open(filepath, "r", encoding="utf-8") as f:
return f.read()
def update_identity_file(self, name: str, content: str) -> None:
    """Overwrite an identity file and flag the index for re-sync."""
    target = os.path.join(self._workspace_dir, name)
    with open(target, "w", encoding="utf-8") as f:
        f.write(content)
    # Changed content must be re-indexed on the next sync.
    self._dirty = True
    logger.info(f"Updated identity file: {name}")
def read_soul(self) -> str | None:
return self.read_identity_file("SOUL.md")
def read_user(self) -> str | None:
return self.read_identity_file("USER.md")
def read_long_term_memory(self) -> str | None:
return self.read_identity_file("MEMORY.md")
def append_to_memory(self, entry: str) -> None:
    """Append a timestamped entry to MEMORY.md and mark the index dirty."""
    memory_path = os.path.join(self._workspace_dir, "MEMORY.md")
    stamp = time.strftime("%Y-%m-%d %H:%M")
    with open(memory_path, "a", encoding="utf-8") as f:
        f.write(f"\n### [{stamp}]\n\n{entry}\n")
    # New content is picked up by the next sync.
    self._dirty = True
    logger.info("Appended to MEMORY.md")
# ── File Reading ─────────────────────────────────────────────────
def read_file(
self,
rel_path: str,
*,
from_line: int | None = None,
num_lines: int | None = None,
) -> dict:
"""
Read a memory file by relative path.
Port of OpenClaw's readFile().
"""
raw = rel_path.strip()
if not raw:
raise ValueError("path required")
if os.path.isabs(raw):
abs_path = os.path.realpath(raw)
else:
abs_path = os.path.realpath(
os.path.join(self._workspace_dir, raw)
)
if not abs_path.endswith(".md"):
raise ValueError("Only .md files are supported")
if not os.path.isfile(abs_path):
raise FileNotFoundError(f"File not found: {abs_path}")
with open(abs_path, "r", encoding="utf-8") as f:
content = f.read()
if from_line is None and num_lines is None:
return {"text": content, "path": rel_path}
lines = content.split("\n")
start = max(1, from_line or 1)
count = max(1, num_lines or len(lines))
sliced = lines[start - 1 : start - 1 + count]
return {"text": "\n".join(sliced), "path": rel_path}
# ── Status ───────────────────────────────────────────────────────
def status(self) -> dict:
    """Return a snapshot of index state: row counts, paths, and the
    active search configuration."""
    def _single_count(sql: str) -> int:
        # Each query returns exactly one row with a single count column.
        row = self._db.execute(sql).fetchone()
        return row["c"] if row else 0

    file_count = _single_count("SELECT COUNT(*) as c FROM files")
    chunk_count = _single_count("SELECT COUNT(*) as c FROM chunks")
    cache_count = _single_count("SELECT COUNT(*) as c FROM embedding_cache")
    return {
        "workspace_dir": self._workspace_dir,
        "db_path": self._db_path,
        "sessions_dir": self._sessions_dir,
        "files": file_count,
        "chunks": chunk_count,
        "cached_embeddings": cache_count,
        "fts_available": self._fts_available,
        "dirty": self._dirty,
        "embedding_model": self._config.embedding_model,
        "embedding_dims": get_embedding_dims(self._config.embedding_model),
        "vector_weight": self._config.vector_weight,
        "text_weight": self._config.text_weight,
    }
# ── File Watching ────────────────────────────────────────────────
def start_watching(self) -> None:
    """
    Begin watching the workspace for markdown changes.

    Uses watchdog (optional dependency) for cross-platform file system
    events; on any change to a .md file the index is marked dirty so the
    next search triggers a re-sync. No-op when already watching or when
    watching is disabled in the config.
    """
    if self._watcher or not self._config.watch:
        return
    try:
        from watchdog.events import FileSystemEventHandler
        from watchdog.observers import Observer
    except ImportError:
        logger.warning(
            "watchdog not installed — file watching disabled. "
            "Install with: uv add watchdog"
        )
        return

    outer = self

    class _WorkspaceHandler(FileSystemEventHandler):
        """Marks the index dirty whenever a markdown file changes."""

        def on_any_event(self, event):
            if event.is_directory:
                return
            changed = getattr(event, "src_path", "")
            if changed.endswith(".md"):
                outer._dirty = True
                logger.debug(f"Workspace change detected: {changed}")

    watcher = Observer()
    watcher.schedule(_WorkspaceHandler(), self._workspace_dir, recursive=True)
    watcher.start()
    self._watcher = watcher
    logger.info(f"File watching started: {self._workspace_dir}")
def stop_watching(self) -> None:
    """Stop and join the file watcher, if one is running."""
    watcher = self._watcher
    if not watcher:
        return
    watcher.stop()
    watcher.join()
    self._watcher = None
    logger.info("File watching stopped")
# ── Lifecycle ────────────────────────────────────────────────────
def close(self) -> None:
    """Shut down the manager: stop watching and close the database.

    Idempotent — subsequent calls are no-ops.
    """
    if self._closed:
        return
    # Flip the flag first so re-entrant calls return immediately.
    self._closed = True
    self.stop_watching()
    self._db.close()
    logger.info("MemoryManager closed")
def __enter__(self):
return self
def __exit__(self, *args):
self.close()
def __del__(self):
if not self._closed:
try:
self.close()
except Exception:
pass