first commit

This commit is contained in:
Tanmay Karande
2026-02-13 23:56:09 -05:00
commit ec8bd80a3d
27 changed files with 6725 additions and 0 deletions

25
memory/__init__.py Normal file
View File

@@ -0,0 +1,25 @@
"""
Aetheel Memory System
=====================
Hybrid search memory with SQLite + markdown + local embeddings.
Inspired by OpenClaw's memory architecture (src/memory/):
• Identity files: SOUL.md, USER.md, MEMORY.md
• SQLite storage: chunks, FTS5, vector similarity
• Hybrid search: vector (0.7) + BM25 keyword (0.3)
• Local embeddings: fastembed ONNX (384-dim, zero API calls)
• File watching: auto re-index on workspace changes
• Session logs: daily/ conversation transcripts
Usage:
from memory import MemoryManager
manager = MemoryManager(workspace_dir="~/.aetheel/workspace")
await manager.sync()
results = await manager.search("what are my preferences?")
"""
from memory.manager import MemoryManager
from memory.types import MemorySearchResult, MemorySource
__all__ = ["MemoryManager", "MemorySearchResult", "MemorySource"]

88
memory/embeddings.py Normal file
View File

@@ -0,0 +1,88 @@
"""
Embedding provider for the memory system.
Uses fastembed (ONNX) for fully local, zero-API-call embeddings.
Inspired by OpenClaw's src/memory/embeddings.ts, simplified to:
• Single provider: fastembed with BAAI/bge-small-en-v1.5 (384-dim)
• Local only — no OpenAI/Voyage/Gemini API calls
• Thread-safe lazy initialization
"""
import logging
import threading
from memory.internal import normalize_embedding
logger = logging.getLogger("aetheel.memory.embeddings")
# The fastembed model is loaded lazily on first use
_model_lock = threading.Lock()
_model = None
_model_name: str | None = None
def _ensure_model(model_name: str = "BAAI/bge-small-en-v1.5"):
    """Return the shared fastembed model, loading it on first use.

    Thread-safe via double-checked locking: the lock-free fast path serves
    the common case, and the check is repeated under ``_model_lock`` so
    concurrent callers load the model exactly once. Requesting a different
    ``model_name`` replaces the cached instance.

    Raises:
        ImportError: if the optional ``fastembed`` package is not installed.
    """
    global _model, _model_name
    # Fast path: cached model matches the requested name.
    cached = _model if (_model is not None and _model_name == model_name) else None
    if cached is not None:
        return cached
    with _model_lock:
        # Re-check under the lock: another thread may have finished loading
        # while we were waiting.
        if _model is None or _model_name != model_name:
            try:
                from fastembed import TextEmbedding
            except ImportError:
                raise ImportError(
                    "fastembed is required for local embeddings.\n"
                    "Install with: uv add fastembed\n"
                    "Or: pip install fastembed"
                )
            logger.info(f"Loading embedding model: {model_name}...")
            _model = TextEmbedding(model_name=model_name)
            _model_name = model_name
            logger.info(f"Embedding model loaded: {model_name}")
        return _model
def embed_query(text: str, model_name: str = "BAAI/bge-small-en-v1.5") -> list[float]:
    """
    Embed a single query string with the (lazily loaded) fastembed model.
    Returns an L2-normalized vector (384-dim for the default model), or an
    empty list if the model produced no output.
    """
    model = _ensure_model(model_name)
    batch = list(model.query_embed([text]))
    if not batch:
        return []
    raw = batch[0].tolist()
    return normalize_embedding(raw)
def embed_batch(
    texts: list[str],
    model_name: str = "BAAI/bge-small-en-v1.5",
) -> list[list[float]]:
    """
    Embed a batch of passages with the (lazily loaded) fastembed model.
    Returns one L2-normalized vector per input text, in input order;
    an empty input yields an empty list without touching the model.
    """
    if not texts:
        return []
    model = _ensure_model(model_name)
    normalized: list[list[float]] = []
    for emb in model.passage_embed(texts):
        normalized.append(normalize_embedding(emb.tolist()))
    return normalized
def get_embedding_dims(model_name: str = "BAAI/bge-small-en-v1.5") -> int:
    """Return the output dimensionality for *model_name*.

    Looks the name up in a small table of known models; anything
    unrecognized falls back to 384 (the default model's width).
    """
    dims_by_model = {
        "BAAI/bge-small-en-v1.5": 384,
        "BAAI/bge-base-en-v1.5": 768,
        "sentence-transformers/all-MiniLM-L6-v2": 384,
    }
    if model_name in dims_by_model:
        return dims_by_model[model_name]
    return 384

111
memory/hybrid.py Normal file
View File

@@ -0,0 +1,111 @@
"""
Hybrid search — merges vector similarity + BM25 keyword results.
Direct port of OpenClaw's src/memory/hybrid.ts.
The algorithm:
1. Run vector search → ranked by cosine similarity
2. Run FTS5 keyword search → ranked by BM25
3. Merge by weighted score: 0.7 × vector + 0.3 × keyword
4. Deduplicate by chunk ID
5. Sort by combined score (descending)
"""
import re
from memory.types import MemorySearchResult, MemorySource
def build_fts_query(raw: str) -> str | None:
"""
Build an FTS5 match query from raw text.
Port of OpenClaw's buildFtsQuery() — quotes each token
and joins with AND for a conjunctive match.
Example: "hello world"'"hello" AND "world"'
"""
tokens = re.findall(r"[A-Za-z0-9_]+", raw)
if not tokens:
return None
quoted = [f'"{t}"' for t in tokens]
return " AND ".join(quoted)
def bm25_rank_to_score(rank: float) -> float:
    """
    Map an FTS5 BM25 rank onto the (0, 1] range.
    Port of OpenClaw's bm25RankToScore(). Negative ranks are clamped to
    zero (score 1.0); non-numeric input is treated as a very poor rank.
    """
    if isinstance(rank, (int, float)):
        clamped = rank if rank > 0.0 else 0.0
    else:
        clamped = 999.0
    return 1.0 / (1.0 + clamped)
def merge_hybrid_results(
    vector: list[dict],
    keyword: list[dict],
    vector_weight: float = 0.7,
    text_weight: float = 0.3,
) -> list[MemorySearchResult]:
    """
    Merge vector and keyword hits into one weighted, deduplicated ranking.
    Direct port of OpenClaw's mergeHybridResults() from hybrid.ts.

    Vector result dicts carry: id, path, start_line, end_line, source,
    snippet, vector_score. Keyword result dicts carry the same fields with
    text_score instead. Chunks appearing in both lists get both scores; a
    non-empty keyword snippet overrides the vector one. Final score is
    vector_weight * vector_score + text_weight * text_score, sorted
    descending.
    """
    combined: dict[str, dict] = {}

    def fresh_entry(r: dict) -> dict:
        # Common skeleton with both partial scores zeroed.
        return {
            "id": r["id"],
            "path": r["path"],
            "start_line": r["start_line"],
            "end_line": r["end_line"],
            "source": r["source"],
            "snippet": r["snippet"],
            "vector_score": 0.0,
            "text_score": 0.0,
        }

    for item in vector:
        entry = fresh_entry(item)
        entry["vector_score"] = item.get("vector_score", 0.0)
        combined[item["id"]] = entry

    for item in keyword:
        entry = combined.get(item["id"])
        if entry is None:
            # Keyword-only hit — no vector contribution.
            entry = fresh_entry(item)
            combined[item["id"]] = entry
        elif item.get("snippet"):
            # Prefer keyword snippet if available (often more relevant)
            entry["snippet"] = item["snippet"]
        entry["text_score"] = item.get("text_score", 0.0)

    results: list[MemorySearchResult] = []
    for entry in combined.values():
        weighted = (
            vector_weight * entry["vector_score"]
            + text_weight * entry["text_score"]
        )
        src = entry["source"]
        if isinstance(src, str):
            src = MemorySource(src)
        results.append(MemorySearchResult(
            path=entry["path"],
            start_line=entry["start_line"],
            end_line=entry["end_line"],
            score=weighted,
            snippet=entry["snippet"],
            source=src,
        ))
    results.sort(key=lambda r: r.score, reverse=True)
    return results

214
memory/internal.py Normal file
View File

@@ -0,0 +1,214 @@
"""
Internal utilities for the memory system.
Port of OpenClaw's src/memory/internal.ts:
• hashText — SHA-256 content hashing
• chunkMarkdown — split markdown into overlapping chunks
• listMemoryFiles — discover .md files in workspace
• buildFileEntry — create MemoryFileEntry from a file
• cosineSimilarity — vector similarity calculation
"""
import hashlib
import os
from pathlib import Path
from memory.types import MemoryChunk, MemoryFileEntry
def hash_text(value: str) -> str:
    """Hex-encoded SHA-256 of UTF-8 encoded text. Mirrors OpenClaw's hashText()."""
    digest = hashlib.sha256()
    digest.update(value.encode("utf-8"))
    return digest.hexdigest()
def chunk_markdown(
    content: str,
    chunk_tokens: int = 512,
    chunk_overlap: int = 50,
) -> list[MemoryChunk]:
    """
    Split markdown content into overlapping chunks.
    Direct port of OpenClaw's chunkMarkdown() from internal.ts.
    Uses character-based approximation: ~4 chars per token.

    Args:
        content: Raw markdown text; split on "\\n", line numbers are 1-indexed.
        chunk_tokens: Target chunk size in tokens (x4 gives a character
            budget, floored at 32 chars).
        chunk_overlap: Approximate overlap carried between consecutive
            chunks, in tokens (x4 for chars).

    Returns:
        MemoryChunk objects with start/end line numbers, the joined chunk
        text, and a SHA-256 hash of that text.
    """
    lines = content.split("\n")
    # Defensive only: str.split("\n") always yields at least [""].
    if not lines:
        return []
    max_chars = max(32, chunk_tokens * 4)
    overlap_chars = max(0, chunk_overlap * 4)
    chunks: list[MemoryChunk] = []
    current: list[tuple[str, int]] = []  # (line_text, 1-indexed line_no)
    current_chars = 0
    def flush() -> None:
        # Emit the accumulated lines as one chunk (no-op when empty).
        # Does NOT reset `current` — carry_overlap() handles that.
        nonlocal current, current_chars
        if not current:
            return
        text = "\n".join(line for line, _ in current)
        start_line = current[0][1]
        end_line = current[-1][1]
        chunks.append(MemoryChunk(
            start_line=start_line,
            end_line=end_line,
            text=text,
            hash=hash_text(text),
        ))
    def carry_overlap() -> None:
        # After a flush, keep the tail of the previous chunk (at least
        # `overlap_chars` worth of trailing lines) as the seed of the next
        # chunk so consecutive chunks share context.
        nonlocal current, current_chars
        if overlap_chars <= 0 or not current:
            current = []
            current_chars = 0
            return
        acc = 0
        kept: list[tuple[str, int]] = []
        # Walk backwards, prepending lines until the overlap budget is met.
        for line_text, line_no in reversed(current):
            acc += len(line_text) + 1
            kept.insert(0, (line_text, line_no))
            if acc >= overlap_chars:
                break
        current = kept
        # +1 per line accounts for the newline added when joining.
        current_chars = sum(len(lt) + 1 for lt, _ in kept)
    for i, line in enumerate(lines):
        line_no = i + 1
        # Handle very long lines by splitting into segments
        # (each segment keeps the same source line number).
        segments = [""] if not line else [
            line[start:start + max_chars]
            for start in range(0, len(line), max_chars)
        ]
        for segment in segments:
            line_size = len(segment) + 1
            # Flush before the budget would be exceeded, then seed the next
            # chunk with the overlap tail.
            if current_chars + line_size > max_chars and current:
                flush()
                carry_overlap()
            current.append((segment, line_no))
            current_chars += line_size
    # Emit whatever remains as the final chunk.
    flush()
    return chunks
def list_memory_files(
workspace_dir: str,
extra_paths: list[str] | None = None,
) -> list[str]:
"""
List all markdown files in the workspace memory directory.
Port of OpenClaw's listMemoryFiles() from internal.ts.
Searches for:
- MEMORY.md (or memory.md) in workspace root
- All .md files in memory/ subdirectory
- Any additional paths specified
"""
result: list[str] = []
ws = Path(workspace_dir).expanduser().resolve()
# Check MEMORY.md and memory.md in workspace root
for name in ("MEMORY.md", "memory.md"):
candidate = ws / name
if candidate.is_file() and not candidate.is_symlink():
result.append(str(candidate))
# Check SOUL.md and USER.md (identity files)
for name in ("SOUL.md", "USER.md"):
candidate = ws / name
if candidate.is_file() and not candidate.is_symlink():
result.append(str(candidate))
# Walk memory/ subdirectory
memory_dir = ws / "memory"
if memory_dir.is_dir() and not memory_dir.is_symlink():
_walk_md_files(memory_dir, result)
# Extra paths
if extra_paths:
for extra in extra_paths:
p = Path(extra).expanduser().resolve()
if p.is_symlink():
continue
if p.is_dir():
_walk_md_files(p, result)
elif p.is_file() and p.suffix == ".md":
result.append(str(p))
# Deduplicate by resolved path
seen: set[str] = set()
deduped: list[str] = []
for entry in result:
real = os.path.realpath(entry)
if real not in seen:
seen.add(real)
deduped.append(entry)
return deduped
def _walk_md_files(directory: Path, result: list[str]) -> None:
"""Recursively collect .md files from a directory."""
try:
for entry in sorted(directory.iterdir()):
if entry.is_symlink():
continue
if entry.is_dir():
_walk_md_files(entry, result)
elif entry.is_file() and entry.suffix == ".md":
result.append(str(entry))
except PermissionError:
pass
def build_file_entry(abs_path: str, workspace_dir: str) -> MemoryFileEntry:
    """
    Create a MemoryFileEntry from a file path.
    Port of OpenClaw's buildFileEntry() from internal.ts.

    Reads the file as UTF-8 to hash its content, and records the path
    relative to the workspace (forward-slash normalized), mtime in
    milliseconds, and size in bytes.
    """
    info = os.stat(abs_path)
    with open(abs_path, "r", encoding="utf-8") as handle:
        digest = hash_text(handle.read())
    relative = os.path.relpath(abs_path, workspace_dir).replace("\\", "/")
    return MemoryFileEntry(
        path=relative,
        abs_path=abs_path,
        mtime_ms=info.st_mtime * 1000,
        size=info.st_size,
        hash=digest,
    )
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """
    Cosine similarity between two vectors.
    Port of OpenClaw's cosineSimilarity() from internal.ts.

    Vectors of unequal length are compared over their common prefix.
    Returns 0.0 for empty or zero-magnitude inputs.
    """
    if not a or not b:
        return 0.0
    n = min(len(a), len(b))
    dot = sum(a[i] * b[i] for i in range(n))
    norm_a = sum(a[i] * a[i] for i in range(n))
    norm_b = sum(b[i] * b[i] for i in range(n))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a ** 0.5 * norm_b ** 0.5)
def normalize_embedding(vec: list[float]) -> list[float]:
    """
    L2-normalize an embedding vector.
    Port of OpenClaw's sanitizeAndNormalizeEmbedding().

    Non-numeric entries and NaNs (``v == v`` is False only for NaN) are
    replaced with 0.0 first. Near-zero vectors (magnitude < 1e-10) are
    returned sanitized but unscaled to avoid division blow-up.
    """
    cleaned = []
    for v in vec:
        is_finite_number = isinstance(v, (int, float)) and v == v
        cleaned.append(v if is_finite_number else 0.0)
    norm_sq = 0.0
    for v in cleaned:
        norm_sq += v * v
    magnitude = norm_sq ** 0.5
    if magnitude < 1e-10:
        return cleaned
    return [v / magnitude for v in cleaned]

839
memory/manager.py Normal file
View File

@@ -0,0 +1,839 @@
"""
MemoryManager — the main memory system orchestrator.
Port of OpenClaw's MemoryIndexManager (src/memory/manager.ts, 2,300 LOC).
Lifecycle: sync → chunk → embed → store → search
Key features:
• Incremental sync — only re-indexes changed files (hash-based)
• Hybrid search — vector (0.7) + BM25 keyword (0.3)
• File watching — auto re-index on workspace changes (via watchdog)
• Embedding cache — avoids re-computing embeddings for unchanged chunks
• Session log indexing — indexes daily/ conversation transcripts
"""
import json
import logging
import os
import sqlite3
import threading
import time
import uuid
from pathlib import Path
from memory.embeddings import embed_batch, embed_query, get_embedding_dims
from memory.hybrid import bm25_rank_to_score, build_fts_query, merge_hybrid_results
from memory.internal import (
build_file_entry,
chunk_markdown,
hash_text,
list_memory_files,
)
from memory.schema import ensure_schema
from memory.types import (
MemoryConfig,
MemorySearchResult,
MemorySource,
)
logger = logging.getLogger("aetheel.memory")
SNIPPET_MAX_CHARS = 700
class MemoryManager:
"""
Main memory system — manages the full lifecycle:
sync → chunk → embed → store → search
Inspired by OpenClaw's MemoryIndexManager.
"""
def __init__(self, config: MemoryConfig | None = None):
self._config = config or MemoryConfig()
self._workspace_dir = str(
Path(self._config.workspace_dir).expanduser().resolve()
)
self._db_path = str(Path(self._config.db_path).expanduser().resolve())
self._sessions_dir = (
str(Path(self._config.sessions_dir).expanduser().resolve())
if self._config.sessions_dir
else os.path.join(self._workspace_dir, "daily")
)
# Ensure directories exist
os.makedirs(self._workspace_dir, exist_ok=True)
os.makedirs(self._sessions_dir, exist_ok=True)
os.makedirs(os.path.dirname(self._db_path), exist_ok=True)
# Open database and ensure schema
self._db = sqlite3.connect(self._db_path, check_same_thread=False)
self._db.row_factory = sqlite3.Row
schema_result = ensure_schema(self._db, fts_enabled=True)
self._fts_available = schema_result.get("fts_available", False)
# State
self._dirty = True
self._syncing = False
self._sync_lock = threading.Lock()
self._closed = False
self._watcher = None
# Create default identity files if they don't exist
self._ensure_identity_files()
logger.info(
f"MemoryManager initialized: workspace={self._workspace_dir}, "
f"db={self._db_path}, fts={self._fts_available}"
)
# ── Identity File Bootstrap ──────────────────────────────────────
def _ensure_identity_files(self) -> None:
"""Create default SOUL.md, USER.md, MEMORY.md if they don't exist."""
defaults = {
"SOUL.md": (
"# SOUL.md — Who You Are\n\n"
"_You're not a chatbot. You're becoming someone._\n\n"
"## Core Truths\n\n"
"**Be genuinely helpful, not performatively helpful.** "
"Skip the filler — just help.\n\n"
"**Have opinions.** You're allowed to disagree, prefer things, "
"find stuff amusing or boring.\n\n"
"**Be resourceful before asking.** Try to figure it out first. "
"Then ask if you're stuck.\n\n"
"**Earn trust through competence.** Be careful with external actions. "
"Be bold with internal ones.\n\n"
"## Boundaries\n\n"
"- Private things stay private. Period.\n"
"- When in doubt, ask before acting externally.\n"
"- Never send half-baked replies.\n\n"
"## Continuity\n\n"
"Each session, you wake up fresh. These files _are_ your memory. "
"Read them. Update them. They're how you persist.\n\n"
"---\n\n"
"_This file is yours to evolve. As you learn who you are, update it._\n"
),
"USER.md": (
"# USER.md — Who I Am\n\n"
"## About Me\n\n"
"<!-- Fill in your details -->\n"
"- **Name:** \n"
"- **Role:** \n"
"- **Location:** \n"
"- **Timezone:** \n\n"
"## Preferences\n\n"
"<!-- How you like to communicate -->\n"
"- **Communication style:** \n"
"- **Response length:** \n"
"- **Technical level:** \n\n"
"## Current Focus\n\n"
"<!-- What you're working on -->\n\n"
"## Tools & Services\n\n"
"<!-- Services you use regularly -->\n\n"
"---\n\n"
"_Update this file as your preferences evolve._\n"
),
"MEMORY.md": (
"# MEMORY.md — Long-Term Memory\n\n"
"## Decisions & Lessons\n\n"
"<!-- Record important decisions and lessons learned -->\n\n"
"## Context\n\n"
"<!-- Persistent context that should carry across sessions -->\n\n"
"## Notes\n\n"
"<!-- Anything worth remembering -->\n\n"
"---\n\n"
"_This file persists across sessions. "
"Update it when you learn something important._\n"
),
}
for filename, content in defaults.items():
filepath = os.path.join(self._workspace_dir, filename)
if not os.path.exists(filepath):
with open(filepath, "w", encoding="utf-8") as f:
f.write(content)
logger.info(f"Created default identity file: {filepath}")
# ── Search ───────────────────────────────────────────────────────
async def search(
self,
query: str,
*,
max_results: int | None = None,
min_score: float | None = None,
) -> list[MemorySearchResult]:
"""
Search memory using hybrid vector + keyword search.
Port of OpenClaw's MemoryIndexManager.search().
Steps:
1. (Optional) Trigger sync if dirty
2. Run FTS5 keyword search → BM25 scored
3. Generate query embedding → vector search
4. Merge results with weighted scoring (0.7v + 0.3k)
5. Filter by min_score and return top-N results
"""
# Auto-sync if dirty
if self._config.sync_on_search and self._dirty:
await self.sync()
cleaned = query.strip()
if not cleaned:
return []
max_r = max_results or self._config.max_results
min_s = min_score if min_score is not None else self._config.min_score
candidates = min(200, max(1, max_r * 3))
# Keyword search (BM25)
keyword_results = self._search_keyword(cleaned, candidates)
# Vector search
try:
query_vec = embed_query(cleaned, self._config.embedding_model)
has_vector = any(v != 0 for v in query_vec)
except Exception as e:
logger.warning(f"Embedding failed, falling back to keyword-only: {e}")
query_vec = []
has_vector = False
vector_results = (
self._search_vector(query_vec, candidates) if has_vector else []
)
# If no keyword results, return vector-only
if not keyword_results:
return [
r for r in self._vector_to_search_results(vector_results)
if r.score >= min_s
][:max_r]
# Merge hybrid results
merged = merge_hybrid_results(
vector=vector_results,
keyword=keyword_results,
vector_weight=self._config.vector_weight,
text_weight=self._config.text_weight,
)
return [r for r in merged if r.score >= min_s][:max_r]
def _search_vector(
self, query_vec: list[float], limit: int
) -> list[dict]:
"""
Search chunks by vector cosine similarity.
Uses embedding stored as JSON in the chunks table.
"""
if not query_vec:
return []
try:
rows = self._db.execute(
"SELECT id, path, start_line, end_line, source, text, embedding "
"FROM chunks ORDER BY rowid"
).fetchall()
except Exception as e:
logger.warning(f"Vector search failed: {e}")
return []
from memory.internal import cosine_similarity
results = []
for row in rows:
try:
stored_vec = json.loads(row["embedding"])
if not stored_vec:
continue
score = cosine_similarity(query_vec, stored_vec)
snippet = row["text"][:SNIPPET_MAX_CHARS]
results.append({
"id": row["id"],
"path": row["path"],
"start_line": row["start_line"],
"end_line": row["end_line"],
"source": row["source"],
"snippet": snippet,
"vector_score": max(0.0, score),
})
except (json.JSONDecodeError, TypeError):
continue
results.sort(key=lambda r: r["vector_score"], reverse=True)
return results[:limit]
def _search_keyword(self, query: str, limit: int) -> list[dict]:
"""
Search chunks using FTS5 full-text search with BM25 ranking.
Port of OpenClaw's searchKeyword().
"""
if not self._fts_available:
return []
fts_query = build_fts_query(query)
if not fts_query:
return []
try:
rows = self._db.execute(
"SELECT id, path, start_line, end_line, source, text, "
"rank AS bm25_rank "
"FROM chunks_fts "
"WHERE chunks_fts MATCH ? "
"ORDER BY rank "
"LIMIT ?",
(fts_query, limit),
).fetchall()
except Exception as e:
logger.debug(f"FTS search failed for query '{fts_query}': {e}")
return []
results = []
for row in rows:
# FTS5 rank is negative (lower = better), convert to 0-1 score
bm25_rank = abs(row["bm25_rank"]) if row["bm25_rank"] else 999.0
text_score = bm25_rank_to_score(bm25_rank)
snippet = row["text"][:SNIPPET_MAX_CHARS]
results.append({
"id": row["id"],
"path": row["path"],
"start_line": row["start_line"],
"end_line": row["end_line"],
"source": row["source"],
"snippet": snippet,
"text_score": text_score,
})
return results
def _vector_to_search_results(
self, vector_results: list[dict]
) -> list[MemorySearchResult]:
"""Convert raw vector results to MemorySearchResult objects."""
return [
MemorySearchResult(
path=r["path"],
start_line=r["start_line"],
end_line=r["end_line"],
score=r["vector_score"],
snippet=r["snippet"],
source=MemorySource(r["source"]),
)
for r in vector_results
]
# ── Sync ─────────────────────────────────────────────────────────
async def sync(self, *, force: bool = False) -> dict:
"""
Synchronize workspace markdown files into the index.
Port of OpenClaw's MemoryIndexManager.sync().
Steps:
1. List all memory files (SOUL.md, USER.md, MEMORY.md, memory/*)
2. For each file, check if content hash has changed
3. If changed: chunk → embed → store in DB
4. Remove stale entries for deleted files
5. Optionally sync session logs from daily/
Returns a summary dict with counts.
"""
if self._syncing and not force:
logger.debug("Sync already in progress, skipping")
return {"skipped": True}
with self._sync_lock:
self._syncing = True
try:
return self._run_sync(force=force)
finally:
self._syncing = False
self._dirty = False
def _run_sync(self, *, force: bool = False) -> dict:
"""Execute the actual sync logic."""
stats = {
"files_found": 0,
"files_indexed": 0,
"files_skipped": 0,
"chunks_created": 0,
"stale_removed": 0,
"sessions_indexed": 0,
}
# ── Memory files ──
if "memory" in self._config.sources:
files = list_memory_files(self._workspace_dir)
stats["files_found"] = len(files)
active_paths: set[str] = set()
for abs_path in files:
entry = build_file_entry(abs_path, self._workspace_dir)
active_paths.add(entry.path)
# Check if file has changed
row = self._db.execute(
"SELECT hash FROM files WHERE path = ? AND source = ?",
(entry.path, MemorySource.MEMORY.value),
).fetchone()
if not force and row and row["hash"] == entry.hash:
stats["files_skipped"] += 1
continue
# File is new or changed — re-index it
self._index_file(entry, MemorySource.MEMORY)
stats["files_indexed"] += 1
# Remove stale entries for deleted files
stale_rows = self._db.execute(
"SELECT path FROM files WHERE source = ?",
(MemorySource.MEMORY.value,),
).fetchall()
for stale in stale_rows:
if stale["path"] not in active_paths:
self._remove_file(stale["path"], MemorySource.MEMORY)
stats["stale_removed"] += 1
# ── Session files ──
if "sessions" in self._config.sources:
session_count = self._sync_session_files(force=force)
stats["sessions_indexed"] = session_count
# Count total chunks
row = self._db.execute("SELECT COUNT(*) as c FROM chunks").fetchone()
stats["chunks_created"] = row["c"] if row else 0
self._db.commit()
logger.info(
f"Sync complete: {stats['files_indexed']} indexed, "
f"{stats['files_skipped']} unchanged, "
f"{stats['stale_removed']} removed, "
f"{stats['chunks_created']} total chunks"
)
return stats
def _index_file(self, entry, source: MemorySource) -> None:
"""
Index a single file: read → chunk → embed → store.
Port of OpenClaw's indexFile method.
"""
try:
with open(entry.abs_path, "r", encoding="utf-8") as f:
content = f.read()
except Exception as e:
logger.warning(f"Failed to read {entry.abs_path}: {e}")
return
if not content.strip():
return
# Chunk the content
chunks = chunk_markdown(
content,
chunk_tokens=self._config.chunk_tokens,
chunk_overlap=self._config.chunk_overlap,
)
if not chunks:
return
# Check embedding cache and compute new embeddings
texts_to_embed = []
chunk_hashes = []
cached_embeddings: dict[str, list[float]] = {}
for chunk in chunks:
# Check cache first
cache_row = self._db.execute(
"SELECT embedding FROM embedding_cache WHERE model = ? AND hash = ?",
(self._config.embedding_model, chunk.hash),
).fetchone()
if cache_row:
cached_embeddings[chunk.hash] = json.loads(cache_row["embedding"])
else:
texts_to_embed.append(chunk.text)
chunk_hashes.append(chunk.hash)
# Batch embed uncached chunks
new_embeddings: dict[str, list[float]] = {}
if texts_to_embed:
try:
vectors = embed_batch(texts_to_embed, self._config.embedding_model)
now = int(time.time())
for i, chunk_hash in enumerate(chunk_hashes):
vec = vectors[i] if i < len(vectors) else []
new_embeddings[chunk_hash] = vec
# Store in cache
self._db.execute(
"INSERT OR REPLACE INTO embedding_cache "
"(model, hash, embedding, dims, updated_at) "
"VALUES (?, ?, ?, ?, ?)",
(
self._config.embedding_model,
chunk_hash,
json.dumps(vec),
len(vec),
now,
),
)
except Exception as e:
logger.warning(f"Embedding batch failed for {entry.path}: {e}")
# Fall back to empty embeddings
for chunk_hash in chunk_hashes:
new_embeddings[chunk_hash] = []
# Remove old chunks for this file
self._remove_file_chunks(entry.path, source)
# Insert new chunks
now = int(time.time())
for chunk in chunks:
chunk_id = str(uuid.uuid4())
embedding = cached_embeddings.get(chunk.hash) or new_embeddings.get(
chunk.hash, []
)
self._db.execute(
"INSERT INTO chunks "
"(id, path, source, start_line, end_line, hash, model, text, embedding, updated_at) "
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
(
chunk_id,
entry.path,
source.value,
chunk.start_line,
chunk.end_line,
chunk.hash,
self._config.embedding_model,
chunk.text,
json.dumps(embedding),
now,
),
)
# Insert into FTS index
if self._fts_available:
try:
self._db.execute(
"INSERT INTO chunks_fts "
"(text, id, path, source, model, start_line, end_line) "
"VALUES (?, ?, ?, ?, ?, ?, ?)",
(
chunk.text,
chunk_id,
entry.path,
source.value,
self._config.embedding_model,
chunk.start_line,
chunk.end_line,
),
)
except Exception as e:
logger.debug(f"FTS insert failed for chunk {chunk_id}: {e}")
# Update files table
self._db.execute(
"INSERT OR REPLACE INTO files (path, source, hash, mtime, size) "
"VALUES (?, ?, ?, ?, ?)",
(
entry.path,
source.value,
entry.hash,
int(entry.mtime_ms),
entry.size,
),
)
def _remove_file_chunks(self, path: str, source: MemorySource) -> None:
"""Remove all chunks (and FTS entries) for a given file."""
# Get chunk IDs for FTS cleanup
if self._fts_available:
chunk_ids = self._db.execute(
"SELECT id FROM chunks WHERE path = ? AND source = ?",
(path, source.value),
).fetchall()
for row in chunk_ids:
try:
self._db.execute(
"DELETE FROM chunks_fts WHERE id = ?", (row["id"],)
)
except Exception:
pass
self._db.execute(
"DELETE FROM chunks WHERE path = ? AND source = ?",
(path, source.value),
)
def _remove_file(self, path: str, source: MemorySource) -> None:
"""Remove a file and all its chunks from the index."""
self._remove_file_chunks(path, source)
self._db.execute(
"DELETE FROM files WHERE path = ? AND source = ?",
(path, source.value),
)
# ── Session Logs ─────────────────────────────────────────────────
def _sync_session_files(self, *, force: bool = False) -> int:
"""
Sync session log files from the daily/ directory.
Returns the number of session files indexed.
"""
sessions_dir = Path(self._sessions_dir)
if not sessions_dir.is_dir():
return 0
indexed = 0
active_paths: set[str] = set()
for md_file in sorted(sessions_dir.glob("*.md")):
if md_file.is_symlink() or not md_file.is_file():
continue
entry = build_file_entry(str(md_file), self._workspace_dir)
active_paths.add(entry.path)
# Check if changed
row = self._db.execute(
"SELECT hash FROM files WHERE path = ? AND source = ?",
(entry.path, MemorySource.SESSIONS.value),
).fetchone()
if not force and row and row["hash"] == entry.hash:
continue
self._index_file(entry, MemorySource.SESSIONS)
indexed += 1
# Clean stale session entries
stale_rows = self._db.execute(
"SELECT path FROM files WHERE source = ?",
(MemorySource.SESSIONS.value,),
).fetchall()
for stale in stale_rows:
if stale["path"] not in active_paths:
self._remove_file(stale["path"], MemorySource.SESSIONS)
return indexed
def log_session(
self,
content: str,
*,
date: str | None = None,
channel: str = "slack",
) -> str:
"""
Append to today's session log in daily/.
Args:
content: The text to log (e.g., a user message or AI response).
date: Optional date string (YYYY-MM-DD). Defaults to today.
channel: Channel the conversation came from.
Returns:
Path to the session log file.
"""
if date is None:
date = time.strftime("%Y-%m-%d")
log_path = os.path.join(self._sessions_dir, f"{date}.md")
# Create file with header if it doesn't exist
if not os.path.exists(log_path):
header = f"# Session Log — {date}\n\n"
with open(log_path, "w", encoding="utf-8") as f:
f.write(header)
# Append the content
timestamp = time.strftime("%H:%M:%S")
with open(log_path, "a", encoding="utf-8") as f:
f.write(f"\n---\n\n**[{timestamp}] ({channel})**\n\n{content}\n")
# Mark as dirty for next sync
self._dirty = True
return log_path
# ── Identity File Access ─────────────────────────────────────────
def read_identity_file(self, name: str) -> str | None:
"""Read an identity file (SOUL.md, USER.md, MEMORY.md)."""
filepath = os.path.join(self._workspace_dir, name)
if not os.path.isfile(filepath):
return None
with open(filepath, "r", encoding="utf-8") as f:
return f.read()
def update_identity_file(self, name: str, content: str) -> None:
"""Update an identity file and mark index as dirty."""
filepath = os.path.join(self._workspace_dir, name)
with open(filepath, "w", encoding="utf-8") as f:
f.write(content)
self._dirty = True
logger.info(f"Updated identity file: {name}")
def read_soul(self) -> str | None:
return self.read_identity_file("SOUL.md")
def read_user(self) -> str | None:
return self.read_identity_file("USER.md")
def read_long_term_memory(self) -> str | None:
return self.read_identity_file("MEMORY.md")
def append_to_memory(self, entry: str) -> None:
"""Append a new entry to MEMORY.md."""
filepath = os.path.join(self._workspace_dir, "MEMORY.md")
timestamp = time.strftime("%Y-%m-%d %H:%M")
with open(filepath, "a", encoding="utf-8") as f:
f.write(f"\n### [{timestamp}]\n\n{entry}\n")
self._dirty = True
logger.info("Appended to MEMORY.md")
# ── File Reading ─────────────────────────────────────────────────
def read_file(
self,
rel_path: str,
*,
from_line: int | None = None,
num_lines: int | None = None,
) -> dict:
"""
Read a memory file by relative path.
Port of OpenClaw's readFile().
"""
raw = rel_path.strip()
if not raw:
raise ValueError("path required")
if os.path.isabs(raw):
abs_path = os.path.realpath(raw)
else:
abs_path = os.path.realpath(
os.path.join(self._workspace_dir, raw)
)
if not abs_path.endswith(".md"):
raise ValueError("Only .md files are supported")
if not os.path.isfile(abs_path):
raise FileNotFoundError(f"File not found: {abs_path}")
with open(abs_path, "r", encoding="utf-8") as f:
content = f.read()
if from_line is None and num_lines is None:
return {"text": content, "path": rel_path}
lines = content.split("\n")
start = max(1, from_line or 1)
count = max(1, num_lines or len(lines))
sliced = lines[start - 1 : start - 1 + count]
return {"text": "\n".join(sliced), "path": rel_path}
# ── Status ───────────────────────────────────────────────────────
def status(self) -> dict:
"""Get the current status of the memory index."""
files_row = self._db.execute(
"SELECT COUNT(*) as c FROM files"
).fetchone()
chunks_row = self._db.execute(
"SELECT COUNT(*) as c FROM chunks"
).fetchone()
cache_row = self._db.execute(
"SELECT COUNT(*) as c FROM embedding_cache"
).fetchone()
return {
"workspace_dir": self._workspace_dir,
"db_path": self._db_path,
"sessions_dir": self._sessions_dir,
"files": files_row["c"] if files_row else 0,
"chunks": chunks_row["c"] if chunks_row else 0,
"cached_embeddings": cache_row["c"] if cache_row else 0,
"fts_available": self._fts_available,
"dirty": self._dirty,
"embedding_model": self._config.embedding_model,
"embedding_dims": get_embedding_dims(self._config.embedding_model),
"vector_weight": self._config.vector_weight,
"text_weight": self._config.text_weight,
}
# ── File Watching ────────────────────────────────────────────────
def start_watching(self) -> None:
    """
    Begin monitoring the workspace directory for markdown changes.

    Uses watchdog for cross-platform file system events. Any event on a
    .md file simply marks the index dirty so the next sync re-indexes;
    no-op when already watching or when watching is disabled in config.
    """
    if self._watcher or not self._config.watch:
        return
    try:
        from watchdog.events import FileSystemEventHandler
        from watchdog.observers import Observer
    except ImportError:
        # Degrade gracefully: memory still works, just without auto re-index.
        logger.warning(
            "watchdog not installed — file watching disabled. "
            "Install with: uv add watchdog"
        )
        return

    outer = self

    class _WorkspaceHandler(FileSystemEventHandler):
        # Flag the index as stale whenever any markdown file changes.
        def on_any_event(self, event):
            if event.is_directory:
                return
            path = getattr(event, "src_path", "")
            if path.endswith(".md"):
                outer._dirty = True
                logger.debug(f"Workspace change detected: {path}")

    watcher = Observer()
    watcher.schedule(_WorkspaceHandler(), self._workspace_dir, recursive=True)
    watcher.start()
    self._watcher = watcher
    logger.info(f"File watching started: {self._workspace_dir}")
def stop_watching(self) -> None:
    """Shut down and discard the workspace watcher, if one is running."""
    watcher = self._watcher
    if watcher is None:
        return
    watcher.stop()
    watcher.join()
    self._watcher = None
    logger.info("File watching stopped")
# ── Lifecycle ────────────────────────────────────────────────────
def close(self) -> None:
    """Tear down the manager: stop watching and close the database.

    Idempotent — subsequent calls return immediately.
    """
    if not self._closed:
        # Flip the flag first so repeated / re-entrant calls are no-ops.
        self._closed = True
        self.stop_watching()
        self._db.close()
        logger.info("MemoryManager closed")
def __enter__(self):
    """Enter the context manager; returns this manager unchanged."""
    return self
def __exit__(self, exc_type, exc_val, exc_tb):
    """Exit the context manager, closing the manager.

    Parameters follow the standard context-manager protocol; they are
    ignored, and the implicit None return means exceptions raised inside
    the `with` block are never suppressed.
    """
    self.close()
def __del__(self):
    """Finalizer: best-effort close if the manager was never closed.

    Uses getattr with a default of True (i.e. "already closed") so a
    partially-constructed instance — where __init__ raised before setting
    _closed — does not itself raise AttributeError during garbage
    collection.
    """
    if not getattr(self, "_closed", True):
        try:
            self.close()
        except Exception:
            # Never propagate from a finalizer.
            pass

124
memory/schema.py Normal file
View File

@@ -0,0 +1,124 @@
"""
SQLite schema for the memory system.
Port of OpenClaw's src/memory/memory-schema.ts.
Tables:
• meta — key-value store for index metadata
• files — tracked files with content hashes (for incremental sync)
• chunks — text chunks with embeddings
• chunks_fts — FTS5 virtual table for keyword/BM25 search
• chunks_vec — sqlite-vec virtual table for vector similarity (optional; not created by ensure_schema in this module)
"""
import logging
import sqlite3
logger = logging.getLogger("aetheel.memory.schema")
def ensure_schema(
    db: sqlite3.Connection,
    *,
    fts_enabled: bool = True,
) -> dict:
    """
    Create all required tables and indices if they don't exist.

    Args:
        db: Open SQLite connection; this function commits before returning.
        fts_enabled: When True, attempt to create the FTS5 virtual table.
            Creation failures (e.g. SQLite built without FTS5) are caught
            and reported rather than raised.

    Returns:
        Dict with 'fts_available' (bool) and, when FTS5 setup failed,
        'fts_error' (str).
    """
    # Meta table — key-value store for index metadata (model, dimensions, etc.)
    db.execute("""
        CREATE TABLE IF NOT EXISTS meta (
            key TEXT PRIMARY KEY,
            value TEXT NOT NULL
        )
    """)
    # Files table — tracks indexed files and their content hash so sync
    # can skip unchanged files (incremental re-indexing).
    db.execute("""
        CREATE TABLE IF NOT EXISTS files (
            path TEXT NOT NULL,
            source TEXT NOT NULL DEFAULT 'memory',
            hash TEXT NOT NULL,
            mtime INTEGER NOT NULL,
            size INTEGER NOT NULL,
            PRIMARY KEY (path, source)
        )
    """)
    # Chunks table — stores text chunks and their embeddings
    db.execute("""
        CREATE TABLE IF NOT EXISTS chunks (
            id TEXT PRIMARY KEY,
            path TEXT NOT NULL,
            source TEXT NOT NULL DEFAULT 'memory',
            start_line INTEGER NOT NULL,
            end_line INTEGER NOT NULL,
            hash TEXT NOT NULL,
            model TEXT NOT NULL,
            text TEXT NOT NULL,
            embedding TEXT NOT NULL,
            updated_at INTEGER NOT NULL
        )
    """)
    # Indices for efficient lookups
    db.execute("CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path)")
    db.execute("CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source)")
    db.execute("CREATE INDEX IF NOT EXISTS idx_chunks_hash ON chunks(hash)")
    # FTS5 full-text search table for keyword/BM25 matching
    fts_available = False
    fts_error = None
    if fts_enabled:
        try:
            db.execute("""
                CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
                    text,
                    id UNINDEXED,
                    path UNINDEXED,
                    source UNINDEXED,
                    model UNINDEXED,
                    start_line UNINDEXED,
                    end_line UNINDEXED
                )
            """)
            fts_available = True
        except sqlite3.Error as e:
            # Narrowed from `except Exception`: only database-level failures
            # mean "FTS5 unavailable"; anything else should propagate.
            fts_error = str(e)
            logger.warning(f"FTS5 unavailable: {fts_error}")
    # Embedding cache — avoids re-computing embeddings for unchanged text,
    # keyed by (model, content hash).
    db.execute("""
        CREATE TABLE IF NOT EXISTS embedding_cache (
            model TEXT NOT NULL,
            hash TEXT NOT NULL,
            embedding TEXT NOT NULL,
            dims INTEGER,
            updated_at INTEGER NOT NULL,
            PRIMARY KEY (model, hash)
        )
    """)
    db.execute(
        "CREATE INDEX IF NOT EXISTS idx_embedding_cache_updated_at "
        "ON embedding_cache(updated_at)"
    )
    # Session logs table — tracks daily session transcripts
    db.execute("""
        CREATE TABLE IF NOT EXISTS session_logs (
            session_date TEXT NOT NULL,
            channel TEXT NOT NULL DEFAULT 'slack',
            user_id TEXT,
            summary TEXT,
            raw_transcript TEXT,
            created_at INTEGER NOT NULL,
            PRIMARY KEY (session_date, channel)
        )
    """)
    db.commit()
    result = {"fts_available": fts_available}
    if fts_error:
        result["fts_error"] = fts_error
    return result

104
memory/types.py Normal file
View File

@@ -0,0 +1,104 @@
"""
Memory system types — mirrors OpenClaw's src/memory/types.ts.
"""
from dataclasses import dataclass, field
from enum import Enum
class MemorySource(str, Enum):
    """Source of a memory entry — either workspace markdown or session logs."""

    # Chunks indexed from workspace markdown files.
    MEMORY = "memory"
    # Chunks indexed from session transcript logs.
    SESSIONS = "sessions"
@dataclass
class MemorySearchResult:
    """
    A single search result from the memory system.

    Mirrors OpenClaw's MemorySearchResult type.
    """

    path: str                    # path of the file containing the match
    start_line: int              # first line of the matched chunk — presumably 1-based; confirm against indexer
    end_line: int                # last line of the matched chunk
    score: float                 # relevance score used for ranking
    snippet: str                 # excerpt of the matched text
    source: MemorySource         # workspace markdown vs. session logs
    citation: str | None = None  # optional preformatted citation string
@dataclass
class MemoryChunk:
    """
    A chunk of text extracted from a markdown file.

    Mirrors OpenClaw's MemoryChunk from internal.ts.
    """

    start_line: int  # first source line covered by this chunk
    end_line: int    # last source line covered by this chunk
    text: str        # the chunk's raw text content
    hash: str        # content hash of the chunk text — presumably the embedding-cache key; verify
@dataclass
class MemoryFileEntry:
    """
    Metadata about an indexed markdown file.

    Mirrors OpenClaw's MemoryFileEntry from internal.ts. The content hash
    lets incremental sync skip files that have not changed.
    """

    path: str        # relative path within workspace
    abs_path: str    # absolute filesystem path
    mtime_ms: float  # modification time (ms since epoch)
    size: int        # file size in bytes
    hash: str        # SHA-256 of file content
@dataclass
class SessionFileEntry:
    """
    Metadata about an indexed session transcript file.

    Mirrors OpenClaw's SessionFileEntry from session-files.ts.
    """

    path: str        # relative path (sessions/<filename>)
    abs_path: str    # absolute filesystem path
    mtime_ms: float  # modification time (ms since epoch)
    size: int        # file size in bytes
    hash: str        # content hash — presumably SHA-256 like MemoryFileEntry; confirm
    content: str     # extracted text content
    # Maps lines of the extracted content back to the original file —
    # NOTE(review): exact mapping semantics not visible here; confirm
    # against the session-file indexer.
    line_map: list[int] = field(default_factory=list)
@dataclass
class MemoryConfig:
"""
Configuration for the memory system.
"""
# Workspace directory containing SOUL.md, USER.md, MEMORY.md, etc.
workspace_dir: str = "~/.aetheel/workspace"
# SQLite database path (created automatically)
db_path: str = "~/.aetheel/memory.db"
# Chunking
chunk_tokens: int = 512
chunk_overlap: int = 50
# Search
max_results: int = 10
min_score: float = 0.1
vector_weight: float = 0.7
text_weight: float = 0.3
# Embedding
embedding_model: str = "BAAI/bge-small-en-v1.5"
embedding_dims: int = 384
# Sync
watch: bool = True
watch_debounce_ms: int = 2000
sync_on_search: bool = True
# Session logs
sessions_dir: str | None = None # defaults to workspace_dir/daily/
# Sources to index
sources: list[str] = field(default_factory=lambda: ["memory", "sessions"])