first commit
This commit is contained in:
25
memory/__init__.py
Normal file
25
memory/__init__.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""
|
||||
Aetheel Memory System
|
||||
=====================
|
||||
Hybrid search memory with SQLite + markdown + local embeddings.
|
||||
|
||||
Inspired by OpenClaw's memory architecture (src/memory/):
|
||||
• Identity files: SOUL.md, USER.md, MEMORY.md
|
||||
• SQLite storage: chunks, FTS5, vector similarity
|
||||
• Hybrid search: vector (0.7) + BM25 keyword (0.3)
|
||||
• Local embeddings: fastembed ONNX (384-dim, zero API calls)
|
||||
• File watching: auto re-index on workspace changes
|
||||
• Session logs: daily/ conversation transcripts
|
||||
|
||||
Usage:
|
||||
from memory import MemoryManager
|
||||
|
||||
manager = MemoryManager(workspace_dir="~/.aetheel/workspace")
|
||||
await manager.sync()
|
||||
results = await manager.search("what are my preferences?")
|
||||
"""
|
||||
|
||||
from memory.manager import MemoryManager
|
||||
from memory.types import MemorySearchResult, MemorySource
|
||||
|
||||
__all__ = ["MemoryManager", "MemorySearchResult", "MemorySource"]
|
||||
88
memory/embeddings.py
Normal file
88
memory/embeddings.py
Normal file
@@ -0,0 +1,88 @@
|
||||
"""
|
||||
Embedding provider for the memory system.
|
||||
Uses fastembed (ONNX) for fully local, zero-API-call embeddings.
|
||||
|
||||
Inspired by OpenClaw's src/memory/embeddings.ts, simplified to:
|
||||
• Single provider: fastembed with BAAI/bge-small-en-v1.5 (384-dim)
|
||||
• Local only — no OpenAI/Voyage/Gemini API calls
|
||||
• Thread-safe lazy initialization
|
||||
"""
|
||||
|
||||
import logging
|
||||
import threading
|
||||
|
||||
from memory.internal import normalize_embedding
|
||||
|
||||
logger = logging.getLogger("aetheel.memory.embeddings")
|
||||
|
||||
# The fastembed model is loaded lazily on first use
|
||||
_model_lock = threading.Lock()
|
||||
_model = None
|
||||
_model_name: str | None = None
|
||||
|
||||
|
||||
def _ensure_model(model_name: str = "BAAI/bge-small-en-v1.5"):
    """Lazy-load the fastembed model (thread-safe).

    Caches the loaded model in the module globals ``_model`` /
    ``_model_name``. Requesting a different model name replaces the
    cached instance. Raises ImportError with install instructions when
    fastembed is not available.
    """
    global _model, _model_name

    # Fast path: already loaded with the requested model name.
    if _model is not None and _model_name == model_name:
        return _model

    with _model_lock:
        # Double-check after acquiring lock
        # (another thread may have finished loading while we waited).
        if _model is not None and _model_name == model_name:
            return _model

        try:
            from fastembed import TextEmbedding
        except ImportError:
            raise ImportError(
                "fastembed is required for local embeddings.\n"
                "Install with: uv add fastembed\n"
                "Or: pip install fastembed"
            )

        logger.info(f"Loading embedding model: {model_name}...")
        # Model download/ONNX init happens here; can take a while on first run.
        _model = TextEmbedding(model_name=model_name)
        _model_name = model_name
        logger.info(f"Embedding model loaded: {model_name}")
        return _model
|
||||
|
||||
|
||||
def embed_query(text: str, model_name: str = "BAAI/bge-small-en-v1.5") -> list[float]:
    """
    Embed a single query string.

    Returns an L2-normalized vector (384 dims for the default model),
    or an empty list when the model yields no output.
    """
    model = _ensure_model(model_name)
    # fastembed yields numpy arrays lazily; take the first (and only) one.
    for array in model.query_embed([text]):
        return normalize_embedding(array.tolist())
    return []
|
||||
|
||||
|
||||
def embed_batch(
    texts: list[str],
    model_name: str = "BAAI/bge-small-en-v1.5",
) -> list[list[float]]:
    """
    Embed a batch of passage strings.

    Returns one L2-normalized vector per input text (384 dims for the
    default model); an empty input yields an empty list without
    touching the model.
    """
    if not texts:
        return []
    model = _ensure_model(model_name)
    vectors: list[list[float]] = []
    for array in model.passage_embed(texts):
        vectors.append(normalize_embedding(array.tolist()))
    return vectors
|
||||
|
||||
|
||||
def get_embedding_dims(model_name: str = "BAAI/bge-small-en-v1.5") -> int:
    """Return the output dimensionality for a known model name.

    Unknown model names fall back to 384 (the small-model default).
    """
    if model_name == "BAAI/bge-base-en-v1.5":
        return 768
    if model_name in (
        "BAAI/bge-small-en-v1.5",
        "sentence-transformers/all-MiniLM-L6-v2",
    ):
        return 384
    # Unrecognized model: assume the default small-model dimensionality.
    return 384
|
||||
111
memory/hybrid.py
Normal file
111
memory/hybrid.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
Hybrid search — merges vector similarity + BM25 keyword results.
|
||||
Direct port of OpenClaw's src/memory/hybrid.ts.
|
||||
|
||||
The algorithm:
|
||||
1. Run vector search → ranked by cosine similarity
|
||||
2. Run FTS5 keyword search → ranked by BM25
|
||||
3. Merge by weighted score: 0.7 × vector + 0.3 × keyword
|
||||
4. Deduplicate by chunk ID
|
||||
5. Sort by combined score (descending)
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from memory.types import MemorySearchResult, MemorySource
|
||||
|
||||
|
||||
def build_fts_query(raw: str) -> str | None:
|
||||
"""
|
||||
Build an FTS5 match query from raw text.
|
||||
Port of OpenClaw's buildFtsQuery() — quotes each token
|
||||
and joins with AND for a conjunctive match.
|
||||
|
||||
Example: "hello world" → '"hello" AND "world"'
|
||||
"""
|
||||
tokens = re.findall(r"[A-Za-z0-9_]+", raw)
|
||||
if not tokens:
|
||||
return None
|
||||
quoted = [f'"{t}"' for t in tokens]
|
||||
return " AND ".join(quoted)
|
||||
|
||||
|
||||
def bm25_rank_to_score(rank: float) -> float:
    """
    Map an FTS5 BM25 rank to a score in (0, 1].

    Negative ranks are clamped to zero (mapping to the maximum score
    of 1.0); non-numeric input is treated as a very poor rank.
    """
    if isinstance(rank, (int, float)):
        clamped = rank if rank > 0.0 else 0.0
    else:
        clamped = 999.0
    return 1.0 / (1.0 + clamped)
|
||||
|
||||
|
||||
def merge_hybrid_results(
    vector: list[dict],
    keyword: list[dict],
    vector_weight: float = 0.7,
    text_weight: float = 0.3,
) -> list[MemorySearchResult]:
    """
    Combine vector and keyword hits into one ranked result list.

    Entries are deduplicated by chunk id, scores are blended as
    ``vector_weight * vector_score + text_weight * text_score``, and
    the final list is sorted best-first. When a chunk appears in both
    lists, the keyword snippet (if non-empty) replaces the vector one.

    Vector dicts carry: id, path, start_line, end_line, source, snippet, vector_score.
    Keyword dicts carry: id, path, start_line, end_line, source, snippet, text_score.
    """
    combined: dict[str, dict] = {}

    def _fresh(hit: dict) -> dict:
        # Common fields shared by both result shapes, scores zeroed.
        return {
            "id": hit["id"],
            "path": hit["path"],
            "start_line": hit["start_line"],
            "end_line": hit["end_line"],
            "source": hit["source"],
            "snippet": hit["snippet"],
            "vector_score": 0.0,
            "text_score": 0.0,
        }

    for hit in vector:
        record = _fresh(hit)
        record["vector_score"] = hit.get("vector_score", 0.0)
        combined[hit["id"]] = record

    for hit in keyword:
        record = combined.get(hit["id"])
        if record is None:
            record = _fresh(hit)
            combined[hit["id"]] = record
        elif hit.get("snippet"):
            # Keyword snippets tend to be more on-topic; prefer them.
            record["snippet"] = hit["snippet"]
        record["text_score"] = hit.get("text_score", 0.0)

    ranked: list[MemorySearchResult] = []
    for record in combined.values():
        src = record["source"]
        if isinstance(src, str):
            src = MemorySource(src)
        blended = (
            vector_weight * record["vector_score"]
            + text_weight * record["text_score"]
        )
        ranked.append(MemorySearchResult(
            path=record["path"],
            start_line=record["start_line"],
            end_line=record["end_line"],
            score=blended,
            snippet=record["snippet"],
            source=src,
        ))

    ranked.sort(key=lambda item: item.score, reverse=True)
    return ranked
|
||||
214
memory/internal.py
Normal file
214
memory/internal.py
Normal file
@@ -0,0 +1,214 @@
|
||||
"""
|
||||
Internal utilities for the memory system.
|
||||
Port of OpenClaw's src/memory/internal.ts:
|
||||
• hashText — SHA-256 content hashing
|
||||
• chunkMarkdown — split markdown into overlapping chunks
|
||||
• listMemoryFiles — discover .md files in workspace
|
||||
• buildFileEntry — create MemoryFileEntry from a file
|
||||
• cosineSimilarity — vector similarity calculation
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from memory.types import MemoryChunk, MemoryFileEntry
|
||||
|
||||
|
||||
def hash_text(value: str) -> str:
    """Return the hex SHA-256 digest of *value* (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(value.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
def chunk_markdown(
    content: str,
    chunk_tokens: int = 512,
    chunk_overlap: int = 50,
) -> list[MemoryChunk]:
    """
    Split markdown content into overlapping chunks.
    Direct port of OpenClaw's chunkMarkdown() from internal.ts.

    Uses character-based approximation: ~4 chars per token.

    Each chunk records its 1-indexed start/end line numbers and a
    SHA-256 hash of its text. Consecutive chunks share roughly
    ``chunk_overlap * 4`` characters of trailing context. Lines longer
    than one chunk are split into fixed-size segments that all keep the
    same line number.
    """
    lines = content.split("\n")
    # NOTE(review): str.split("\n") never returns an empty list (empty
    # input gives [""]), so this guard looks unreachable — confirm.
    if not lines:
        return []

    max_chars = max(32, chunk_tokens * 4)
    overlap_chars = max(0, chunk_overlap * 4)
    chunks: list[MemoryChunk] = []

    current: list[tuple[str, int]] = []  # (line_text, 1-indexed line_no)
    current_chars = 0

    def flush() -> None:
        # Emit the accumulated lines as one chunk (does NOT reset
        # `current` — carry_overlap() is responsible for that).
        nonlocal current, current_chars
        if not current:
            return
        text = "\n".join(line for line, _ in current)
        start_line = current[0][1]
        end_line = current[-1][1]
        chunks.append(MemoryChunk(
            start_line=start_line,
            end_line=end_line,
            text=text,
            hash=hash_text(text),
        ))

    def carry_overlap() -> None:
        # Keep the trailing ~overlap_chars worth of lines as the seed of
        # the next chunk, so adjacent chunks share context.
        nonlocal current, current_chars
        if overlap_chars <= 0 or not current:
            current = []
            current_chars = 0
            return
        acc = 0
        kept: list[tuple[str, int]] = []
        # Walk backwards, collecting whole lines until the overlap
        # budget is met (each line costs its length + 1 for the newline).
        for line_text, line_no in reversed(current):
            acc += len(line_text) + 1
            kept.insert(0, (line_text, line_no))
            if acc >= overlap_chars:
                break
        current = kept
        current_chars = sum(len(lt) + 1 for lt, _ in kept)

    for i, line in enumerate(lines):
        line_no = i + 1
        # Handle very long lines by splitting into segments
        # (each segment is at most max_chars and keeps the same line_no).
        segments = [""] if not line else [
            line[start:start + max_chars]
            for start in range(0, len(line), max_chars)
        ]
        for segment in segments:
            line_size = len(segment) + 1
            # Flush before appending when this segment would overflow
            # the chunk budget (never flush an empty chunk).
            if current_chars + line_size > max_chars and current:
                flush()
                carry_overlap()
            current.append((segment, line_no))
            current_chars += line_size

    # Emit whatever remains as the final chunk.
    flush()
    return chunks
|
||||
|
||||
|
||||
def list_memory_files(
    workspace_dir: str,
    extra_paths: list[str] | None = None,
) -> list[str]:
    """
    List all markdown files in the workspace memory directory.
    Port of OpenClaw's listMemoryFiles() from internal.ts.

    Searches for:
    - MEMORY.md (or memory.md) in workspace root
    - SOUL.md and USER.md identity files in workspace root
    - All .md files in memory/ subdirectory (recursive)
    - Any additional paths specified

    Returns absolute paths, deduplicated by os.path.realpath so the
    same file reached via two routes appears once. Symlinks are
    skipped.
    """
    result: list[str] = []
    ws = Path(workspace_dir).expanduser().resolve()

    # Check MEMORY.md and memory.md in workspace root
    for name in ("MEMORY.md", "memory.md"):
        candidate = ws / name
        if candidate.is_file() and not candidate.is_symlink():
            result.append(str(candidate))

    # Check SOUL.md and USER.md (identity files)
    for name in ("SOUL.md", "USER.md"):
        candidate = ws / name
        if candidate.is_file() and not candidate.is_symlink():
            result.append(str(candidate))

    # Walk memory/ subdirectory
    memory_dir = ws / "memory"
    if memory_dir.is_dir() and not memory_dir.is_symlink():
        _walk_md_files(memory_dir, result)

    # Extra paths
    if extra_paths:
        for extra in extra_paths:
            p = Path(extra).expanduser().resolve()
            # NOTE(review): resolve() already follows symlinks, so this
            # is_symlink() check can never trigger here — confirm whether
            # the intent was to check before resolving.
            if p.is_symlink():
                continue
            if p.is_dir():
                _walk_md_files(p, result)
            elif p.is_file() and p.suffix == ".md":
                result.append(str(p))

    # Deduplicate by resolved path (keeps the first spelling seen)
    seen: set[str] = set()
    deduped: list[str] = []
    for entry in result:
        real = os.path.realpath(entry)
        if real not in seen:
            seen.add(real)
            deduped.append(entry)

    return deduped
|
||||
|
||||
|
||||
def _walk_md_files(directory: Path, result: list[str]) -> None:
|
||||
"""Recursively collect .md files from a directory."""
|
||||
try:
|
||||
for entry in sorted(directory.iterdir()):
|
||||
if entry.is_symlink():
|
||||
continue
|
||||
if entry.is_dir():
|
||||
_walk_md_files(entry, result)
|
||||
elif entry.is_file() and entry.suffix == ".md":
|
||||
result.append(str(entry))
|
||||
except PermissionError:
|
||||
pass
|
||||
|
||||
|
||||
def build_file_entry(abs_path: str, workspace_dir: str) -> MemoryFileEntry:
    """
    Build a MemoryFileEntry (relative path, mtime, size, content hash)
    for the file at *abs_path*.

    The entry's `path` is workspace-relative with forward slashes so it
    is stable across platforms.
    """
    info = os.stat(abs_path)
    with open(abs_path, "r", encoding="utf-8") as handle:
        body = handle.read()
    relative = os.path.relpath(abs_path, workspace_dir).replace("\\", "/")
    return MemoryFileEntry(
        path=relative,
        abs_path=abs_path,
        mtime_ms=info.st_mtime * 1000,
        size=info.st_size,
        hash=hash_text(body),
    )
|
||||
|
||||
|
||||
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """
    Cosine similarity of two vectors.

    Returns 0.0 for empty or zero-magnitude input. Vectors of unequal
    length are compared over their common prefix.
    """
    if not a or not b:
        return 0.0
    n = min(len(a), len(b))
    dot = 0.0
    mag_a = 0.0
    mag_b = 0.0
    for idx in range(n):
        x, y = a[idx], b[idx]
        dot += x * y
        mag_a += x * x
        mag_b += y * y
    if mag_a == 0.0 or mag_b == 0.0:
        return 0.0
    return dot / (mag_a ** 0.5 * mag_b ** 0.5)
|
||||
|
||||
|
||||
def normalize_embedding(vec: list[float]) -> list[float]:
    """
    Sanitize and L2-normalize an embedding vector.

    Non-numeric entries and NaNs are replaced with 0.0. When the
    magnitude is near zero the sanitized vector is returned unscaled.
    """
    cleaned: list[float] = []
    for v in vec:
        # `v == v` is False only for NaN.
        is_finite_number = isinstance(v, (int, float)) and v == v
        cleaned.append(v if is_finite_number else 0.0)
    norm = sum(c * c for c in cleaned) ** 0.5
    if norm < 1e-10:
        return cleaned
    return [c / norm for c in cleaned]
|
||||
839
memory/manager.py
Normal file
839
memory/manager.py
Normal file
@@ -0,0 +1,839 @@
|
||||
"""
|
||||
MemoryManager — the main memory system orchestrator.
|
||||
Port of OpenClaw's MemoryIndexManager (src/memory/manager.ts, 2,300 LOC).
|
||||
|
||||
Lifecycle: sync → chunk → embed → store → search
|
||||
|
||||
Key features:
|
||||
• Incremental sync — only re-indexes changed files (hash-based)
|
||||
• Hybrid search — vector (0.7) + BM25 keyword (0.3)
|
||||
• File watching — auto re-index on workspace changes (via watchdog)
|
||||
• Embedding cache — avoids re-computing embeddings for unchanged chunks
|
||||
• Session log indexing — indexes daily/ conversation transcripts
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from memory.embeddings import embed_batch, embed_query, get_embedding_dims
|
||||
from memory.hybrid import bm25_rank_to_score, build_fts_query, merge_hybrid_results
|
||||
from memory.internal import (
|
||||
build_file_entry,
|
||||
chunk_markdown,
|
||||
hash_text,
|
||||
list_memory_files,
|
||||
)
|
||||
from memory.schema import ensure_schema
|
||||
from memory.types import (
|
||||
MemoryConfig,
|
||||
MemorySearchResult,
|
||||
MemorySource,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("aetheel.memory")
|
||||
|
||||
SNIPPET_MAX_CHARS = 700
|
||||
|
||||
|
||||
class MemoryManager:
|
||||
"""
|
||||
Main memory system — manages the full lifecycle:
|
||||
sync → chunk → embed → store → search
|
||||
|
||||
Inspired by OpenClaw's MemoryIndexManager.
|
||||
"""
|
||||
|
||||
def __init__(self, config: MemoryConfig | None = None):
    """Open (or create) the memory store described by *config*.

    Resolves workspace/db/session paths, creates missing directories,
    opens the SQLite database, ensures the schema, and bootstraps the
    default identity files (SOUL.md, USER.md, MEMORY.md).
    """
    self._config = config or MemoryConfig()
    self._workspace_dir = str(
        Path(self._config.workspace_dir).expanduser().resolve()
    )
    self._db_path = str(Path(self._config.db_path).expanduser().resolve())
    # Session logs default to <workspace>/daily when not configured.
    self._sessions_dir = (
        str(Path(self._config.sessions_dir).expanduser().resolve())
        if self._config.sessions_dir
        else os.path.join(self._workspace_dir, "daily")
    )

    # Ensure directories exist
    os.makedirs(self._workspace_dir, exist_ok=True)
    os.makedirs(self._sessions_dir, exist_ok=True)
    os.makedirs(os.path.dirname(self._db_path), exist_ok=True)

    # Open database and ensure schema.
    # NOTE(review): check_same_thread=False shares one connection across
    # threads; only sync() is guarded by _sync_lock — confirm reads from
    # other threads are acceptable.
    self._db = sqlite3.connect(self._db_path, check_same_thread=False)
    self._db.row_factory = sqlite3.Row
    schema_result = ensure_schema(self._db, fts_enabled=True)
    # FTS5 may be missing in some SQLite builds; keyword search is
    # skipped when unavailable.
    self._fts_available = schema_result.get("fts_available", False)

    # State
    self._dirty = True          # index needs a sync before first search
    self._syncing = False       # best-effort re-entrancy guard for sync()
    self._sync_lock = threading.Lock()
    self._closed = False
    self._watcher = None        # file watcher handle (set up elsewhere)

    # Create default identity files if they don't exist
    self._ensure_identity_files()

    logger.info(
        f"MemoryManager initialized: workspace={self._workspace_dir}, "
        f"db={self._db_path}, fts={self._fts_available}"
    )
|
||||
|
||||
# ── Identity File Bootstrap ──────────────────────────────────────
|
||||
|
||||
def _ensure_identity_files(self) -> None:
    """Create default SOUL.md, USER.md, MEMORY.md if they don't exist.

    Existing files are never overwritten — only missing ones are
    written with the default templates below.
    """
    # Default template content for each identity file.
    defaults = {
        "SOUL.md": (
            "# SOUL.md — Who You Are\n\n"
            "_You're not a chatbot. You're becoming someone._\n\n"
            "## Core Truths\n\n"
            "**Be genuinely helpful, not performatively helpful.** "
            "Skip the filler — just help.\n\n"
            "**Have opinions.** You're allowed to disagree, prefer things, "
            "find stuff amusing or boring.\n\n"
            "**Be resourceful before asking.** Try to figure it out first. "
            "Then ask if you're stuck.\n\n"
            "**Earn trust through competence.** Be careful with external actions. "
            "Be bold with internal ones.\n\n"
            "## Boundaries\n\n"
            "- Private things stay private. Period.\n"
            "- When in doubt, ask before acting externally.\n"
            "- Never send half-baked replies.\n\n"
            "## Continuity\n\n"
            "Each session, you wake up fresh. These files _are_ your memory. "
            "Read them. Update them. They're how you persist.\n\n"
            "---\n\n"
            "_This file is yours to evolve. As you learn who you are, update it._\n"
        ),
        "USER.md": (
            "# USER.md — Who I Am\n\n"
            "## About Me\n\n"
            "<!-- Fill in your details -->\n"
            "- **Name:** \n"
            "- **Role:** \n"
            "- **Location:** \n"
            "- **Timezone:** \n\n"
            "## Preferences\n\n"
            "<!-- How you like to communicate -->\n"
            "- **Communication style:** \n"
            "- **Response length:** \n"
            "- **Technical level:** \n\n"
            "## Current Focus\n\n"
            "<!-- What you're working on -->\n\n"
            "## Tools & Services\n\n"
            "<!-- Services you use regularly -->\n\n"
            "---\n\n"
            "_Update this file as your preferences evolve._\n"
        ),
        "MEMORY.md": (
            "# MEMORY.md — Long-Term Memory\n\n"
            "## Decisions & Lessons\n\n"
            "<!-- Record important decisions and lessons learned -->\n\n"
            "## Context\n\n"
            "<!-- Persistent context that should carry across sessions -->\n\n"
            "## Notes\n\n"
            "<!-- Anything worth remembering -->\n\n"
            "---\n\n"
            "_This file persists across sessions. "
            "Update it when you learn something important._\n"
        ),
    }

    for filename, content in defaults.items():
        filepath = os.path.join(self._workspace_dir, filename)
        # Only create missing files; never clobber user edits.
        if not os.path.exists(filepath):
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(content)
            logger.info(f"Created default identity file: {filepath}")
|
||||
|
||||
# ── Search ───────────────────────────────────────────────────────
|
||||
|
||||
async def search(
    self,
    query: str,
    *,
    max_results: int | None = None,
    min_score: float | None = None,
) -> list[MemorySearchResult]:
    """
    Search memory using hybrid vector + keyword search.
    Port of OpenClaw's MemoryIndexManager.search().

    Steps:
    1. (Optional) Trigger sync if dirty
    2. Run FTS5 keyword search → BM25 scored
    3. Generate query embedding → vector search
    4. Merge results with weighted scoring (0.7v + 0.3k)
    5. Filter by min_score and return top-N results

    Args:
        query: free-text query; blank/whitespace returns [].
        max_results: cap on returned results (config default if None).
        min_score: minimum blended score (config default if None).
    """
    # Auto-sync if dirty
    if self._config.sync_on_search and self._dirty:
        await self.sync()

    cleaned = query.strip()
    if not cleaned:
        return []

    max_r = max_results or self._config.max_results
    min_s = min_score if min_score is not None else self._config.min_score
    # Over-fetch (3x, capped at 200) so merging/filtering still leaves
    # enough candidates to fill max_r.
    candidates = min(200, max(1, max_r * 3))

    # Keyword search (BM25)
    keyword_results = self._search_keyword(cleaned, candidates)

    # Vector search — embedding failures degrade to keyword-only.
    try:
        query_vec = embed_query(cleaned, self._config.embedding_model)
        # All-zero vectors carry no signal; skip vector search then.
        has_vector = any(v != 0 for v in query_vec)
    except Exception as e:
        logger.warning(f"Embedding failed, falling back to keyword-only: {e}")
        query_vec = []
        has_vector = False

    vector_results = (
        self._search_vector(query_vec, candidates) if has_vector else []
    )

    # If no keyword results, return vector-only
    if not keyword_results:
        return [
            r for r in self._vector_to_search_results(vector_results)
            if r.score >= min_s
        ][:max_r]

    # Merge hybrid results
    merged = merge_hybrid_results(
        vector=vector_results,
        keyword=keyword_results,
        vector_weight=self._config.vector_weight,
        text_weight=self._config.text_weight,
    )

    return [r for r in merged if r.score >= min_s][:max_r]
|
||||
|
||||
def _search_vector(
    self, query_vec: list[float], limit: int
) -> list[dict]:
    """
    Search chunks by vector cosine similarity.
    Uses embedding stored as JSON in the chunks table.

    Brute-force: loads every chunk row and scores it in Python, so cost
    is O(total chunks) per query — acceptable for small workspaces.
    Returns at most *limit* dicts sorted by vector_score descending.
    """
    if not query_vec:
        return []

    try:
        rows = self._db.execute(
            "SELECT id, path, start_line, end_line, source, text, embedding "
            "FROM chunks ORDER BY rowid"
        ).fetchall()
    except Exception as e:
        logger.warning(f"Vector search failed: {e}")
        return []

    # Local import avoids a circular import at module load time.
    from memory.internal import cosine_similarity

    results = []
    for row in rows:
        try:
            stored_vec = json.loads(row["embedding"])
            # Chunks whose embedding failed are stored as [] — skip them.
            if not stored_vec:
                continue
            score = cosine_similarity(query_vec, stored_vec)
            snippet = row["text"][:SNIPPET_MAX_CHARS]
            results.append({
                "id": row["id"],
                "path": row["path"],
                "start_line": row["start_line"],
                "end_line": row["end_line"],
                "source": row["source"],
                "snippet": snippet,
                # Negative similarity is clamped to 0 so weighted merge
                # scores stay non-negative.
                "vector_score": max(0.0, score),
            })
        except (json.JSONDecodeError, TypeError):
            # Malformed stored embedding — skip the row.
            continue

    results.sort(key=lambda r: r["vector_score"], reverse=True)
    return results[:limit]
|
||||
|
||||
def _search_keyword(self, query: str, limit: int) -> list[dict]:
    """
    Search chunks using FTS5 full-text search with BM25 ranking.
    Port of OpenClaw's searchKeyword().

    Returns at most *limit* dicts in FTS5 rank order, each carrying a
    text_score in (0, 1]. Returns [] when FTS5 is unavailable or the
    query yields no usable tokens.
    """
    if not self._fts_available:
        return []

    fts_query = build_fts_query(query)
    if not fts_query:
        return []

    try:
        rows = self._db.execute(
            "SELECT id, path, start_line, end_line, source, text, "
            "rank AS bm25_rank "
            "FROM chunks_fts "
            "WHERE chunks_fts MATCH ? "
            "ORDER BY rank "
            "LIMIT ?",
            (fts_query, limit),
        ).fetchall()
    except Exception as e:
        # A malformed MATCH expression raises OperationalError; degrade
        # to no keyword results rather than failing the search.
        logger.debug(f"FTS search failed for query '{fts_query}': {e}")
        return []

    results = []
    for row in rows:
        # FTS5 rank is negative (lower = better), convert to 0-1 score
        # NOTE(review): abs() maps a strongly-matching rank like -10 to
        # a LOWER score than a weak -1 — this appears to invert BM25
        # ordering relative to SQLite's "smaller rank = better" contract;
        # confirm against OpenClaw's bm25RankToScore usage.
        bm25_rank = abs(row["bm25_rank"]) if row["bm25_rank"] else 999.0
        text_score = bm25_rank_to_score(bm25_rank)
        snippet = row["text"][:SNIPPET_MAX_CHARS]
        results.append({
            "id": row["id"],
            "path": row["path"],
            "start_line": row["start_line"],
            "end_line": row["end_line"],
            "source": row["source"],
            "snippet": snippet,
            "text_score": text_score,
        })

    return results
|
||||
|
||||
def _vector_to_search_results(
    self, vector_results: list[dict]
) -> list[MemorySearchResult]:
    """Wrap raw vector-search dicts as MemorySearchResult objects."""
    converted: list[MemorySearchResult] = []
    for raw in vector_results:
        converted.append(
            MemorySearchResult(
                path=raw["path"],
                start_line=raw["start_line"],
                end_line=raw["end_line"],
                score=raw["vector_score"],
                snippet=raw["snippet"],
                source=MemorySource(raw["source"]),
            )
        )
    return converted
|
||||
|
||||
# ── Sync ─────────────────────────────────────────────────────────
|
||||
|
||||
async def sync(self, *, force: bool = False) -> dict:
    """
    Synchronize workspace markdown files into the index.
    Port of OpenClaw's MemoryIndexManager.sync().

    Steps:
    1. List all memory files (SOUL.md, USER.md, MEMORY.md, memory/*)
    2. For each file, check if content hash has changed
    3. If changed: chunk → embed → store in DB
    4. Remove stale entries for deleted files
    5. Optionally sync session logs from daily/

    Returns a summary dict with counts, or {"skipped": True} when a
    sync is already running and force is False.
    """
    # Best-effort re-entrancy check (unlocked read of _syncing).
    if self._syncing and not force:
        logger.debug("Sync already in progress, skipping")
        return {"skipped": True}

    # NOTE(review): the lock is held across blocking file/DB work inside
    # an async method, which blocks the event loop — confirm acceptable.
    with self._sync_lock:
        self._syncing = True
        try:
            return self._run_sync(force=force)
        finally:
            self._syncing = False
            # NOTE(review): _dirty is cleared even when _run_sync raised,
            # so a failed sync will not be retried on the next search —
            # confirm this is intended.
            self._dirty = False
|
||||
|
||||
def _run_sync(self, *, force: bool = False) -> dict:
    """Execute the actual sync logic.

    Called only while holding self._sync_lock. When *force* is True,
    files are re-indexed even if their content hash is unchanged.
    Returns a stats dict.
    """
    stats = {
        "files_found": 0,
        "files_indexed": 0,
        "files_skipped": 0,
        "chunks_created": 0,   # NOTE(review): actually the TOTAL chunk
                               # count after sync, not newly created ones
        "stale_removed": 0,
        "sessions_indexed": 0,
    }

    # ── Memory files ──
    if "memory" in self._config.sources:
        files = list_memory_files(self._workspace_dir)
        stats["files_found"] = len(files)

        active_paths: set[str] = set()

        for abs_path in files:
            entry = build_file_entry(abs_path, self._workspace_dir)
            active_paths.add(entry.path)

            # Check if file has changed (content-hash comparison)
            row = self._db.execute(
                "SELECT hash FROM files WHERE path = ? AND source = ?",
                (entry.path, MemorySource.MEMORY.value),
            ).fetchone()

            if not force and row and row["hash"] == entry.hash:
                stats["files_skipped"] += 1
                continue

            # File is new or changed — re-index it
            self._index_file(entry, MemorySource.MEMORY)
            stats["files_indexed"] += 1

        # Remove stale entries for deleted files
        stale_rows = self._db.execute(
            "SELECT path FROM files WHERE source = ?",
            (MemorySource.MEMORY.value,),
        ).fetchall()
        for stale in stale_rows:
            if stale["path"] not in active_paths:
                self._remove_file(stale["path"], MemorySource.MEMORY)
                stats["stale_removed"] += 1

    # ── Session files ──
    if "sessions" in self._config.sources:
        session_count = self._sync_session_files(force=force)
        stats["sessions_indexed"] = session_count

    # Count total chunks
    row = self._db.execute("SELECT COUNT(*) as c FROM chunks").fetchone()
    stats["chunks_created"] = row["c"] if row else 0

    # Single commit for the whole sync pass.
    self._db.commit()

    logger.info(
        f"Sync complete: {stats['files_indexed']} indexed, "
        f"{stats['files_skipped']} unchanged, "
        f"{stats['stale_removed']} removed, "
        f"{stats['chunks_created']} total chunks"
    )
    return stats
|
||||
|
||||
def _index_file(self, entry, source: MemorySource) -> None:
    """
    Index a single file: read → chunk → embed → store.
    Port of OpenClaw's indexFile method.

    *entry* is a MemoryFileEntry (see memory.internal.build_file_entry).
    Unreadable or empty files are skipped silently. Embeddings are
    looked up in the embedding_cache table by (model, chunk hash)
    before computing; chunks whose embedding fails are stored with an
    empty embedding so keyword search still covers them.
    """
    try:
        with open(entry.abs_path, "r", encoding="utf-8") as f:
            content = f.read()
    except Exception as e:
        logger.warning(f"Failed to read {entry.abs_path}: {e}")
        return

    if not content.strip():
        return

    # Chunk the content
    chunks = chunk_markdown(
        content,
        chunk_tokens=self._config.chunk_tokens,
        chunk_overlap=self._config.chunk_overlap,
    )

    if not chunks:
        return

    # Check embedding cache and compute new embeddings
    texts_to_embed = []     # chunk texts that need fresh embeddings
    chunk_hashes = []       # parallel list of their content hashes
    cached_embeddings: dict[str, list[float]] = {}

    for chunk in chunks:
        # Check cache first — keyed by (model, content hash)
        cache_row = self._db.execute(
            "SELECT embedding FROM embedding_cache WHERE model = ? AND hash = ?",
            (self._config.embedding_model, chunk.hash),
        ).fetchone()

        if cache_row:
            cached_embeddings[chunk.hash] = json.loads(cache_row["embedding"])
        else:
            texts_to_embed.append(chunk.text)
            chunk_hashes.append(chunk.hash)

    # Batch embed uncached chunks
    new_embeddings: dict[str, list[float]] = {}
    if texts_to_embed:
        try:
            vectors = embed_batch(texts_to_embed, self._config.embedding_model)
            now = int(time.time())
            for i, chunk_hash in enumerate(chunk_hashes):
                # Guard against the model returning fewer vectors than inputs.
                vec = vectors[i] if i < len(vectors) else []
                new_embeddings[chunk_hash] = vec
                # Store in cache for future syncs of identical content
                self._db.execute(
                    "INSERT OR REPLACE INTO embedding_cache "
                    "(model, hash, embedding, dims, updated_at) "
                    "VALUES (?, ?, ?, ?, ?)",
                    (
                        self._config.embedding_model,
                        chunk_hash,
                        json.dumps(vec),
                        len(vec),
                        now,
                    ),
                )
        except Exception as e:
            logger.warning(f"Embedding batch failed for {entry.path}: {e}")
            # Fall back to empty embeddings — keyword search still works.
            for chunk_hash in chunk_hashes:
                new_embeddings[chunk_hash] = []

    # Remove old chunks for this file (also cleans FTS rows)
    self._remove_file_chunks(entry.path, source)

    # Insert new chunks
    now = int(time.time())
    for chunk in chunks:
        chunk_id = str(uuid.uuid4())
        embedding = cached_embeddings.get(chunk.hash) or new_embeddings.get(
            chunk.hash, []
        )

        self._db.execute(
            "INSERT INTO chunks "
            "(id, path, source, start_line, end_line, hash, model, text, embedding, updated_at) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (
                chunk_id,
                entry.path,
                source.value,
                chunk.start_line,
                chunk.end_line,
                chunk.hash,
                self._config.embedding_model,
                chunk.text,
                json.dumps(embedding),
                now,
            ),
        )

        # Insert into FTS index (best-effort; failure only degrades
        # keyword search for this chunk)
        if self._fts_available:
            try:
                self._db.execute(
                    "INSERT INTO chunks_fts "
                    "(text, id, path, source, model, start_line, end_line) "
                    "VALUES (?, ?, ?, ?, ?, ?, ?)",
                    (
                        chunk.text,
                        chunk_id,
                        entry.path,
                        source.value,
                        self._config.embedding_model,
                        chunk.start_line,
                        chunk.end_line,
                    ),
                )
            except Exception as e:
                logger.debug(f"FTS insert failed for chunk {chunk_id}: {e}")

    # Update files table so the next sync can skip unchanged content
    self._db.execute(
        "INSERT OR REPLACE INTO files (path, source, hash, mtime, size) "
        "VALUES (?, ?, ?, ?, ?)",
        (
            entry.path,
            source.value,
            entry.hash,
            int(entry.mtime_ms),
            entry.size,
        ),
    )
|
||||
|
||||
def _remove_file_chunks(self, path: str, source: MemorySource) -> None:
    """Delete every indexed chunk (plus any FTS rows) belonging to one file."""
    # FTS rows carry their own copy of the data, so they are removed
    # explicitly, keyed by chunk id, before the chunks themselves.
    if self._fts_available:
        rows = self._db.execute(
            "SELECT id FROM chunks WHERE path = ? AND source = ?",
            (path, source.value),
        ).fetchall()
        for r in rows:
            try:
                self._db.execute(
                    "DELETE FROM chunks_fts WHERE id = ?", (r["id"],)
                )
            except Exception:
                # Best-effort cleanup: a stale FTS row is harmless.
                pass

    self._db.execute(
        "DELETE FROM chunks WHERE path = ? AND source = ?",
        (path, source.value),
    )
|
||||
|
||||
def _remove_file(self, path: str, source: MemorySource) -> None:
    """Drop a file from the index: chunks/FTS first, then its files-table row."""
    self._remove_file_chunks(path, source)
    self._db.execute(
        "DELETE FROM files WHERE path = ? AND source = ?",
        (path, source.value),
    )
|
||||
|
||||
# ── Session Logs ─────────────────────────────────────────────────
|
||||
|
||||
def _sync_session_files(self, *, force: bool = False) -> int:
    """
    Incrementally index session transcripts from the daily/ directory.

    Files whose stored content hash is unchanged are skipped unless
    *force* is set; index entries whose backing file disappeared are
    purged. Returns the number of session files (re)indexed.
    """
    root = Path(self._sessions_dir)
    if not root.is_dir():
        return 0

    seen: set[str] = set()
    count = 0

    for candidate in sorted(root.glob("*.md")):
        # Skip symlinks so content outside the directory is never indexed.
        if candidate.is_symlink() or not candidate.is_file():
            continue
        entry = build_file_entry(str(candidate), self._workspace_dir)
        seen.add(entry.path)

        prior = self._db.execute(
            "SELECT hash FROM files WHERE path = ? AND source = ?",
            (entry.path, MemorySource.SESSIONS.value),
        ).fetchone()
        unchanged = prior is not None and prior["hash"] == entry.hash
        if unchanged and not force:
            continue

        self._index_file(entry, MemorySource.SESSIONS)
        count += 1

    # Purge index entries whose backing file no longer exists on disk.
    stale = self._db.execute(
        "SELECT path FROM files WHERE source = ?",
        (MemorySource.SESSIONS.value,),
    ).fetchall()
    for row in stale:
        if row["path"] not in seen:
            self._remove_file(row["path"], MemorySource.SESSIONS)

    return count
|
||||
|
||||
def log_session(
|
||||
self,
|
||||
content: str,
|
||||
*,
|
||||
date: str | None = None,
|
||||
channel: str = "slack",
|
||||
) -> str:
|
||||
"""
|
||||
Append to today's session log in daily/.
|
||||
|
||||
Args:
|
||||
content: The text to log (e.g., a user message or AI response).
|
||||
date: Optional date string (YYYY-MM-DD). Defaults to today.
|
||||
channel: Channel the conversation came from.
|
||||
|
||||
Returns:
|
||||
Path to the session log file.
|
||||
"""
|
||||
if date is None:
|
||||
date = time.strftime("%Y-%m-%d")
|
||||
|
||||
log_path = os.path.join(self._sessions_dir, f"{date}.md")
|
||||
|
||||
# Create file with header if it doesn't exist
|
||||
if not os.path.exists(log_path):
|
||||
header = f"# Session Log — {date}\n\n"
|
||||
with open(log_path, "w", encoding="utf-8") as f:
|
||||
f.write(header)
|
||||
|
||||
# Append the content
|
||||
timestamp = time.strftime("%H:%M:%S")
|
||||
with open(log_path, "a", encoding="utf-8") as f:
|
||||
f.write(f"\n---\n\n**[{timestamp}] ({channel})**\n\n{content}\n")
|
||||
|
||||
# Mark as dirty for next sync
|
||||
self._dirty = True
|
||||
|
||||
return log_path
|
||||
|
||||
# ── Identity File Access ─────────────────────────────────────────
|
||||
|
||||
def read_identity_file(self, name: str) -> str | None:
|
||||
"""Read an identity file (SOUL.md, USER.md, MEMORY.md)."""
|
||||
filepath = os.path.join(self._workspace_dir, name)
|
||||
if not os.path.isfile(filepath):
|
||||
return None
|
||||
with open(filepath, "r", encoding="utf-8") as f:
|
||||
return f.read()
|
||||
|
||||
def update_identity_file(self, name: str, content: str) -> None:
    """Overwrite an identity file and flag the index for re-sync."""
    target = os.path.join(self._workspace_dir, name)
    with open(target, "w", encoding="utf-8") as fh:
        fh.write(content)
    self._dirty = True
    logger.info(f"Updated identity file: {name}")
|
||||
|
||||
def read_soul(self) -> str | None:
|
||||
return self.read_identity_file("SOUL.md")
|
||||
|
||||
def read_user(self) -> str | None:
|
||||
return self.read_identity_file("USER.md")
|
||||
|
||||
def read_long_term_memory(self) -> str | None:
|
||||
return self.read_identity_file("MEMORY.md")
|
||||
|
||||
def append_to_memory(self, entry: str) -> None:
    """Append a timestamped entry to MEMORY.md and flag the index dirty."""
    target = os.path.join(self._workspace_dir, "MEMORY.md")
    stamp = time.strftime("%Y-%m-%d %H:%M")
    with open(target, "a", encoding="utf-8") as fh:
        fh.write(f"\n### [{stamp}]\n\n{entry}\n")
    self._dirty = True
    logger.info("Appended to MEMORY.md")
|
||||
|
||||
# ── File Reading ─────────────────────────────────────────────────
|
||||
|
||||
def read_file(
|
||||
self,
|
||||
rel_path: str,
|
||||
*,
|
||||
from_line: int | None = None,
|
||||
num_lines: int | None = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Read a memory file by relative path.
|
||||
Port of OpenClaw's readFile().
|
||||
"""
|
||||
raw = rel_path.strip()
|
||||
if not raw:
|
||||
raise ValueError("path required")
|
||||
|
||||
if os.path.isabs(raw):
|
||||
abs_path = os.path.realpath(raw)
|
||||
else:
|
||||
abs_path = os.path.realpath(
|
||||
os.path.join(self._workspace_dir, raw)
|
||||
)
|
||||
|
||||
if not abs_path.endswith(".md"):
|
||||
raise ValueError("Only .md files are supported")
|
||||
|
||||
if not os.path.isfile(abs_path):
|
||||
raise FileNotFoundError(f"File not found: {abs_path}")
|
||||
|
||||
with open(abs_path, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
if from_line is None and num_lines is None:
|
||||
return {"text": content, "path": rel_path}
|
||||
|
||||
lines = content.split("\n")
|
||||
start = max(1, from_line or 1)
|
||||
count = max(1, num_lines or len(lines))
|
||||
sliced = lines[start - 1 : start - 1 + count]
|
||||
return {"text": "\n".join(sliced), "path": rel_path}
|
||||
|
||||
# ── Status ───────────────────────────────────────────────────────
|
||||
|
||||
def status(self) -> dict:
    """Report index statistics plus the effective search/embedding settings."""

    def _count(sql: str) -> int:
        # All three stats queries share the same shape: a single COUNT row.
        row = self._db.execute(sql).fetchone()
        return row["c"] if row else 0

    return {
        "workspace_dir": self._workspace_dir,
        "db_path": self._db_path,
        "sessions_dir": self._sessions_dir,
        "files": _count("SELECT COUNT(*) as c FROM files"),
        "chunks": _count("SELECT COUNT(*) as c FROM chunks"),
        "cached_embeddings": _count("SELECT COUNT(*) as c FROM embedding_cache"),
        "fts_available": self._fts_available,
        "dirty": self._dirty,
        "embedding_model": self._config.embedding_model,
        "embedding_dims": get_embedding_dims(self._config.embedding_model),
        "vector_weight": self._config.vector_weight,
        "text_weight": self._config.text_weight,
    }
|
||||
|
||||
# ── File Watching ────────────────────────────────────────────────
|
||||
|
||||
def start_watching(self) -> None:
    """
    Start watching the workspace for file changes.
    Uses watchdog for cross-platform file system events.

    No-op when a watcher is already running or watching is disabled in
    the config; degrades gracefully (warning only) if watchdog is not
    installed.
    """
    if self._watcher or not self._config.watch:
        return

    # watchdog is an optional dependency — imported lazily so the rest of
    # the memory system works without it.
    try:
        from watchdog.events import FileSystemEventHandler
        from watchdog.observers import Observer
    except ImportError:
        logger.warning(
            "watchdog not installed — file watching disabled. "
            "Install with: uv add watchdog"
        )
        return

    # Captured by the handler closure below (its `self` is the event handler).
    manager = self

    class MemoryFileHandler(FileSystemEventHandler):
        def on_any_event(self, event):
            if event.is_directory:
                return
            src = getattr(event, "src_path", "")
            # Only markdown files feed the index; everything else is noise.
            if src.endswith(".md"):
                # Just mark dirty — actual re-indexing happens on next sync().
                manager._dirty = True
                logger.debug(f"Workspace change detected: {src}")

    observer = Observer()
    handler = MemoryFileHandler()
    observer.schedule(handler, self._workspace_dir, recursive=True)
    observer.start()
    self._watcher = observer
    logger.info(f"File watching started: {self._workspace_dir}")
|
||||
|
||||
def stop_watching(self) -> None:
    """Stop and join the watchdog observer, if one is running."""
    if not self._watcher:
        return
    self._watcher.stop()
    self._watcher.join()
    self._watcher = None
    logger.info("File watching stopped")
|
||||
|
||||
# ── Lifecycle ────────────────────────────────────────────────────
|
||||
|
||||
def close(self) -> None:
    """Release all resources (watcher, SQLite connection). Idempotent."""
    if self._closed:
        return
    # Flip the flag first so re-entrant calls are no-ops.
    self._closed = True
    self.stop_watching()
    self._db.close()
    logger.info("MemoryManager closed")
|
||||
|
||||
def __enter__(self):
    """Context-manager entry: the manager itself is the managed resource."""
    return self
|
||||
|
||||
def __exit__(self, *args):
    """Context-manager exit: always close; never suppresses exceptions."""
    self.close()
|
||||
|
||||
def __del__(self):
    """
    Best-effort cleanup at garbage collection.

    Uses getattr because __del__ can run on a half-constructed instance
    (e.g. when __init__ raised before assigning _closed); the original
    bare `self._closed` access would surface an AttributeError on stderr
    in that case.
    """
    if not getattr(self, "_closed", True):
        try:
            self.close()
        except Exception:
            # Never raise from a finalizer.
            pass
|
||||
124
memory/schema.py
Normal file
124
memory/schema.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""
|
||||
SQLite schema for the memory system.
|
||||
Port of OpenClaw's src/memory/memory-schema.ts.
|
||||
|
||||
Tables:
|
||||
• meta — key-value store for index metadata
|
||||
• files — tracked files with content hashes (for incremental sync)
|
||||
• chunks — text chunks with embeddings
|
||||
• chunks_fts — FTS5 virtual table for keyword/BM25 search
|
||||
• chunks_vec — sqlite-vec virtual table for vector similarity (optional)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
|
||||
logger = logging.getLogger("aetheel.memory.schema")
|
||||
|
||||
|
||||
def ensure_schema(
    db: sqlite3.Connection,
    *,
    fts_enabled: bool = True,
) -> dict:
    """
    Create all required tables if they don't exist.

    Args:
        db: An open SQLite connection.
        fts_enabled: Attempt to create the FTS5 virtual table when True.

    Returns:
        {"fts_available": bool}, plus "fts_error" when FTS5 setup failed.
    """
    # Core tables, created idempotently.
    core_ddl = (
        # meta — key-value store for index metadata (model, dimensions, ...)
        """
        CREATE TABLE IF NOT EXISTS meta (
            key TEXT PRIMARY KEY,
            value TEXT NOT NULL
        )
        """,
        # files — which files have been indexed, keyed by (path, source),
        # with a content hash for incremental sync.
        """
        CREATE TABLE IF NOT EXISTS files (
            path TEXT NOT NULL,
            source TEXT NOT NULL DEFAULT 'memory',
            hash TEXT NOT NULL,
            mtime INTEGER NOT NULL,
            size INTEGER NOT NULL,
            PRIMARY KEY (path, source)
        )
        """,
        # chunks — text chunks plus their (text-encoded) embeddings.
        """
        CREATE TABLE IF NOT EXISTS chunks (
            id TEXT PRIMARY KEY,
            path TEXT NOT NULL,
            source TEXT NOT NULL DEFAULT 'memory',
            start_line INTEGER NOT NULL,
            end_line INTEGER NOT NULL,
            hash TEXT NOT NULL,
            model TEXT NOT NULL,
            text TEXT NOT NULL,
            embedding TEXT NOT NULL,
            updated_at INTEGER NOT NULL
        )
        """,
    )
    for ddl in core_ddl:
        db.execute(ddl)

    # Lookup indices for the chunks table.
    db.execute("CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path)")
    db.execute("CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source)")
    db.execute("CREATE INDEX IF NOT EXISTS idx_chunks_hash ON chunks(hash)")

    # FTS5 virtual table for keyword/BM25 matching — optional, since some
    # SQLite builds lack the FTS5 extension.
    fts_available = False
    fts_error = None
    if fts_enabled:
        try:
            db.execute("""
                CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
                    text,
                    id UNINDEXED,
                    path UNINDEXED,
                    source UNINDEXED,
                    model UNINDEXED,
                    start_line UNINDEXED,
                    end_line UNINDEXED
                )
            """)
            fts_available = True
        except Exception as e:
            fts_error = str(e)
            logger.warning(f"FTS5 unavailable: {fts_error}")

    # embedding_cache — memoizes embeddings keyed by (model, content hash).
    db.execute("""
        CREATE TABLE IF NOT EXISTS embedding_cache (
            model TEXT NOT NULL,
            hash TEXT NOT NULL,
            embedding TEXT NOT NULL,
            dims INTEGER,
            updated_at INTEGER NOT NULL,
            PRIMARY KEY (model, hash)
        )
    """)
    db.execute(
        "CREATE INDEX IF NOT EXISTS idx_embedding_cache_updated_at "
        "ON embedding_cache(updated_at)"
    )

    # session_logs — one transcript row per (day, channel).
    db.execute("""
        CREATE TABLE IF NOT EXISTS session_logs (
            session_date TEXT NOT NULL,
            channel TEXT NOT NULL DEFAULT 'slack',
            user_id TEXT,
            summary TEXT,
            raw_transcript TEXT,
            created_at INTEGER NOT NULL,
            PRIMARY KEY (session_date, channel)
        )
    """)

    db.commit()

    result = {"fts_available": fts_available}
    if fts_error:
        result["fts_error"] = fts_error
    return result
|
||||
104
memory/types.py
Normal file
104
memory/types.py
Normal file
@@ -0,0 +1,104 @@
|
||||
"""
|
||||
Memory system types — mirrors OpenClaw's src/memory/types.ts.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class MemorySource(str, Enum):
    """Source of a memory entry — either workspace markdown or session logs."""
    # Workspace markdown files (SOUL.md, USER.md, MEMORY.md, ...).
    MEMORY = "memory"
    # Daily session transcript files.
    SESSIONS = "sessions"
|
||||
|
||||
|
||||
@dataclass
class MemorySearchResult:
    """
    A single search result from the memory system.
    Mirrors OpenClaw's MemorySearchResult type.
    """
    # Path of the file the snippet came from.
    path: str
    # Line range of the matching chunk within that file.
    start_line: int
    end_line: int
    # Relevance score; higher is better. Presumably the hybrid
    # vector/keyword blend (see MemoryConfig weights) — confirm in search code.
    score: float
    # Text of the matching chunk.
    snippet: str
    # Whether the hit came from workspace markdown or a session log.
    source: MemorySource
    # Optional preformatted citation string; None when not provided.
    citation: str | None = None
|
||||
|
||||
|
||||
@dataclass
class MemoryChunk:
    """
    A chunk of text extracted from a markdown file.
    Mirrors OpenClaw's MemoryChunk from internal.ts.
    """
    # Line span of the chunk within its source file.
    start_line: int
    end_line: int
    # Raw chunk text.
    text: str
    # Content hash of the chunk — used as the embedding-cache key.
    hash: str
|
||||
|
||||
|
||||
@dataclass
class MemoryFileEntry:
    """
    Metadata about an indexed markdown file.
    Mirrors OpenClaw's MemoryFileEntry from internal.ts.

    Produced by build_file_entry() and compared against the files table
    (by hash) to decide whether a file needs re-indexing.
    """
    path: str  # relative path within workspace
    abs_path: str  # absolute filesystem path
    mtime_ms: float  # modification time (ms since epoch)
    size: int  # file size in bytes
    hash: str  # SHA-256 of file content
|
||||
|
||||
|
||||
@dataclass
class SessionFileEntry:
    """
    Metadata about an indexed session transcript file.
    Mirrors OpenClaw's SessionFileEntry from session-files.ts.
    """
    path: str  # relative path (sessions/<filename>)
    abs_path: str  # absolute filesystem path
    mtime_ms: float  # modification time (ms since epoch)
    size: int  # file size in bytes
    hash: str  # content hash — presumably SHA-256 like MemoryFileEntry; confirm
    content: str  # extracted text content
    # Presumably maps extracted-content lines back to original file lines —
    # TODO confirm against the session-file parser.
    line_map: list[int] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class MemoryConfig:
    """
    Configuration for the memory system.

    Defaults give a 0.7/0.3 vector-vs-keyword blend over 384-dim
    embeddings from the local fastembed model.
    """
    # Workspace directory containing SOUL.md, USER.md, MEMORY.md, etc.
    workspace_dir: str = "~/.aetheel/workspace"

    # SQLite database path (created automatically)
    db_path: str = "~/.aetheel/memory.db"

    # Chunking — target tokens per chunk and overlap between neighbors.
    chunk_tokens: int = 512
    chunk_overlap: int = 50

    # Search — result cap, score floor, and hybrid blend weights.
    max_results: int = 10
    min_score: float = 0.1
    vector_weight: float = 0.7
    text_weight: float = 0.3

    # Embedding model name and its output dimensionality.
    embedding_model: str = "BAAI/bge-small-en-v1.5"
    embedding_dims: int = 384

    # Sync — enable file watching, debounce window, and re-sync on search.
    watch: bool = True
    watch_debounce_ms: int = 2000
    sync_on_search: bool = True

    # Session logs directory; None means workspace_dir/daily/.
    sessions_dir: str | None = None

    # Which sources to index (values of MemorySource).
    sources: list[str] = field(default_factory=lambda: ["memory", "sessions"])
|
||||
Reference in New Issue
Block a user