first commit
This commit is contained in:
111
memory/hybrid.py
Normal file
111
memory/hybrid.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
Hybrid search — merges vector similarity + BM25 keyword results.
|
||||
Direct port of OpenClaw's src/memory/hybrid.ts.
|
||||
|
||||
The algorithm:
|
||||
1. Run vector search → ranked by cosine similarity
|
||||
2. Run FTS5 keyword search → ranked by BM25
|
||||
3. Merge by weighted score: 0.7 × vector + 0.3 × keyword
|
||||
4. Deduplicate by chunk ID
|
||||
5. Sort by combined score (descending)
|
||||
"""
|
||||
|
||||
import math
import re
|
||||
|
||||
from memory.types import MemorySearchResult, MemorySource
|
||||
|
||||
|
||||
def build_fts_query(raw: str) -> str | None:
|
||||
"""
|
||||
Build an FTS5 match query from raw text.
|
||||
Port of OpenClaw's buildFtsQuery() — quotes each token
|
||||
and joins with AND for a conjunctive match.
|
||||
|
||||
Example: "hello world" → '"hello" AND "world"'
|
||||
"""
|
||||
tokens = re.findall(r"[A-Za-z0-9_]+", raw)
|
||||
if not tokens:
|
||||
return None
|
||||
quoted = [f'"{t}"' for t in tokens]
|
||||
return " AND ".join(quoted)
|
||||
|
||||
|
||||
def bm25_rank_to_score(rank: float) -> float:
    """
    Convert FTS5 BM25 rank (negative = better) to a 0-1 score.

    Mapping:
      * negative rank: relevance = -rank, score = relevance / (1 + relevance),
        so a more negative (better) rank yields a higher score;
      * zero or positive rank: score = 1 / (1 + rank);
      * non-numeric, NaN or infinite rank: treated as rank 999, giving a
        score of 1 / 1000.

    Args:
        rank: The rank value reported for a keyword hit.

    Returns:
        A score in the range 0-1.
    """
    import math

    # Non-numbers, NaN and +/-inf carry no usable ordering information;
    # give them a fixed, very low score rather than letting them reach the top.
    if not isinstance(rank, (int, float)) or not math.isfinite(rank):
        return 1.0 / (1.0 + 999.0)
    if rank < 0:
        # Clamping negatives to zero would give every hit the same score of
        # 1.0; use the magnitude instead so better ranks keep scoring higher.
        relevance = -rank
        return relevance / (1.0 + relevance)
    return 1.0 / (1.0 + rank)
|
||||
|
||||
|
||||
def merge_hybrid_results(
    vector: list[dict],
    keyword: list[dict],
    vector_weight: float = 0.7,
    text_weight: float = 0.3,
) -> list[MemorySearchResult]:
    """
    Combine vector and keyword hits into one score-ordered result list.

    Hits are keyed by their "id". A chunk found by both searches becomes a
    single entry carrying both scores; a chunk found by only one search
    gets 0.0 for the missing score. The final score of each entry is
    ``vector_weight * vector_score + text_weight * text_score``.

    Args:
        vector: Hits with keys id, path, start_line, end_line, source,
            snippet and (optionally) vector_score.
        keyword: Hits with keys id, path, start_line, end_line, source,
            snippet and (optionally) text_score.
        vector_weight: Multiplier applied to the vector score.
        text_weight: Multiplier applied to the keyword score.

    Returns:
        MemorySearchResult objects sorted by score, highest first.
    """
    shared_fields = ("id", "path", "start_line", "end_line", "source", "snippet")
    combined: dict[str, dict] = {}

    # Seed the table from the vector hits; no keyword score is known yet.
    for hit in vector:
        record = {field: hit[field] for field in shared_fields}
        record["vector_score"] = hit.get("vector_score", 0.0)
        record["text_score"] = 0.0
        combined[hit["id"]] = record

    # Fold in keyword hits: new ids get their own entry, known ids are updated.
    for hit in keyword:
        record = combined.get(hit["id"])
        if not record:
            fresh = {field: hit[field] for field in shared_fields}
            fresh["vector_score"] = 0.0
            fresh["text_score"] = hit.get("text_score", 0.0)
            combined[hit["id"]] = fresh
            continue
        record["text_score"] = hit.get("text_score", 0.0)
        # A non-empty keyword snippet replaces the vector one.
        if hit.get("snippet"):
            record["snippet"] = hit["snippet"]

    # Score each entry and wrap it in the result type.
    results: list[MemorySearchResult] = []
    for record in combined.values():
        weighted = (
            vector_weight * record["vector_score"]
            + text_weight * record["text_score"]
        )
        origin = record["source"]
        # Plain-string sources are converted to the MemorySource type.
        if isinstance(origin, str):
            origin = MemorySource(origin)
        results.append(
            MemorySearchResult(
                path=record["path"],
                start_line=record["start_line"],
                end_line=record["end_line"],
                score=weighted,
                snippet=record["snippet"],
                source=origin,
            )
        )

    # Highest combined score first.
    return sorted(results, key=lambda item: item.score, reverse=True)
|
||||
Reference in New Issue
Block a user