# Aetheel/memory/hybrid.py

"""
Hybrid search — merges vector similarity + BM25 keyword results.
Direct port of OpenClaw's src/memory/hybrid.ts.

The algorithm:
  1. Run vector search → ranked by cosine similarity
  2. Run FTS5 keyword search → ranked by BM25
  3. Merge by weighted score: 0.7 × vector + 0.3 × keyword
  4. Deduplicate by chunk ID
  5. Sort by combined score (descending)
"""

import math
import re

from memory.types import MemorySearchResult, MemorySource


def build_fts_query(raw: str) -> str | None:
    """
    Build an FTS5 match query from raw text.
    Port of OpenClaw's buildFtsQuery() — quotes each token
    and joins with AND for a conjunctive match.

    Example: "hello world" → '"hello" AND "world"'
    """
    tokens = re.findall(r"[A-Za-z0-9_]+", raw)
    if not tokens:
        return None
    quoted = [f'"{t}"' for t in tokens]
    return " AND ".join(quoted)


def bm25_rank_to_score(rank: float) -> float:
    """
    Convert an FTS5 BM25 rank to a score in the range (0, 1].

    The rank is clamped to be non-negative and mapped through
    ``1 / (1 + rank)``, so a rank of 0 (or any negative rank) yields 1.0
    and larger positive ranks yield progressively smaller scores.

    A rank that is not a finite number (NaN, ±infinity, None, or any
    non-numeric value) is treated as rank 999, giving a score of 0.001.

    Args:
        rank: Raw rank value reported by the keyword search.

    Returns:
        The normalized score.
    """
    # isinstance() alone lets NaN/inf through: max(0.0, nan) evaluates to
    # 0.0, which would hand an unusable rank the best possible score.
    if isinstance(rank, (int, float)) and math.isfinite(rank):
        # NOTE: every negative rank collapses to 0 here, i.e. score 1.0.
        normalized = max(0.0, rank)
    else:
        normalized = 999.0
    return 1.0 / (1.0 + normalized)


def merge_hybrid_results(
    vector: list[dict],
    keyword: list[dict],
    vector_weight: float = 0.7,
    text_weight: float = 0.3,
) -> list[MemorySearchResult]:
    """
    Combine vector and keyword hits into one list ranked by weighted score.

    Hits are keyed by their ``id``. A chunk found by both searches gets
    both scores (and the keyword snippet, when that snippet is non-empty);
    a chunk found by only one search gets 0.0 for the missing score. The
    final score is ``vector_weight * vector_score + text_weight * text_score``.

    Args:
        vector: Vector hits; each dict has id, path, start_line, end_line,
            source, snippet and (optionally) vector_score.
        keyword: Keyword hits; each dict has id, path, start_line, end_line,
            source, snippet and (optionally) text_score.
        vector_weight: Multiplier applied to the vector score.
        text_weight: Multiplier applied to the keyword score.

    Returns:
        One MemorySearchResult per distinct id, highest score first.
    """
    shared_fields = ("id", "path", "start_line", "end_line", "source", "snippet")

    def _seed(hit: dict) -> dict:
        # Copy the fields both result kinds have in common.
        return {field: hit[field] for field in shared_fields}

    combined: dict[str, dict] = {}

    # Vector hits go in first; they have no keyword score yet.
    for hit in vector:
        entry = _seed(hit)
        entry["vector_score"] = hit.get("vector_score", 0.0)
        entry["text_score"] = 0.0
        combined[hit["id"]] = entry

    # Keyword hits either enrich an existing entry or start a new one.
    for hit in keyword:
        known = combined.get(hit["id"])
        if not known:
            entry = _seed(hit)
            entry["vector_score"] = 0.0
            entry["text_score"] = hit.get("text_score", 0.0)
            combined[hit["id"]] = entry
            continue
        known["text_score"] = hit.get("text_score", 0.0)
        # A non-empty keyword snippet replaces the vector snippet.
        if hit.get("snippet"):
            known["snippet"] = hit["snippet"]

    def _to_result(entry: dict) -> MemorySearchResult:
        weighted = (
            vector_weight * entry["vector_score"]
            + text_weight * entry["text_score"]
        )
        origin = entry["source"]
        # String sources are converted to the MemorySource type.
        if isinstance(origin, str):
            origin = MemorySource(origin)
        return MemorySearchResult(
            path=entry["path"],
            start_line=entry["start_line"],
            end_line=entry["end_line"],
            score=weighted,
            snippet=entry["snippet"],
            source=origin,
        )

    # Stable sort: entries with equal scores keep their insertion order.
    return sorted(
        (_to_result(entry) for entry in combined.values()),
        key=lambda result: result.score,
        reverse=True,
    )