"""
MemoryManager — the main memory system orchestrator.
Port of OpenClaw's MemoryIndexManager (src/memory/manager.ts, 2,300 LOC).

Lifecycle: sync → chunk → embed → store → search

Key features:
  • Incremental sync — only re-indexes changed files (hash-based)
  • Hybrid search — vector (0.7) + BM25 keyword (0.3)
  • File watching — auto re-index on workspace changes (via watchdog)
  • Embedding cache — avoids re-computing embeddings for unchanged chunks
  • Session log indexing — indexes daily/ conversation transcripts
"""

import json
import logging
import os
import sqlite3
import threading
import time
import uuid
from pathlib import Path

from memory.embeddings import embed_batch, embed_query, get_embedding_dims
from memory.hybrid import bm25_rank_to_score, build_fts_query, merge_hybrid_results
from memory.internal import (
    build_file_entry,
    chunk_markdown,
    hash_text,
    list_memory_files,
)
from memory.schema import ensure_schema
from memory.types import (
    MemoryConfig,
    MemorySearchResult,
    MemorySource,
)

# Module-level logger; every diagnostic in this file is emitted under the
# "aetheel.memory" logger name.
logger = logging.getLogger("aetheel.memory")

# Upper bound (in characters) on the chunk text copied into a search-result
# snippet by the vector and keyword search helpers.
SNIPPET_MAX_CHARS = 700


class MemoryManager:
    """
    Main memory system — manages the full lifecycle:
      sync → chunk → embed → store → search

    Inspired by OpenClaw's MemoryIndexManager.
    """

    def __init__(self, config: MemoryConfig | None = None):
        self._config = config or MemoryConfig()
        self._workspace_dir = str(
            Path(self._config.workspace_dir).expanduser().resolve()
        )
        self._db_path = str(Path(self._config.db_path).expanduser().resolve())
        self._sessions_dir = (
            str(Path(self._config.sessions_dir).expanduser().resolve())
            if self._config.sessions_dir
            else os.path.join(self._workspace_dir, "daily")
        )

        # Ensure directories exist
        os.makedirs(self._workspace_dir, exist_ok=True)
        os.makedirs(self._sessions_dir, exist_ok=True)
        os.makedirs(os.path.dirname(self._db_path), exist_ok=True)

        # Open database and ensure schema
        self._db = sqlite3.connect(self._db_path, check_same_thread=False)
        self._db.row_factory = sqlite3.Row
        schema_result = ensure_schema(self._db, fts_enabled=True)
        self._fts_available = schema_result.get("fts_available", False)

        # State
        self._dirty = True
        self._syncing = False
        self._sync_lock = threading.Lock()
        self._closed = False
        self._watcher = None

        # Create default identity files if they don't exist
        self._ensure_identity_files()

        logger.info(
            f"MemoryManager initialized: workspace={self._workspace_dir}, "
            f"db={self._db_path}, fts={self._fts_available}"
        )

    # ── Identity File Bootstrap ──────────────────────────────────────

    def _ensure_identity_files(self) -> None:
        """Create default SOUL.md, USER.md, MEMORY.md if they don't exist."""
        defaults = {
            "SOUL.md": (
                "# SOUL.md — Who You Are\n\n"
                "_You're not a chatbot. You're becoming someone._\n\n"
                "## Core Truths\n\n"
                "**Be genuinely helpful, not performatively helpful.** "
                "Skip the filler — just help.\n\n"
                "**Have opinions.** You're allowed to disagree, prefer things, "
                "find stuff amusing or boring.\n\n"
                "**Be resourceful before asking.** Try to figure it out first. "
                "Then ask if you're stuck.\n\n"
                "**Earn trust through competence.** Be careful with external actions. "
                "Be bold with internal ones.\n\n"
                "## Boundaries\n\n"
                "- Private things stay private. Period.\n"
                "- When in doubt, ask before acting externally.\n"
                "- Never send half-baked replies.\n\n"
                "## Continuity\n\n"
                "Each session, you wake up fresh. These files _are_ your memory. "
                "Read them. Update them. They're how you persist.\n\n"
                "---\n\n"
                "_This file is yours to evolve. As you learn who you are, update it._\n"
            ),
            "USER.md": (
                "# USER.md — Who I Am\n\n"
                "## About Me\n\n"
                "<!-- Fill in your details -->\n"
                "- **Name:** \n"
                "- **Role:** \n"
                "- **Location:** \n"
                "- **Timezone:** \n\n"
                "## Preferences\n\n"
                "<!-- How you like to communicate -->\n"
                "- **Communication style:** \n"
                "- **Response length:** \n"
                "- **Technical level:** \n\n"
                "## Current Focus\n\n"
                "<!-- What you're working on -->\n\n"
                "## Tools & Services\n\n"
                "<!-- Services you use regularly -->\n\n"
                "---\n\n"
                "_Update this file as your preferences evolve._\n"
            ),
            "MEMORY.md": (
                "# MEMORY.md — Long-Term Memory\n\n"
                "## Decisions & Lessons\n\n"
                "<!-- Record important decisions and lessons learned -->\n\n"
                "## Context\n\n"
                "<!-- Persistent context that should carry across sessions -->\n\n"
                "## Notes\n\n"
                "<!-- Anything worth remembering -->\n\n"
                "---\n\n"
                "_This file persists across sessions. "
                "Update it when you learn something important._\n"
            ),
        }

        for filename, content in defaults.items():
            filepath = os.path.join(self._workspace_dir, filename)
            if not os.path.exists(filepath):
                with open(filepath, "w", encoding="utf-8") as f:
                    f.write(content)
                logger.info(f"Created default identity file: {filepath}")

    # ── Search ───────────────────────────────────────────────────────

    async def search(
        self,
        query: str,
        *,
        max_results: int | None = None,
        min_score: float | None = None,
    ) -> list[MemorySearchResult]:
        """
        Search memory using hybrid vector + keyword search.
        Port of OpenClaw's MemoryIndexManager.search().

        Steps:
          1. (Optional) Trigger sync if dirty
          2. Run FTS5 keyword search → BM25 scored
          3. Generate query embedding → vector search
          4. Merge results with weighted scoring (0.7v + 0.3k)
          5. Filter by min_score and return top-N results
        """
        # Auto-sync if dirty
        if self._config.sync_on_search and self._dirty:
            await self.sync()

        cleaned = query.strip()
        if not cleaned:
            return []

        max_r = max_results or self._config.max_results
        min_s = min_score if min_score is not None else self._config.min_score
        candidates = min(200, max(1, max_r * 3))

        # Keyword search (BM25)
        keyword_results = self._search_keyword(cleaned, candidates)

        # Vector search
        try:
            query_vec = embed_query(cleaned, self._config.embedding_model)
            has_vector = any(v != 0 for v in query_vec)
        except Exception as e:
            logger.warning(f"Embedding failed, falling back to keyword-only: {e}")
            query_vec = []
            has_vector = False

        vector_results = (
            self._search_vector(query_vec, candidates) if has_vector else []
        )

        # If no keyword results, return vector-only
        if not keyword_results:
            return [
                r for r in self._vector_to_search_results(vector_results)
                if r.score >= min_s
            ][:max_r]

        # Merge hybrid results
        merged = merge_hybrid_results(
            vector=vector_results,
            keyword=keyword_results,
            vector_weight=self._config.vector_weight,
            text_weight=self._config.text_weight,
        )

        return [r for r in merged if r.score >= min_s][:max_r]

    def _search_vector(
        self, query_vec: list[float], limit: int
    ) -> list[dict]:
        """
        Search chunks by vector cosine similarity.
        Uses embedding stored as JSON in the chunks table.
        """
        if not query_vec:
            return []

        try:
            rows = self._db.execute(
                "SELECT id, path, start_line, end_line, source, text, embedding "
                "FROM chunks ORDER BY rowid"
            ).fetchall()
        except Exception as e:
            logger.warning(f"Vector search failed: {e}")
            return []

        from memory.internal import cosine_similarity

        results = []
        for row in rows:
            try:
                stored_vec = json.loads(row["embedding"])
                if not stored_vec:
                    continue
                score = cosine_similarity(query_vec, stored_vec)
                snippet = row["text"][:SNIPPET_MAX_CHARS]
                results.append({
                    "id": row["id"],
                    "path": row["path"],
                    "start_line": row["start_line"],
                    "end_line": row["end_line"],
                    "source": row["source"],
                    "snippet": snippet,
                    "vector_score": max(0.0, score),
                })
            except (json.JSONDecodeError, TypeError):
                continue

        results.sort(key=lambda r: r["vector_score"], reverse=True)
        return results[:limit]

    def _search_keyword(self, query: str, limit: int) -> list[dict]:
        """
        Search chunks using FTS5 full-text search with BM25 ranking.
        Port of OpenClaw's searchKeyword().
        """
        if not self._fts_available:
            return []

        fts_query = build_fts_query(query)
        if not fts_query:
            return []

        try:
            rows = self._db.execute(
                "SELECT id, path, start_line, end_line, source, text, "
                "rank AS bm25_rank "
                "FROM chunks_fts "
                "WHERE chunks_fts MATCH ? "
                "ORDER BY rank "
                "LIMIT ?",
                (fts_query, limit),
            ).fetchall()
        except Exception as e:
            logger.debug(f"FTS search failed for query '{fts_query}': {e}")
            return []

        results = []
        for row in rows:
            # FTS5 rank is negative (lower = better), convert to 0-1 score
            bm25_rank = abs(row["bm25_rank"]) if row["bm25_rank"] else 999.0
            text_score = bm25_rank_to_score(bm25_rank)
            snippet = row["text"][:SNIPPET_MAX_CHARS]
            results.append({
                "id": row["id"],
                "path": row["path"],
                "start_line": row["start_line"],
                "end_line": row["end_line"],
                "source": row["source"],
                "snippet": snippet,
                "text_score": text_score,
            })

        return results

    def _vector_to_search_results(
        self, vector_results: list[dict]
    ) -> list[MemorySearchResult]:
        """Convert raw vector results to MemorySearchResult objects."""
        return [
            MemorySearchResult(
                path=r["path"],
                start_line=r["start_line"],
                end_line=r["end_line"],
                score=r["vector_score"],
                snippet=r["snippet"],
                source=MemorySource(r["source"]),
            )
            for r in vector_results
        ]

    # ── Sync ─────────────────────────────────────────────────────────

    async def sync(self, *, force: bool = False) -> dict:
        """
        Synchronize workspace markdown files into the index.
        Port of OpenClaw's MemoryIndexManager.sync().

        Steps:
          1. List all memory files (SOUL.md, USER.md, MEMORY.md, memory/*)
          2. For each file, check if content hash has changed
          3. If changed: chunk → embed → store in DB
          4. Remove stale entries for deleted files
          5. Optionally sync session logs from daily/

        Returns a summary dict with counts.
        """
        if self._syncing and not force:
            logger.debug("Sync already in progress, skipping")
            return {"skipped": True}

        with self._sync_lock:
            self._syncing = True
            try:
                return self._run_sync(force=force)
            finally:
                self._syncing = False
                self._dirty = False

    def _run_sync(self, *, force: bool = False) -> dict:
        """Execute the actual sync logic."""
        stats = {
            "files_found": 0,
            "files_indexed": 0,
            "files_skipped": 0,
            "chunks_created": 0,
            "stale_removed": 0,
            "sessions_indexed": 0,
        }

        # ── Memory files ──
        if "memory" in self._config.sources:
            files = list_memory_files(self._workspace_dir)
            stats["files_found"] = len(files)

            active_paths: set[str] = set()

            for abs_path in files:
                entry = build_file_entry(abs_path, self._workspace_dir)
                active_paths.add(entry.path)

                # Check if file has changed
                row = self._db.execute(
                    "SELECT hash FROM files WHERE path = ? AND source = ?",
                    (entry.path, MemorySource.MEMORY.value),
                ).fetchone()

                if not force and row and row["hash"] == entry.hash:
                    stats["files_skipped"] += 1
                    continue

                # File is new or changed — re-index it
                self._index_file(entry, MemorySource.MEMORY)
                stats["files_indexed"] += 1

            # Remove stale entries for deleted files
            stale_rows = self._db.execute(
                "SELECT path FROM files WHERE source = ?",
                (MemorySource.MEMORY.value,),
            ).fetchall()
            for stale in stale_rows:
                if stale["path"] not in active_paths:
                    self._remove_file(stale["path"], MemorySource.MEMORY)
                    stats["stale_removed"] += 1

        # ── Session files ──
        if "sessions" in self._config.sources:
            session_count = self._sync_session_files(force=force)
            stats["sessions_indexed"] = session_count

        # Count total chunks
        row = self._db.execute("SELECT COUNT(*) as c FROM chunks").fetchone()
        stats["chunks_created"] = row["c"] if row else 0

        self._db.commit()

        logger.info(
            f"Sync complete: {stats['files_indexed']} indexed, "
            f"{stats['files_skipped']} unchanged, "
            f"{stats['stale_removed']} removed, "
            f"{stats['chunks_created']} total chunks"
        )
        return stats

    def _index_file(self, entry, source: MemorySource) -> None:
        """
        Index a single file: read → chunk → embed → store.
        Port of OpenClaw's indexFile method.
        """
        try:
            with open(entry.abs_path, "r", encoding="utf-8") as f:
                content = f.read()
        except Exception as e:
            logger.warning(f"Failed to read {entry.abs_path}: {e}")
            return

        if not content.strip():
            return

        # Chunk the content
        chunks = chunk_markdown(
            content,
            chunk_tokens=self._config.chunk_tokens,
            chunk_overlap=self._config.chunk_overlap,
        )

        if not chunks:
            return

        # Check embedding cache and compute new embeddings
        texts_to_embed = []
        chunk_hashes = []
        cached_embeddings: dict[str, list[float]] = {}

        for chunk in chunks:
            # Check cache first
            cache_row = self._db.execute(
                "SELECT embedding FROM embedding_cache WHERE model = ? AND hash = ?",
                (self._config.embedding_model, chunk.hash),
            ).fetchone()

            if cache_row:
                cached_embeddings[chunk.hash] = json.loads(cache_row["embedding"])
            else:
                texts_to_embed.append(chunk.text)
                chunk_hashes.append(chunk.hash)

        # Batch embed uncached chunks
        new_embeddings: dict[str, list[float]] = {}
        if texts_to_embed:
            try:
                vectors = embed_batch(texts_to_embed, self._config.embedding_model)
                now = int(time.time())
                for i, chunk_hash in enumerate(chunk_hashes):
                    vec = vectors[i] if i < len(vectors) else []
                    new_embeddings[chunk_hash] = vec
                    # Store in cache
                    self._db.execute(
                        "INSERT OR REPLACE INTO embedding_cache "
                        "(model, hash, embedding, dims, updated_at) "
                        "VALUES (?, ?, ?, ?, ?)",
                        (
                            self._config.embedding_model,
                            chunk_hash,
                            json.dumps(vec),
                            len(vec),
                            now,
                        ),
                    )
            except Exception as e:
                logger.warning(f"Embedding batch failed for {entry.path}: {e}")
                # Fall back to empty embeddings
                for chunk_hash in chunk_hashes:
                    new_embeddings[chunk_hash] = []

        # Remove old chunks for this file
        self._remove_file_chunks(entry.path, source)

        # Insert new chunks
        now = int(time.time())
        for chunk in chunks:
            chunk_id = str(uuid.uuid4())
            embedding = cached_embeddings.get(chunk.hash) or new_embeddings.get(
                chunk.hash, []
            )

            self._db.execute(
                "INSERT INTO chunks "
                "(id, path, source, start_line, end_line, hash, model, text, embedding, updated_at) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                (
                    chunk_id,
                    entry.path,
                    source.value,
                    chunk.start_line,
                    chunk.end_line,
                    chunk.hash,
                    self._config.embedding_model,
                    chunk.text,
                    json.dumps(embedding),
                    now,
                ),
            )

            # Insert into FTS index
            if self._fts_available:
                try:
                    self._db.execute(
                        "INSERT INTO chunks_fts "
                        "(text, id, path, source, model, start_line, end_line) "
                        "VALUES (?, ?, ?, ?, ?, ?, ?)",
                        (
                            chunk.text,
                            chunk_id,
                            entry.path,
                            source.value,
                            self._config.embedding_model,
                            chunk.start_line,
                            chunk.end_line,
                        ),
                    )
                except Exception as e:
                    logger.debug(f"FTS insert failed for chunk {chunk_id}: {e}")

        # Update files table
        self._db.execute(
            "INSERT OR REPLACE INTO files (path, source, hash, mtime, size) "
            "VALUES (?, ?, ?, ?, ?)",
            (
                entry.path,
                source.value,
                entry.hash,
                int(entry.mtime_ms),
                entry.size,
            ),
        )

    def _remove_file_chunks(self, path: str, source: MemorySource) -> None:
        """Remove all chunks (and FTS entries) for a given file."""
        # Get chunk IDs for FTS cleanup
        if self._fts_available:
            chunk_ids = self._db.execute(
                "SELECT id FROM chunks WHERE path = ? AND source = ?",
                (path, source.value),
            ).fetchall()
            for row in chunk_ids:
                try:
                    self._db.execute(
                        "DELETE FROM chunks_fts WHERE id = ?", (row["id"],)
                    )
                except Exception:
                    pass

        self._db.execute(
            "DELETE FROM chunks WHERE path = ? AND source = ?",
            (path, source.value),
        )

    def _remove_file(self, path: str, source: MemorySource) -> None:
        """Remove a file and all its chunks from the index."""
        self._remove_file_chunks(path, source)
        self._db.execute(
            "DELETE FROM files WHERE path = ? AND source = ?",
            (path, source.value),
        )

    # ── Session Logs ─────────────────────────────────────────────────

    def _sync_session_files(self, *, force: bool = False) -> int:
        """
        Sync session log files from the daily/ directory.
        Returns the number of session files indexed.
        """
        sessions_dir = Path(self._sessions_dir)
        if not sessions_dir.is_dir():
            return 0

        indexed = 0
        active_paths: set[str] = set()

        for md_file in sorted(sessions_dir.glob("*.md")):
            if md_file.is_symlink() or not md_file.is_file():
                continue
            entry = build_file_entry(str(md_file), self._workspace_dir)
            active_paths.add(entry.path)

            # Check if changed
            row = self._db.execute(
                "SELECT hash FROM files WHERE path = ? AND source = ?",
                (entry.path, MemorySource.SESSIONS.value),
            ).fetchone()

            if not force and row and row["hash"] == entry.hash:
                continue

            self._index_file(entry, MemorySource.SESSIONS)
            indexed += 1

        # Clean stale session entries
        stale_rows = self._db.execute(
            "SELECT path FROM files WHERE source = ?",
            (MemorySource.SESSIONS.value,),
        ).fetchall()
        for stale in stale_rows:
            if stale["path"] not in active_paths:
                self._remove_file(stale["path"], MemorySource.SESSIONS)

        return indexed

    def log_session(
        self,
        content: str,
        *,
        date: str | None = None,
        channel: str = "slack",
    ) -> str:
        """
        Append to today's session log in daily/.

        Args:
            content: The text to log (e.g., a user message or AI response).
            date: Optional date string (YYYY-MM-DD). Defaults to today.
            channel: Channel the conversation came from.

        Returns:
            Path to the session log file.
        """
        if date is None:
            date = time.strftime("%Y-%m-%d")

        log_path = os.path.join(self._sessions_dir, f"{date}.md")

        # Create file with header if it doesn't exist
        if not os.path.exists(log_path):
            header = f"# Session Log — {date}\n\n"
            with open(log_path, "w", encoding="utf-8") as f:
                f.write(header)

        # Append the content
        timestamp = time.strftime("%H:%M:%S")
        with open(log_path, "a", encoding="utf-8") as f:
            f.write(f"\n---\n\n**[{timestamp}] ({channel})**\n\n{content}\n")

        # Mark as dirty for next sync
        self._dirty = True

        return log_path

    # ── Identity File Access ─────────────────────────────────────────

    def read_identity_file(self, name: str) -> str | None:
        """Read an identity file (SOUL.md, USER.md, MEMORY.md)."""
        filepath = os.path.join(self._workspace_dir, name)
        if not os.path.isfile(filepath):
            return None
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read()

    def update_identity_file(self, name: str, content: str) -> None:
        """Update an identity file and mark index as dirty."""
        filepath = os.path.join(self._workspace_dir, name)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)
        self._dirty = True
        logger.info(f"Updated identity file: {name}")

    def read_soul(self) -> str | None:
        return self.read_identity_file("SOUL.md")

    def read_user(self) -> str | None:
        return self.read_identity_file("USER.md")

    def read_long_term_memory(self) -> str | None:
        return self.read_identity_file("MEMORY.md")

    def append_to_memory(self, entry: str) -> None:
        """Append a new entry to MEMORY.md."""
        filepath = os.path.join(self._workspace_dir, "MEMORY.md")
        timestamp = time.strftime("%Y-%m-%d %H:%M")
        with open(filepath, "a", encoding="utf-8") as f:
            f.write(f"\n### [{timestamp}]\n\n{entry}\n")
        self._dirty = True
        logger.info("Appended to MEMORY.md")

    # ── File Reading ─────────────────────────────────────────────────

    def read_file(
        self,
        rel_path: str,
        *,
        from_line: int | None = None,
        num_lines: int | None = None,
    ) -> dict:
        """
        Read a memory file by relative path.
        Port of OpenClaw's readFile().
        """
        raw = rel_path.strip()
        if not raw:
            raise ValueError("path required")

        if os.path.isabs(raw):
            abs_path = os.path.realpath(raw)
        else:
            abs_path = os.path.realpath(
                os.path.join(self._workspace_dir, raw)
            )

        if not abs_path.endswith(".md"):
            raise ValueError("Only .md files are supported")

        if not os.path.isfile(abs_path):
            raise FileNotFoundError(f"File not found: {abs_path}")

        with open(abs_path, "r", encoding="utf-8") as f:
            content = f.read()

        if from_line is None and num_lines is None:
            return {"text": content, "path": rel_path}

        lines = content.split("\n")
        start = max(1, from_line or 1)
        count = max(1, num_lines or len(lines))
        sliced = lines[start - 1 : start - 1 + count]
        return {"text": "\n".join(sliced), "path": rel_path}

    # ── Status ───────────────────────────────────────────────────────

    def status(self) -> dict:
        """Get the current status of the memory index."""
        files_row = self._db.execute(
            "SELECT COUNT(*) as c FROM files"
        ).fetchone()
        chunks_row = self._db.execute(
            "SELECT COUNT(*) as c FROM chunks"
        ).fetchone()
        cache_row = self._db.execute(
            "SELECT COUNT(*) as c FROM embedding_cache"
        ).fetchone()

        return {
            "workspace_dir": self._workspace_dir,
            "db_path": self._db_path,
            "sessions_dir": self._sessions_dir,
            "files": files_row["c"] if files_row else 0,
            "chunks": chunks_row["c"] if chunks_row else 0,
            "cached_embeddings": cache_row["c"] if cache_row else 0,
            "fts_available": self._fts_available,
            "dirty": self._dirty,
            "embedding_model": self._config.embedding_model,
            "embedding_dims": get_embedding_dims(self._config.embedding_model),
            "vector_weight": self._config.vector_weight,
            "text_weight": self._config.text_weight,
        }

    # ── File Watching ────────────────────────────────────────────────

    def start_watching(self) -> None:
        """
        Start watching the workspace for file changes.
        Uses watchdog for cross-platform file system events.
        """
        if self._watcher or not self._config.watch:
            return

        try:
            from watchdog.events import FileSystemEventHandler
            from watchdog.observers import Observer
        except ImportError:
            logger.warning(
                "watchdog not installed — file watching disabled. "
                "Install with: uv add watchdog"
            )
            return

        manager = self

        class MemoryFileHandler(FileSystemEventHandler):
            def on_any_event(self, event):
                if event.is_directory:
                    return
                src = getattr(event, "src_path", "")
                if src.endswith(".md"):
                    manager._dirty = True
                    logger.debug(f"Workspace change detected: {src}")

        observer = Observer()
        handler = MemoryFileHandler()
        observer.schedule(handler, self._workspace_dir, recursive=True)
        observer.start()
        self._watcher = observer
        logger.info(f"File watching started: {self._workspace_dir}")

    def stop_watching(self) -> None:
        """Stop the file watcher."""
        if self._watcher:
            self._watcher.stop()
            self._watcher.join()
            self._watcher = None
            logger.info("File watching stopped")

    # ── Lifecycle ────────────────────────────────────────────────────

    def close(self) -> None:
        """Close the memory manager and release resources."""
        if self._closed:
            return
        self._closed = True
        self.stop_watching()
        self._db.close()
        logger.info("MemoryManager closed")

    def __enter__(self) -> "MemoryManager":
        """Enter a ``with`` block; the manager itself is the context value."""
        return self

    def __exit__(self, *args):
        self.close()

    def __del__(self):
        if not self._closed:
            try:
                self.close()
            except Exception:
                pass