первый коммит (first commit)

This commit is contained in:
2026-02-27 21:28:09 +03:00
parent 43c404f958
commit 1bc57a7c25
171 changed files with 6400 additions and 556 deletions

View File

@@ -0,0 +1,660 @@
from __future__ import annotations
from dataclasses import dataclass
import hashlib
from sqlalchemy import text
from app.modules.shared.db import get_engine
@dataclass
class RagJobRow:
    """Flat, read-only view of one row of the ``rag_index_jobs`` table."""

    index_job_id: str      # primary key of the indexing job
    rag_session_id: str    # owning RAG session id
    status: str            # job state string as written by create_job/update_job
    indexed_files: int     # files successfully indexed
    failed_files: int      # files that failed to index
    cache_hit_files: int   # files served from the blob/chunk cache
    cache_miss_files: int  # files that had to be (re)processed
    error_code: str | None    # populated only on failure
    error_desc: str | None    # human-readable failure description
    error_module: str | None  # module that raised the failure
class RagRepository:
    """Persistence layer for RAG sessions, index jobs, chunks, and caches.

    Backed by PostgreSQL with the pgvector extension; all access goes through
    the shared SQLAlchemy engine from ``get_engine()``.
    """

    def ensure_tables(self) -> None:
        """Create (idempotently) every table, column, and index this repo uses.

        Statement order matters: the pgvector extension must exist before any
        ``vector`` column, and the ``ALTER TABLE ... ADD COLUMN IF NOT EXISTS``
        migrations must run after the ``CREATE TABLE IF NOT EXISTS`` statements
        so pre-existing installations pick up newer columns.
        """
        engine = get_engine()
        with engine.connect() as conn:
            # pgvector is required for the `vector`-typed embedding columns.
            conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
            # One row per RAG session, keyed by session id.
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_sessions (
                        rag_session_id VARCHAR(64) PRIMARY KEY,
                        project_id VARCHAR(512) NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            # One row per indexing job, with progress counters and error info.
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_index_jobs (
                        index_job_id VARCHAR(64) PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        status VARCHAR(16) NOT NULL,
                        indexed_files INTEGER NOT NULL DEFAULT 0,
                        failed_files INTEGER NOT NULL DEFAULT 0,
                        cache_hit_files INTEGER NOT NULL DEFAULT 0,
                        cache_miss_files INTEGER NOT NULL DEFAULT 0,
                        error_code VARCHAR(128) NULL,
                        error_desc TEXT NULL,
                        error_module VARCHAR(64) NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            # Per-session chunk storage; `embedding` is a pgvector column.
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_chunks (
                        id BIGSERIAL PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        path TEXT NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        content TEXT NOT NULL,
                        embedding vector NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            # Migrations: metadata columns added after the initial schema shipped.
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section TEXT NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_id TEXT NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_version TEXT NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS owner TEXT NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL"))
            conn.execute(
                text(
                    """
                    ALTER TABLE rag_chunks
                    ADD COLUMN IF NOT EXISTS created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    """
                )
            )
            conn.execute(
                text(
                    """
                    ALTER TABLE rag_chunks
                    ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    """
                )
            )
            # Cross-session cache keyed by (repo, blob sha, path): file-level metadata.
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_blob_cache (
                        id BIGSERIAL PRIMARY KEY,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        path TEXT NOT NULL,
                        artifact_type VARCHAR(16) NULL,
                        section TEXT NULL,
                        doc_id TEXT NULL,
                        doc_version TEXT NULL,
                        owner TEXT NULL,
                        system_component TEXT NULL,
                        last_modified TIMESTAMPTZ NULL,
                        staleness_score DOUBLE PRECISION NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        CONSTRAINT uq_rag_blob_cache UNIQUE (repo_id, blob_sha, path)
                    )
                    """
                )
            )
            # Cross-session cache of chunk content + embeddings per blob.
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_chunk_cache (
                        id BIGSERIAL PRIMARY KEY,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        content TEXT NOT NULL,
                        embedding vector NULL,
                        section TEXT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        CONSTRAINT uq_rag_chunk_cache UNIQUE (repo_id, blob_sha, chunk_index)
                    )
                    """
                )
            )
            # Mapping from a session to the cached blobs/chunks it references.
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_session_chunk_map (
                        id BIGSERIAL PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        path TEXT NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            # Lookup indexes matching the query patterns used elsewhere in this class.
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks (rag_session_id)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_artifact_type ON rag_chunks (artifact_type)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_doc ON rag_chunks (doc_id, doc_version)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_component ON rag_chunks (system_component)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_path ON rag_chunks (path)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_blob_cache_repo_blob ON rag_blob_cache (repo_id, blob_sha)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunk_cache_repo_blob ON rag_chunk_cache (repo_id, blob_sha, chunk_index)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_session_chunk_map_session ON rag_session_chunk_map (rag_session_id, created_at DESC)"))
            # Migrations for installs created before the cache counters existed.
            conn.execute(text("ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_hit_files INTEGER NOT NULL DEFAULT 0"))
            conn.execute(text("ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_miss_files INTEGER NOT NULL DEFAULT 0"))
            conn.commit()
def upsert_session(self, rag_session_id: str, project_id: str) -> None:
    """Insert a session row, or update its project id if it already exists."""
    statement = text(
        """
        INSERT INTO rag_sessions (rag_session_id, project_id)
        VALUES (:sid, :pid)
        ON CONFLICT (rag_session_id) DO UPDATE SET project_id = EXCLUDED.project_id
        """
    )
    params = {"sid": rag_session_id, "pid": project_id}
    with get_engine().connect() as conn:
        conn.execute(statement, params)
        conn.commit()
def session_exists(self, rag_session_id: str) -> bool:
    """Return True when a session row with this id is present."""
    query = text("SELECT 1 FROM rag_sessions WHERE rag_session_id = :sid")
    with get_engine().connect() as conn:
        found = conn.execute(query, {"sid": rag_session_id}).fetchone()
    return found is not None
def get_session(self, rag_session_id: str) -> dict | None:
    """Fetch a session row as a plain dict, or None when absent."""
    query = text("SELECT rag_session_id, project_id FROM rag_sessions WHERE rag_session_id = :sid")
    with get_engine().connect() as conn:
        record = conn.execute(query, {"sid": rag_session_id}).mappings().fetchone()
    if record is None:
        return None
    return dict(record)
def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None:
    """Insert a fresh indexing-job row with the given initial status."""
    statement = text(
        """
        INSERT INTO rag_index_jobs (index_job_id, rag_session_id, status)
        VALUES (:jid, :sid, :status)
        """
    )
    params = {"jid": index_job_id, "sid": rag_session_id, "status": status}
    with get_engine().connect() as conn:
        conn.execute(statement, params)
        conn.commit()
def update_job(
    self,
    index_job_id: str,
    *,
    status: str,
    indexed_files: int,
    failed_files: int,
    cache_hit_files: int = 0,
    cache_miss_files: int = 0,
    error_code: str | None = None,
    error_desc: str | None = None,
    error_module: str | None = None,
) -> None:
    """Overwrite a job's status, counters, and error fields; bumps updated_at."""
    statement = text(
        """
        UPDATE rag_index_jobs
        SET status = :status,
            indexed_files = :indexed,
            failed_files = :failed,
            cache_hit_files = :cache_hit_files,
            cache_miss_files = :cache_miss_files,
            error_code = :ecode,
            error_desc = :edesc,
            error_module = :emodule,
            updated_at = CURRENT_TIMESTAMP
        WHERE index_job_id = :jid
        """
    )
    # Bind-parameter names must match the SQL above exactly.
    params = {
        "jid": index_job_id,
        "status": status,
        "indexed": indexed_files,
        "failed": failed_files,
        "cache_hit_files": cache_hit_files,
        "cache_miss_files": cache_miss_files,
        "ecode": error_code,
        "edesc": error_desc,
        "emodule": error_module,
    }
    with get_engine().connect() as conn:
        conn.execute(statement, params)
        conn.commit()
def get_job(self, index_job_id: str) -> RagJobRow | None:
    """Load one indexing job as a RagJobRow, or None when the id is unknown."""
    query = text(
        """
        SELECT index_job_id, rag_session_id, status, indexed_files, failed_files,
               cache_hit_files, cache_miss_files, error_code, error_desc, error_module
        FROM rag_index_jobs
        WHERE index_job_id = :jid
        """
    )
    with get_engine().connect() as conn:
        record = conn.execute(query, {"jid": index_job_id}).mappings().fetchone()
    if record is None:
        return None
    # Column names match the dataclass fields one-to-one.
    return RagJobRow(**dict(record))
def replace_chunks(self, rag_session_id: str, items: list[dict]) -> None:
    """Drop every chunk (and mapping row) for a session, then insert `items`."""
    params = {"sid": rag_session_id}
    with get_engine().connect() as conn:
        conn.execute(text("DELETE FROM rag_chunks WHERE rag_session_id = :sid"), params)
        conn.execute(text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid"), params)
        self._insert_chunks(conn, rag_session_id, items)
        conn.commit()
def apply_changes(self, rag_session_id: str, delete_paths: list[str], upserts: list[dict]) -> None:
    """Incrementally update a session: remove deleted paths, replace upserted ones."""
    with get_engine().connect() as conn:

        def purge(paths: list[str]) -> None:
            # Remove both the chunk rows and their session/blob mapping rows.
            params = {"sid": rag_session_id, "paths": paths}
            conn.execute(
                text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                params,
            )
            conn.execute(
                text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                params,
            )

        if delete_paths:
            purge(delete_paths)
        if upserts:
            # Delete-then-insert acts as a replace for every upserted path.
            purge(sorted({str(entry["path"]) for entry in upserts}))
            self._insert_chunks(conn, rag_session_id, upserts)
        conn.commit()
def get_cached_chunks(self, repo_id: str, blob_sha: str) -> list[dict]:
    """Return cached chunks for a blob, ordered by chunk index.

    Embeddings are stored as pgvector values and come back as their text
    form, which is decoded into a list of floats via ``_parse_vector``.
    """
    query = text(
        """
        SELECT chunk_index, content, embedding::text AS embedding_txt, section
        FROM rag_chunk_cache
        WHERE repo_id = :repo_id AND blob_sha = :blob_sha
        ORDER BY chunk_index ASC
        """
    )
    with get_engine().connect() as conn:
        records = conn.execute(query, {"repo_id": repo_id, "blob_sha": blob_sha}).mappings().fetchall()
    return [
        {
            "chunk_index": int(rec["chunk_index"]),
            "content": str(rec["content"] or ""),
            "embedding": self._parse_vector(str(rec["embedding_txt"] or "")),
            "section": rec.get("section"),
        }
        for rec in records
    ]
def record_repo_cache(
    self,
    *,
    project_id: str,
    commit_sha: str | None,
    changed_files: list[str],
    summary: str,
) -> None:
    """Record webhook-reported file changes into the cache tables.

    For each changed path it writes a synthetic blob-cache row, a single
    chunk-cache row (index 0, no embedding) holding a short summary, and a
    mapping row under the pseudo-session ``repo:<project_id>``.
    The blob sha is a SHA-256 of ``<commit_sha or 'no-commit'>:<path>`` —
    a synthetic key, not the real git blob hash.
    """
    repo_session_id = f"repo:{project_id}"
    with get_engine().connect() as conn:
        for path in changed_files:
            # Derive a deterministic per-(commit, path) cache key.
            key = f"{commit_sha or 'no-commit'}:{path}"
            blob_sha = hashlib.sha256(key.encode("utf-8")).hexdigest()
            # Upsert the file-level blob cache entry; on conflict only touch updated_at.
            conn.execute(
                text(
                    """
                    INSERT INTO rag_blob_cache (
                        repo_id,
                        blob_sha,
                        path,
                        artifact_type,
                        section
                    )
                    VALUES (
                        :repo_id,
                        :blob_sha,
                        :path,
                        :artifact_type,
                        :section
                    )
                    ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
                        updated_at = CURRENT_TIMESTAMP
                    """
                ),
                {
                    "repo_id": project_id,
                    "blob_sha": blob_sha,
                    "path": path,
                    "artifact_type": "CODE",
                    "section": "repo_webhook",
                },
            )
            # Upsert a single summary chunk (index 0) with no embedding.
            conn.execute(
                text(
                    """
                    INSERT INTO rag_chunk_cache (
                        repo_id,
                        blob_sha,
                        chunk_index,
                        content,
                        embedding,
                        section
                    )
                    VALUES (
                        :repo_id,
                        :blob_sha,
                        0,
                        :content,
                        NULL,
                        :section
                    )
                    ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET
                        content = EXCLUDED.content,
                        section = EXCLUDED.section,
                        updated_at = CURRENT_TIMESTAMP
                    """
                ),
                {
                    "repo_id": project_id,
                    "blob_sha": blob_sha,
                    # Summary is truncated to keep the cached content small.
                    "content": f"repo_webhook:{path}:{summary[:300]}",
                    "section": "repo_webhook",
                },
            )
            # Link the cached chunk to the repo pseudo-session.
            # NOTE(review): this is a plain INSERT (no ON CONFLICT), so repeated
            # webhooks for the same path add duplicate mapping rows — confirm intended.
            conn.execute(
                text(
                    """
                    INSERT INTO rag_session_chunk_map (
                        rag_session_id,
                        repo_id,
                        blob_sha,
                        chunk_index,
                        path
                    )
                    VALUES (
                        :rag_session_id,
                        :repo_id,
                        :blob_sha,
                        0,
                        :path
                    )
                    """
                ),
                {
                    "rag_session_id": repo_session_id,
                    "repo_id": project_id,
                    "blob_sha": blob_sha,
                    "path": path,
                },
            )
        conn.commit()
def cache_file_chunks(self, repo_id: str, path: str, blob_sha: str, items: list[dict]) -> None:
    """Cache a file's chunks (content + embeddings) keyed by (repo, blob sha).

    Writes one blob-cache row with the file-level metadata and one
    chunk-cache row per item. No-op when `items` is empty.
    """
    if not items:
        return
    # File-level metadata is taken from the first chunk; assumes all chunks
    # of a file share the same metadata — TODO confirm with the chunker.
    meta = items[0]
    with get_engine().connect() as conn:
        # Upsert the file-level metadata row.
        conn.execute(
            text(
                """
                INSERT INTO rag_blob_cache (
                    repo_id,
                    blob_sha,
                    path,
                    artifact_type,
                    section,
                    doc_id,
                    doc_version,
                    owner,
                    system_component,
                    last_modified,
                    staleness_score
                )
                VALUES (
                    :repo_id,
                    :blob_sha,
                    :path,
                    :artifact_type,
                    :section,
                    :doc_id,
                    :doc_version,
                    :owner,
                    :system_component,
                    :last_modified,
                    :staleness_score
                )
                ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
                    artifact_type = EXCLUDED.artifact_type,
                    section = EXCLUDED.section,
                    doc_id = EXCLUDED.doc_id,
                    doc_version = EXCLUDED.doc_version,
                    owner = EXCLUDED.owner,
                    system_component = EXCLUDED.system_component,
                    last_modified = EXCLUDED.last_modified,
                    staleness_score = EXCLUDED.staleness_score,
                    updated_at = CURRENT_TIMESTAMP
                """
            ),
            {
                "repo_id": repo_id,
                "blob_sha": blob_sha,
                "path": path,
                "artifact_type": meta.get("artifact_type"),
                "section": meta.get("section"),
                "doc_id": meta.get("doc_id"),
                "doc_version": meta.get("doc_version"),
                "owner": meta.get("owner"),
                "system_component": meta.get("system_component"),
                "last_modified": meta.get("last_modified"),
                "staleness_score": meta.get("staleness_score"),
            },
        )
        # Upsert each chunk; the embedding is serialized to pgvector's
        # text literal form ("[x,y,...]") and cast server-side, NULL when absent.
        for item in items:
            emb = item.get("embedding") or []
            emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
            conn.execute(
                text(
                    """
                    INSERT INTO rag_chunk_cache (
                        repo_id,
                        blob_sha,
                        chunk_index,
                        content,
                        embedding,
                        section
                    )
                    VALUES (
                        :repo_id,
                        :blob_sha,
                        :chunk_index,
                        :content,
                        CAST(:embedding AS vector),
                        :section
                    )
                    ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET
                        content = EXCLUDED.content,
                        embedding = EXCLUDED.embedding,
                        section = EXCLUDED.section,
                        updated_at = CURRENT_TIMESTAMP
                    """
                ),
                {
                    "repo_id": repo_id,
                    "blob_sha": blob_sha,
                    "chunk_index": int(item["chunk_index"]),
                    "content": item["content"],
                    "embedding": emb_str,
                    "section": item.get("section"),
                },
            )
        conn.commit()
def retrieve(self, rag_session_id: str, query_embedding: list[float], limit: int = 5) -> list[dict]:
    """Return the `limit` chunks nearest to the query embedding.

    Uses pgvector's cosine-distance operator (``<=>``) for the ordering;
    the query vector is passed as a text literal and cast server-side.
    """
    vector_literal = "[" + ",".join(map(str, query_embedding)) + "]"
    query = text(
        """
        SELECT path, content
        FROM rag_chunks
        WHERE rag_session_id = :sid
        ORDER BY embedding <=> CAST(:emb AS vector)
        LIMIT :lim
        """
    )
    params = {"sid": rag_session_id, "emb": vector_literal, "lim": limit}
    with get_engine().connect() as conn:
        records = conn.execute(query, params).mappings().fetchall()
    return [dict(rec) for rec in records]
def fallback_chunks(self, rag_session_id: str, limit: int = 5) -> list[dict]:
    """Return the most recently inserted chunks for a session (newest id first).

    Used when similarity search is unavailable (e.g. no query embedding).
    """
    query = text(
        """
        SELECT path, content
        FROM rag_chunks
        WHERE rag_session_id = :sid
        ORDER BY id DESC
        LIMIT :lim
        """
    )
    with get_engine().connect() as conn:
        records = conn.execute(query, {"sid": rag_session_id, "lim": limit}).mappings().fetchall()
    return [dict(rec) for rec in records]
def _insert_chunks(self, conn, rag_session_id: str, items: list[dict]) -> None:
    """Insert chunk rows for a session on an already-open connection.

    Does not commit — callers (replace_chunks / apply_changes) own the
    transaction. For items that carry both a repo_id and blob_sha, a
    session→blob mapping row is also written.
    """
    for item in items:
        # Serialize the embedding to pgvector's text literal ("[x,y,...]"),
        # or NULL when the item has no embedding.
        emb = item.get("embedding") or []
        emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
        conn.execute(
            text(
                """
                INSERT INTO rag_chunks (
                    rag_session_id,
                    path,
                    chunk_index,
                    content,
                    embedding,
                    artifact_type,
                    section,
                    doc_id,
                    doc_version,
                    owner,
                    system_component,
                    last_modified,
                    staleness_score,
                    created_at,
                    updated_at
                )
                VALUES (
                    :sid,
                    :path,
                    :idx,
                    :content,
                    CAST(:emb AS vector),
                    :artifact_type,
                    :section,
                    :doc_id,
                    :doc_version,
                    :owner,
                    :system_component,
                    :last_modified,
                    :staleness_score,
                    CURRENT_TIMESTAMP,
                    CURRENT_TIMESTAMP
                )
                """
            ),
            {
                "sid": rag_session_id,
                "path": item["path"],
                "idx": int(item["chunk_index"]),
                "content": item["content"],
                "emb": emb_str,
                "artifact_type": item.get("artifact_type"),
                "section": item.get("section"),
                "doc_id": item.get("doc_id"),
                "doc_version": item.get("doc_version"),
                "owner": item.get("owner"),
                "system_component": item.get("system_component"),
                "last_modified": item.get("last_modified"),
                "staleness_score": item.get("staleness_score"),
            },
        )
        # Optionally link the chunk back to its cached blob for this session.
        repo_id = str(item.get("repo_id") or "").strip()
        blob_sha = str(item.get("blob_sha") or "").strip()
        if repo_id and blob_sha:
            conn.execute(
                text(
                    """
                    INSERT INTO rag_session_chunk_map (
                        rag_session_id,
                        repo_id,
                        blob_sha,
                        chunk_index,
                        path
                    ) VALUES (
                        :sid,
                        :repo_id,
                        :blob_sha,
                        :chunk_index,
                        :path
                    )
                    """
                ),
                {
                    "sid": rag_session_id,
                    "repo_id": repo_id,
                    "blob_sha": blob_sha,
                    "chunk_index": int(item["chunk_index"]),
                    "path": item["path"],
                },
            )
def _parse_vector(self, value: str) -> list[float]:
text_value = value.strip()
if not text_value:
return []
if text_value.startswith("[") and text_value.endswith("]"):
text_value = text_value[1:-1]
if not text_value:
return []
return [float(part.strip()) for part in text_value.split(",") if part.strip()]