from __future__ import annotations

from dataclasses import dataclass
import hashlib

from sqlalchemy import text

from app.modules.shared.db import get_engine


# Ordered, idempotent DDL executed by ``RagRepository.ensure_tables``.
# Order matters: tables are created before the ALTERs that extend them, and
# columns are added before the indexes that reference them.
_SCHEMA_STATEMENTS: tuple[str, ...] = (
    "CREATE EXTENSION IF NOT EXISTS vector",
    """
    CREATE TABLE IF NOT EXISTS rag_sessions (
        rag_session_id VARCHAR(64) PRIMARY KEY,
        project_id VARCHAR(512) NOT NULL,
        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS rag_index_jobs (
        index_job_id VARCHAR(64) PRIMARY KEY,
        rag_session_id VARCHAR(64) NOT NULL,
        status VARCHAR(16) NOT NULL,
        indexed_files INTEGER NOT NULL DEFAULT 0,
        failed_files INTEGER NOT NULL DEFAULT 0,
        cache_hit_files INTEGER NOT NULL DEFAULT 0,
        cache_miss_files INTEGER NOT NULL DEFAULT 0,
        error_code VARCHAR(128) NULL,
        error_desc TEXT NULL,
        error_module VARCHAR(64) NULL,
        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS rag_chunks (
        id BIGSERIAL PRIMARY KEY,
        rag_session_id VARCHAR(64) NOT NULL,
        path TEXT NOT NULL,
        chunk_index INTEGER NOT NULL,
        content TEXT NOT NULL,
        embedding vector NULL,
        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
    )
    """,
    # Columns added after the first schema version; ADD COLUMN IF NOT EXISTS
    # upgrades tables created before these columns existed.
    "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL",
    "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section TEXT NULL",
    "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_id TEXT NULL",
    "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_version TEXT NULL",
    "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS owner TEXT NULL",
    "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL",
    "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL",
    "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL",
    """
    ALTER TABLE rag_chunks
    ADD COLUMN IF NOT EXISTS created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
    """,
    """
    ALTER TABLE rag_chunks
    ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
    """,
    """
    CREATE TABLE IF NOT EXISTS rag_blob_cache (
        id BIGSERIAL PRIMARY KEY,
        repo_id VARCHAR(512) NOT NULL,
        blob_sha VARCHAR(128) NOT NULL,
        path TEXT NOT NULL,
        artifact_type VARCHAR(16) NULL,
        section TEXT NULL,
        doc_id TEXT NULL,
        doc_version TEXT NULL,
        owner TEXT NULL,
        system_component TEXT NULL,
        last_modified TIMESTAMPTZ NULL,
        staleness_score DOUBLE PRECISION NULL,
        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
        CONSTRAINT uq_rag_blob_cache UNIQUE (repo_id, blob_sha, path)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS rag_chunk_cache (
        id BIGSERIAL PRIMARY KEY,
        repo_id VARCHAR(512) NOT NULL,
        blob_sha VARCHAR(128) NOT NULL,
        chunk_index INTEGER NOT NULL,
        content TEXT NOT NULL,
        embedding vector NULL,
        section TEXT NULL,
        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
        CONSTRAINT uq_rag_chunk_cache UNIQUE (repo_id, blob_sha, chunk_index)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS rag_session_chunk_map (
        id BIGSERIAL PRIMARY KEY,
        rag_session_id VARCHAR(64) NOT NULL,
        repo_id VARCHAR(512) NOT NULL,
        blob_sha VARCHAR(128) NOT NULL,
        chunk_index INTEGER NOT NULL,
        path TEXT NOT NULL,
        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
    )
    """,
    "CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks (rag_session_id)",
    "CREATE INDEX IF NOT EXISTS idx_rag_chunks_artifact_type ON rag_chunks (artifact_type)",
    "CREATE INDEX IF NOT EXISTS idx_rag_chunks_doc ON rag_chunks (doc_id, doc_version)",
    "CREATE INDEX IF NOT EXISTS idx_rag_chunks_component ON rag_chunks (system_component)",
    "CREATE INDEX IF NOT EXISTS idx_rag_chunks_path ON rag_chunks (path)",
    "CREATE INDEX IF NOT EXISTS idx_rag_blob_cache_repo_blob ON rag_blob_cache (repo_id, blob_sha)",
    "CREATE INDEX IF NOT EXISTS idx_rag_chunk_cache_repo_blob ON rag_chunk_cache (repo_id, blob_sha, chunk_index)",
    "CREATE INDEX IF NOT EXISTS idx_rag_session_chunk_map_session ON rag_session_chunk_map (rag_session_id, created_at DESC)",
    "ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_hit_files INTEGER NOT NULL DEFAULT 0",
    "ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_miss_files INTEGER NOT NULL DEFAULT 0",
)


@dataclass
class RagJobRow:
    """One row of ``rag_index_jobs`` as returned by ``RagRepository.get_job``."""

    index_job_id: str
    rag_session_id: str
    status: str
    indexed_files: int
    failed_files: int
    cache_hit_files: int
    cache_miss_files: int
    error_code: str | None
    error_desc: str | None
    error_module: str | None


class RagRepository:
    """SQL access layer for RAG sessions, index jobs, chunks and chunk caches.

    Every public method obtains a connection from ``get_engine()``, runs its
    statements on that connection and, for writes, commits explicitly before
    the connection is released.
    """

    def ensure_tables(self) -> None:
        """Create or upgrade the schema by running ``_SCHEMA_STATEMENTS`` in order.

        Every statement uses ``IF NOT EXISTS``, so repeated calls are safe.
        """
        with get_engine().connect() as conn:
            for statement in _SCHEMA_STATEMENTS:
                conn.execute(text(statement))
            conn.commit()

    def upsert_session(self, rag_session_id: str, project_id: str) -> None:
        """Insert the session, or update its ``project_id`` if it already exists."""
        with get_engine().connect() as conn:
            conn.execute(
                text(
                    """
                    INSERT INTO rag_sessions (rag_session_id, project_id)
                    VALUES (:sid, :pid)
                    ON CONFLICT (rag_session_id) DO UPDATE SET project_id = EXCLUDED.project_id
                    """
                ),
                {"sid": rag_session_id, "pid": project_id},
            )
            conn.commit()

    def session_exists(self, rag_session_id: str) -> bool:
        """Return ``True`` when a ``rag_sessions`` row exists for the id."""
        with get_engine().connect() as conn:
            row = conn.execute(
                text("SELECT 1 FROM rag_sessions WHERE rag_session_id = :sid"),
                {"sid": rag_session_id},
            ).fetchone()
        return row is not None

    def get_session(self, rag_session_id: str) -> dict | None:
        """Return ``{"rag_session_id", "project_id"}`` for the session, or ``None``."""
        with get_engine().connect() as conn:
            row = conn.execute(
                text("SELECT rag_session_id, project_id FROM rag_sessions WHERE rag_session_id = :sid"),
                {"sid": rag_session_id},
            ).mappings().fetchone()
        return dict(row) if row else None

    def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None:
        """Insert a new index job; counter columns take their table defaults (0)."""
        with get_engine().connect() as conn:
            conn.execute(
                text(
                    """
                    INSERT INTO rag_index_jobs (index_job_id, rag_session_id, status)
                    VALUES (:jid, :sid, :status)
                    """
                ),
                {"jid": index_job_id, "sid": rag_session_id, "status": status},
            )
            conn.commit()

    def update_job(
        self,
        index_job_id: str,
        *,
        status: str,
        indexed_files: int,
        failed_files: int,
        cache_hit_files: int = 0,
        cache_miss_files: int = 0,
        error_code: str | None = None,
        error_desc: str | None = None,
        error_module: str | None = None,
    ) -> None:
        """Overwrite status, counters and error fields of one job.

        All listed columns are written on every call, so omitted optional
        arguments reset their columns to ``0`` / ``NULL``. ``updated_at`` is
        set to the current timestamp.
        """
        params = {
            "jid": index_job_id,
            "status": status,
            "indexed": indexed_files,
            "failed": failed_files,
            "cache_hit_files": cache_hit_files,
            "cache_miss_files": cache_miss_files,
            "ecode": error_code,
            "edesc": error_desc,
            "emodule": error_module,
        }
        with get_engine().connect() as conn:
            conn.execute(
                text(
                    """
                    UPDATE rag_index_jobs
                    SET status = :status,
                        indexed_files = :indexed,
                        failed_files = :failed,
                        cache_hit_files = :cache_hit_files,
                        cache_miss_files = :cache_miss_files,
                        error_code = :ecode,
                        error_desc = :edesc,
                        error_module = :emodule,
                        updated_at = CURRENT_TIMESTAMP
                    WHERE index_job_id = :jid
                    """
                ),
                params,
            )
            conn.commit()

    def get_job(self, index_job_id: str) -> RagJobRow | None:
        """Return the job as a ``RagJobRow``, or ``None`` when the id is unknown."""
        with get_engine().connect() as conn:
            row = conn.execute(
                text(
                    """
                    SELECT index_job_id, rag_session_id, status, indexed_files, failed_files,
                           cache_hit_files, cache_miss_files, error_code, error_desc, error_module
                    FROM rag_index_jobs
                    WHERE index_job_id = :jid
                    """
                ),
                {"jid": index_job_id},
            ).mappings().fetchone()
        if not row:
            return None
        return RagJobRow(**dict(row))

    def replace_chunks(self, rag_session_id: str, items: list[dict]) -> None:
        """Drop every chunk and chunk mapping of the session, then insert ``items``.

        Deletion and insertion share one transaction.
        """
        session_param = {"sid": rag_session_id}
        with get_engine().connect() as conn:
            conn.execute(text("DELETE FROM rag_chunks WHERE rag_session_id = :sid"), session_param)
            conn.execute(text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid"), session_param)
            self._insert_chunks(conn, rag_session_id, items)
            conn.commit()

    def apply_changes(self, rag_session_id: str, delete_paths: list[str], upserts: list[dict]) -> None:
        """Apply an incremental update to a session in one transaction.

        Rows for every path in ``delete_paths`` are removed; rows for every
        path that appears in ``upserts`` are removed and then re-inserted from
        ``upserts``.
        """
        # One de-duplicated, order-preserving list covers both the explicit
        # deletions and the paths about to be rewritten, so each table needs
        # a single DELETE instead of two.
        stale_paths = list(
            dict.fromkeys(
                [*(delete_paths or []), *(str(item["path"]) for item in (upserts or []))]
            )
        )
        with get_engine().connect() as conn:
            if stale_paths:
                params = {"sid": rag_session_id, "paths": stale_paths}
                conn.execute(
                    text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                    params,
                )
                conn.execute(
                    text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                    params,
                )
            if upserts:
                self._insert_chunks(conn, rag_session_id, upserts)
            conn.commit()

    def get_cached_chunks(self, repo_id: str, blob_sha: str) -> list[dict]:
        """Return cached chunks for ``(repo_id, blob_sha)`` ordered by ``chunk_index``.

        Each dict has the keys ``chunk_index``, ``content``, ``embedding``
        (a list of floats, empty when the stored embedding is NULL) and
        ``section``.
        """
        with get_engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT chunk_index, content, embedding::text AS embedding_txt, section
                    FROM rag_chunk_cache
                    WHERE repo_id = :repo_id AND blob_sha = :blob_sha
                    ORDER BY chunk_index ASC
                    """
                ),
                {"repo_id": repo_id, "blob_sha": blob_sha},
            ).mappings().fetchall()
        return [
            {
                "chunk_index": int(row["chunk_index"]),
                "content": str(row["content"] or ""),
                "embedding": self._parse_vector(str(row["embedding_txt"] or "")),
                "section": row["section"],
            }
            for row in rows
        ]

    def record_repo_cache(
        self,
        *,
        project_id: str,
        commit_sha: str | None,
        changed_files: list[str],
        summary: str,
    ) -> None:
        """Record a placeholder cache entry for each file changed in a commit.

        For every path this upserts a ``rag_blob_cache`` row and a single
        ``rag_chunk_cache`` row (chunk 0, no embedding) and maps it to the
        pseudo session ``repo:<project_id>``. The ``blob_sha`` stored here is
        a SHA-256 of ``"<commit_sha or 'no-commit'>:<path>"``.

        The call is idempotent: repeating it with the same arguments leaves
        exactly one mapping row per path.
        """
        if not changed_files:
            return

        repo_session_id = f"repo:{project_id}"
        # Statements are loop-invariant, so build them once.
        blob_stmt = text(
            """
            INSERT INTO rag_blob_cache (
                repo_id, blob_sha, path, artifact_type, section
            )
            VALUES (
                :repo_id, :blob_sha, :path, :artifact_type, :section
            )
            ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
                updated_at = CURRENT_TIMESTAMP
            """
        )
        chunk_stmt = text(
            """
            INSERT INTO rag_chunk_cache (
                repo_id, blob_sha, chunk_index, content, embedding, section
            )
            VALUES (
                :repo_id, :blob_sha, 0, :content, NULL, :section
            )
            ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET
                content = EXCLUDED.content,
                section = EXCLUDED.section,
                updated_at = CURRENT_TIMESTAMP
            """
        )
        unmap_stmt = text(
            """
            DELETE FROM rag_session_chunk_map
            WHERE rag_session_id = :rag_session_id
              AND repo_id = :repo_id
              AND blob_sha = :blob_sha
              AND chunk_index = 0
              AND path = :path
            """
        )
        map_stmt = text(
            """
            INSERT INTO rag_session_chunk_map (
                rag_session_id, repo_id, blob_sha, chunk_index, path
            )
            VALUES (
                :rag_session_id, :repo_id, :blob_sha, 0, :path
            )
            """
        )

        with get_engine().connect() as conn:
            for path in changed_files:
                key = f"{commit_sha or 'no-commit'}:{path}"
                blob_sha = hashlib.sha256(key.encode("utf-8")).hexdigest()
                conn.execute(
                    blob_stmt,
                    {
                        "repo_id": project_id,
                        "blob_sha": blob_sha,
                        "path": path,
                        "artifact_type": "CODE",
                        "section": "repo_webhook",
                    },
                )
                conn.execute(
                    chunk_stmt,
                    {
                        "repo_id": project_id,
                        "blob_sha": blob_sha,
                        "content": f"repo_webhook:{path}:{summary[:300]}",
                        "section": "repo_webhook",
                    },
                )
                mapping = {
                    "rag_session_id": repo_session_id,
                    "repo_id": project_id,
                    "blob_sha": blob_sha,
                    "path": path,
                }
                # rag_session_chunk_map has no unique constraint, so a replayed
                # webhook (or a duplicate path in changed_files) would otherwise
                # accumulate identical mapping rows. Remove the exact mapping
                # first; the fresh insert also refreshes created_at.
                conn.execute(unmap_stmt, mapping)
                conn.execute(map_stmt, mapping)
            conn.commit()

    def cache_file_chunks(self, repo_id: str, path: str, blob_sha: str, items: list[dict]) -> None:
        """Upsert one file's metadata and chunks into the blob/chunk caches.

        File-level metadata (artifact type, section, doc id, ...) is read from
        the first item. Does nothing when ``items`` is empty.
        """
        if not items:
            return

        meta = items[0]
        chunk_stmt = text(
            """
            INSERT INTO rag_chunk_cache (
                repo_id, blob_sha, chunk_index, content, embedding, section
            )
            VALUES (
                :repo_id, :blob_sha, :chunk_index, :content, CAST(:embedding AS vector), :section
            )
            ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET
                content = EXCLUDED.content,
                embedding = EXCLUDED.embedding,
                section = EXCLUDED.section,
                updated_at = CURRENT_TIMESTAMP
            """
        )
        with get_engine().connect() as conn:
            conn.execute(
                text(
                    """
                    INSERT INTO rag_blob_cache (
                        repo_id, blob_sha, path, artifact_type, section, doc_id, doc_version,
                        owner, system_component, last_modified, staleness_score
                    )
                    VALUES (
                        :repo_id, :blob_sha, :path, :artifact_type, :section, :doc_id, :doc_version,
                        :owner, :system_component, :last_modified, :staleness_score
                    )
                    ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
                        artifact_type = EXCLUDED.artifact_type,
                        section = EXCLUDED.section,
                        doc_id = EXCLUDED.doc_id,
                        doc_version = EXCLUDED.doc_version,
                        owner = EXCLUDED.owner,
                        system_component = EXCLUDED.system_component,
                        last_modified = EXCLUDED.last_modified,
                        staleness_score = EXCLUDED.staleness_score,
                        updated_at = CURRENT_TIMESTAMP
                    """
                ),
                {
                    "repo_id": repo_id,
                    "blob_sha": blob_sha,
                    "path": path,
                    "artifact_type": meta.get("artifact_type"),
                    "section": meta.get("section"),
                    "doc_id": meta.get("doc_id"),
                    "doc_version": meta.get("doc_version"),
                    "owner": meta.get("owner"),
                    "system_component": meta.get("system_component"),
                    "last_modified": meta.get("last_modified"),
                    "staleness_score": meta.get("staleness_score"),
                },
            )
            for item in items:
                embedding = item.get("embedding") or []
                conn.execute(
                    chunk_stmt,
                    {
                        "repo_id": repo_id,
                        "blob_sha": blob_sha,
                        "chunk_index": int(item["chunk_index"]),
                        "content": item["content"],
                        # A missing/empty embedding is stored as NULL.
                        "embedding": self._vector_literal(embedding) if embedding else None,
                        "section": item.get("section"),
                    },
                )
            conn.commit()

    def retrieve(self, rag_session_id: str, query_embedding: list[float], limit: int = 5) -> list[dict]:
        """Return up to ``limit`` ``{"path", "content"}`` rows nearest to the query.

        Rows of the session are ordered by pgvector's ``<=>`` distance between
        the stored embedding and ``query_embedding``.
        """
        query_literal = self._vector_literal(query_embedding)
        with get_engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT path, content
                    FROM rag_chunks
                    WHERE rag_session_id = :sid
                    ORDER BY embedding <=> CAST(:emb AS vector)
                    LIMIT :lim
                    """
                ),
                {"sid": rag_session_id, "emb": query_literal, "lim": limit},
            ).mappings().fetchall()
        return [dict(row) for row in rows]

    def fallback_chunks(self, rag_session_id: str, limit: int = 5) -> list[dict]:
        """Return up to ``limit`` ``{"path", "content"}`` rows, newest ``id`` first."""
        with get_engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT path, content
                    FROM rag_chunks
                    WHERE rag_session_id = :sid
                    ORDER BY id DESC
                    LIMIT :lim
                    """
                ),
                {"sid": rag_session_id, "lim": limit},
            ).mappings().fetchall()
        return [dict(row) for row in rows]

    def _insert_chunks(self, conn, rag_session_id: str, items: list[dict]) -> None:
        """Insert ``items`` into ``rag_chunks`` on ``conn`` without committing.

        Items that carry both a non-blank ``repo_id`` and ``blob_sha`` also get
        a ``rag_session_chunk_map`` row. The caller owns the transaction.
        """
        chunk_stmt = text(
            """
            INSERT INTO rag_chunks (
                rag_session_id, path, chunk_index, content, embedding, artifact_type, section,
                doc_id, doc_version, owner, system_component, last_modified, staleness_score,
                created_at, updated_at
            )
            VALUES (
                :sid, :path, :idx, :content, CAST(:emb AS vector), :artifact_type, :section,
                :doc_id, :doc_version, :owner, :system_component, :last_modified, :staleness_score,
                CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
            )
            """
        )
        map_stmt = text(
            """
            INSERT INTO rag_session_chunk_map (
                rag_session_id, repo_id, blob_sha, chunk_index, path
            )
            VALUES (
                :sid, :repo_id, :blob_sha, :chunk_index, :path
            )
            """
        )
        for item in items:
            embedding = item.get("embedding") or []
            conn.execute(
                chunk_stmt,
                {
                    "sid": rag_session_id,
                    "path": item["path"],
                    "idx": int(item["chunk_index"]),
                    "content": item["content"],
                    # A missing/empty embedding is stored as NULL.
                    "emb": self._vector_literal(embedding) if embedding else None,
                    "artifact_type": item.get("artifact_type"),
                    "section": item.get("section"),
                    "doc_id": item.get("doc_id"),
                    "doc_version": item.get("doc_version"),
                    "owner": item.get("owner"),
                    "system_component": item.get("system_component"),
                    "last_modified": item.get("last_modified"),
                    "staleness_score": item.get("staleness_score"),
                },
            )
            repo_id = str(item.get("repo_id") or "").strip()
            blob_sha = str(item.get("blob_sha") or "").strip()
            if not (repo_id and blob_sha):
                continue
            conn.execute(
                map_stmt,
                {
                    "sid": rag_session_id,
                    "repo_id": repo_id,
                    "blob_sha": blob_sha,
                    "chunk_index": int(item["chunk_index"]),
                    "path": item["path"],
                },
            )

    @staticmethod
    def _vector_literal(values) -> str:
        """Render an iterable of numbers as a ``[v1,v2,...]`` vector literal.

        An empty iterable yields ``"[]"``; callers that want NULL for a
        missing embedding must check for emptiness themselves.
        """
        return "[" + ",".join(str(value) for value in values) + "]"

    def _parse_vector(self, value: str) -> list[float]:
        """Parse a ``[v1,v2,...]`` text vector into floats.

        Blank input and ``"[]"`` yield an empty list. Surrounding brackets are
        removed only when both are present; empty components are skipped.

        Raises:
            ValueError: If a component cannot be converted to ``float``.
        """
        body = value.strip()
        if not body:
            return []
        if body.startswith("[") and body.endswith("]"):
            body = body[1:-1]
        if not body:
            return []
        return [float(part.strip()) for part in body.split(",") if part.strip()]