from __future__ import annotations import json from sqlalchemy import text from app.modules.rag.contracts import EvidenceLink, RagDocument, RagSource, RagSpan from app.modules.rag.contracts.enums import RagLayer class RagCacheRepository: def get_cached_documents(self, repo_id: str, blob_sha: str) -> list[RagDocument]: with self._engine().connect() as conn: rows = conn.execute( text( """ SELECT layer, lang, path, title, content, metadata_json, links_json, span_start, span_end, repo_id, commit_sha, embedding::text AS embedding_txt FROM rag_chunk_cache WHERE repo_id = :repo_id AND blob_sha = :blob_sha ORDER BY chunk_index ASC """ ), {"repo_id": repo_id, "blob_sha": blob_sha}, ).mappings().fetchall() docs: list[RagDocument] = [] for row in rows: metadata = self._loads(row.get("metadata_json")) docs.append( RagDocument( layer=str(row["layer"]), lang=row.get("lang"), source=RagSource( repo_id=str(row["repo_id"]), commit_sha=row.get("commit_sha"), path=str(row["path"]), ), title=str(row["title"] or row["path"]), text=str(row["content"] or ""), metadata=metadata, links=[EvidenceLink(**item) for item in self._loads(row.get("links_json"), default=[])], span=RagSpan(row.get("span_start"), row.get("span_end")), embedding=self._parse_vector(str(row["embedding_txt"] or "")), ) ) return docs def cache_documents(self, repo_id: str, path: str, blob_sha: str, docs: list[RagDocument]) -> None: if not docs: return with self._engine().connect() as conn: first = docs[0].to_record() first_meta = first["metadata"] conn.execute( text( """ INSERT INTO rag_blob_cache ( repo_id, blob_sha, path, artifact_type, section, doc_id, doc_version, owner, system_component, last_modified, staleness_score, layer, lang, metadata_json ) VALUES ( :repo_id, :blob_sha, :path, :artifact_type, :section, :doc_id, :doc_version, :owner, :system_component, :last_modified, :staleness_score, :layer, :lang, :metadata_json ) ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET artifact_type = EXCLUDED.artifact_type, section = EXCLUDED.section, doc_id = EXCLUDED.doc_id, doc_version = EXCLUDED.doc_version, owner = EXCLUDED.owner, system_component = EXCLUDED.system_component, last_modified = EXCLUDED.last_modified, staleness_score = EXCLUDED.staleness_score, layer = EXCLUDED.layer, lang = EXCLUDED.lang, metadata_json = EXCLUDED.metadata_json, updated_at = CURRENT_TIMESTAMP """ ), { "repo_id": repo_id, "blob_sha": blob_sha, "path": path, "artifact_type": first_meta.get("artifact_type"), "section": first_meta.get("section") or first_meta.get("section_title"), "doc_id": first_meta.get("doc_id"), "doc_version": first_meta.get("doc_version"), "owner": first_meta.get("owner"), "system_component": first_meta.get("system_component"), "last_modified": first_meta.get("last_modified"), "staleness_score": first_meta.get("staleness_score"), "layer": first["layer"], "lang": first["lang"], "metadata_json": json.dumps(first_meta, ensure_ascii=True), }, ) conn.execute( text("DELETE FROM rag_chunk_cache WHERE repo_id = :repo_id AND blob_sha = :blob_sha"), {"repo_id": repo_id, "blob_sha": blob_sha}, ) for idx, doc in enumerate(docs): row = doc.to_record() metadata = row["metadata"] emb = row["embedding"] or [] emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None conn.execute( text( """ INSERT INTO rag_chunk_cache ( repo_id, blob_sha, chunk_index, content, embedding, section, layer, lang, path, title, metadata_json, links_json, span_start, span_end, commit_sha ) VALUES ( :repo_id, :blob_sha, :chunk_index, :content, CAST(:embedding AS vector), :section, :layer, :lang, :path, :title, :metadata_json, :links_json, :span_start, :span_end, :commit_sha ) """ ), { "repo_id": repo_id, "blob_sha": blob_sha, "chunk_index": idx, "content": row["text"], "embedding": emb_str, "section": metadata.get("section") or metadata.get("section_title"), "layer": row["layer"], "lang": row["lang"], "path": row["path"], "title": row["title"], "metadata_json": json.dumps(metadata, ensure_ascii=True), "links_json": json.dumps(row["links"], ensure_ascii=True), "span_start": row["span_start"], "span_end": row["span_end"], "commit_sha": row["commit_sha"], }, ) conn.commit() def record_repo_cache( self, *, project_id: str, commit_sha: str | None, changed_files: list[str], summary: str, ) -> None: docs: list[RagDocument] = [] for idx, path in enumerate(changed_files): docs.append( RagDocument( layer=RagLayer.CODE_SOURCE_CHUNKS, lang="python" if path.endswith(".py") else None, source=RagSource(project_id, commit_sha, path), title=path, text=f"repo_webhook:{path}:{summary[:300]}", metadata={"chunk_index": idx, "artifact_type": "CODE", "section": "repo_webhook"}, ) ) for doc in docs: blob_sha = self._blob_sha(commit_sha, doc.source.path) doc.metadata["blob_sha"] = blob_sha self.cache_documents(project_id, doc.source.path, blob_sha, [doc]) def _blob_sha(self, commit_sha: str | None, path: str) -> str: from hashlib import sha256 return sha256(f"{commit_sha or 'no-commit'}:{path}".encode("utf-8")).hexdigest() def _engine(self): from app.modules.shared.db import get_engine return get_engine() def _loads(self, value, default=None): if default is None: default = {} if not value: return default return json.loads(str(value)) def _parse_vector(self, value: str) -> list[float]: text_value = value.strip() if not text_value: return [] if text_value.startswith("[") and text_value.endswith("]"): text_value = text_value[1:-1] if not text_value: return [] return [float(part.strip()) for part in text_value.split(",") if part.strip()]