Новый RAG
This commit is contained in:
Binary file not shown.
189
app/modules/rag/persistence/cache_repository.py
Normal file
189
app/modules/rag/persistence/cache_repository.py
Normal file
@@ -0,0 +1,189 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.modules.rag.contracts import EvidenceLink, RagDocument, RagSource, RagSpan
|
||||
from app.modules.rag.contracts.enums import RagLayer
|
||||
|
||||
|
||||
class RagCacheRepository:
    """Persistence for the per-blob RAG cache (rag_blob_cache / rag_chunk_cache).

    Cached chunks are keyed by ``(repo_id, blob_sha)`` so that files whose
    content has not changed can be reused across indexing runs without
    re-chunking or re-embedding them.
    """

    def get_cached_documents(self, repo_id: str, blob_sha: str) -> list[RagDocument]:
        """Return all cached chunks for one blob as ``RagDocument``s, in chunk order.

        The pgvector ``embedding`` column is read back as text
        (``embedding::text``) and parsed into a list of floats.
        """
        with self._engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT layer, lang, path, title, content, metadata_json, links_json, span_start, span_end,
                           repo_id, commit_sha, embedding::text AS embedding_txt
                    FROM rag_chunk_cache
                    WHERE repo_id = :repo_id AND blob_sha = :blob_sha
                    ORDER BY chunk_index ASC
                    """
                ),
                {"repo_id": repo_id, "blob_sha": blob_sha},
            ).mappings().fetchall()
        docs: list[RagDocument] = []
        for row in rows:
            metadata = self._loads(row.get("metadata_json"))
            docs.append(
                RagDocument(
                    layer=str(row["layer"]),
                    lang=row.get("lang"),
                    source=RagSource(
                        repo_id=str(row["repo_id"]),
                        commit_sha=row.get("commit_sha"),
                        path=str(row["path"]),
                    ),
                    # Fall back to the path when no explicit title was cached.
                    title=str(row["title"] or row["path"]),
                    text=str(row["content"] or ""),
                    metadata=metadata,
                    links=[EvidenceLink(**item) for item in self._loads(row.get("links_json"), default=[])],
                    span=RagSpan(row.get("span_start"), row.get("span_end")),
                    embedding=self._parse_vector(str(row["embedding_txt"] or "")),
                )
            )
        return docs

    def cache_documents(self, repo_id: str, path: str, blob_sha: str, docs: list[RagDocument]) -> None:
        """Upsert blob-level metadata and replace all cached chunks for one blob.

        Blob-level fields are taken from the FIRST document's record; existing
        chunk rows for ``(repo_id, blob_sha)`` are deleted and re-inserted.
        Everything happens in one transaction committed at the end.
        No-op when ``docs`` is empty.
        """
        if not docs:
            return
        with self._engine().connect() as conn:
            first = docs[0].to_record()
            first_meta = first["metadata"]
            conn.execute(
                text(
                    """
                    INSERT INTO rag_blob_cache (
                        repo_id, blob_sha, path, artifact_type, section, doc_id, doc_version, owner,
                        system_component, last_modified, staleness_score, layer, lang, metadata_json
                    )
                    VALUES (
                        :repo_id, :blob_sha, :path, :artifact_type, :section, :doc_id, :doc_version, :owner,
                        :system_component, :last_modified, :staleness_score, :layer, :lang, :metadata_json
                    )
                    ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
                        artifact_type = EXCLUDED.artifact_type,
                        section = EXCLUDED.section,
                        doc_id = EXCLUDED.doc_id,
                        doc_version = EXCLUDED.doc_version,
                        owner = EXCLUDED.owner,
                        system_component = EXCLUDED.system_component,
                        last_modified = EXCLUDED.last_modified,
                        staleness_score = EXCLUDED.staleness_score,
                        layer = EXCLUDED.layer,
                        lang = EXCLUDED.lang,
                        metadata_json = EXCLUDED.metadata_json,
                        updated_at = CURRENT_TIMESTAMP
                    """
                ),
                {
                    "repo_id": repo_id,
                    "blob_sha": blob_sha,
                    "path": path,
                    "artifact_type": first_meta.get("artifact_type"),
                    # Some producers write "section_title" instead of "section".
                    "section": first_meta.get("section") or first_meta.get("section_title"),
                    "doc_id": first_meta.get("doc_id"),
                    "doc_version": first_meta.get("doc_version"),
                    "owner": first_meta.get("owner"),
                    "system_component": first_meta.get("system_component"),
                    "last_modified": first_meta.get("last_modified"),
                    "staleness_score": first_meta.get("staleness_score"),
                    "layer": first["layer"],
                    "lang": first["lang"],
                    "metadata_json": json.dumps(first_meta, ensure_ascii=True),
                },
            )
            conn.execute(
                text("DELETE FROM rag_chunk_cache WHERE repo_id = :repo_id AND blob_sha = :blob_sha"),
                {"repo_id": repo_id, "blob_sha": blob_sha},
            )
            for idx, doc in enumerate(docs):
                row = doc.to_record()
                metadata = row["metadata"]
                emb = row["embedding"] or []
                # pgvector text literal "[v1,v2,...]"; NULL when no embedding.
                emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_chunk_cache (
                            repo_id, blob_sha, chunk_index, content, embedding, section, layer, lang, path, title,
                            metadata_json, links_json, span_start, span_end, commit_sha
                        )
                        VALUES (
                            :repo_id, :blob_sha, :chunk_index, :content, CAST(:embedding AS vector), :section, :layer,
                            :lang, :path, :title, :metadata_json, :links_json, :span_start, :span_end, :commit_sha
                        )
                        """
                    ),
                    {
                        "repo_id": repo_id,
                        "blob_sha": blob_sha,
                        "chunk_index": idx,
                        "content": row["text"],
                        "embedding": emb_str,
                        "section": metadata.get("section") or metadata.get("section_title"),
                        "layer": row["layer"],
                        "lang": row["lang"],
                        "path": row["path"],
                        "title": row["title"],
                        "metadata_json": json.dumps(metadata, ensure_ascii=True),
                        "links_json": json.dumps(row["links"], ensure_ascii=True),
                        "span_start": row["span_start"],
                        "span_end": row["span_end"],
                        "commit_sha": row["commit_sha"],
                    },
                )
            conn.commit()

    def record_repo_cache(
        self,
        *,
        project_id: str,
        commit_sha: str | None,
        changed_files: list[str],
        summary: str,
    ) -> None:
        """Record a lightweight cache entry per changed file after a repo webhook.

        Builds one synthetic ``RagDocument`` per changed path (no real
        chunking or embedding) and caches it under a pseudo blob sha
        derived from ``commit_sha`` and the path (see ``_blob_sha``).
        """
        docs: list[RagDocument] = []
        for idx, path in enumerate(changed_files):
            docs.append(
                RagDocument(
                    layer=RagLayer.CODE_SOURCE_CHUNKS,
                    lang="python" if path.endswith(".py") else None,
                    source=RagSource(project_id, commit_sha, path),
                    title=path,
                    # Truncate the summary: this is a marker document, not real content.
                    text=f"repo_webhook:{path}:{summary[:300]}",
                    metadata={"chunk_index": idx, "artifact_type": "CODE", "section": "repo_webhook"},
                )
            )
        for doc in docs:
            blob_sha = self._blob_sha(commit_sha, doc.source.path)
            doc.metadata["blob_sha"] = blob_sha
            self.cache_documents(project_id, doc.source.path, blob_sha, [doc])

    def _blob_sha(self, commit_sha: str | None, path: str) -> str:
        """Derive a deterministic pseudo blob sha from commit sha + path.

        NOTE: this is not a real git blob sha; it only needs to be stable
        per (commit, path) pair for cache keying.
        """
        from hashlib import sha256

        return sha256(f"{commit_sha or 'no-commit'}:{path}".encode("utf-8")).hexdigest()

    def _engine(self):
        """Resolve the shared SQLAlchemy engine lazily (avoids an import cycle at module load)."""
        from app.modules.shared.db import get_engine

        return get_engine()

    def _loads(self, value, default=None):
        """``json.loads`` with a fallback for empty/NULL values.

        ``default`` defaults to a fresh dict per call (avoids a shared
        mutable default argument).
        """
        if default is None:
            default = {}
        if not value:
            return default
        return json.loads(str(value))

    def _parse_vector(self, value: str) -> list[float]:
        """Parse a pgvector text literal such as ``[0.1,0.2]`` into floats ([] when empty)."""
        text_value = value.strip()
        if not text_value:
            return []
        if text_value.startswith("[") and text_value.endswith("]"):
            text_value = text_value[1:-1]
        if not text_value:
            return []
        return [float(part.strip()) for part in text_value.split(",") if part.strip()]
|
||||
122
app/modules/rag/persistence/document_repository.py
Normal file
122
app/modules/rag/persistence/document_repository.py
Normal file
@@ -0,0 +1,122 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.modules.rag.contracts import RagDocument
|
||||
|
||||
|
||||
class RagDocumentRepository:
    """Write-side access to per-session chunks (rag_chunks) and the
    session-to-cache mapping (rag_session_chunk_map).

    All methods operate on a caller-supplied connection; the caller owns
    the transaction and is responsible for committing.
    """

    def replace_documents(self, conn, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Drop every chunk (and cache mapping) of the session, then insert ``docs``."""
        conn.execute(text("DELETE FROM rag_chunks WHERE rag_session_id = :sid"), {"sid": rag_session_id})
        conn.execute(text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid"), {"sid": rag_session_id})
        self.insert_documents(conn, rag_session_id, docs)

    def apply_document_changes(
        self,
        conn,
        rag_session_id: str,
        delete_paths: list[str],
        docs: list[RagDocument],
    ) -> None:
        """Incremental update: remove chunks for deleted paths, then re-insert changed paths.

        Paths present in ``docs`` are deleted first as well, so a re-indexed
        file does not accumulate duplicate chunks.
        """
        if delete_paths:
            conn.execute(
                text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                {"sid": rag_session_id, "paths": delete_paths},
            )
            conn.execute(
                text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                {"sid": rag_session_id, "paths": delete_paths},
            )
        if not docs:
            return
        paths = sorted({doc.source.path for doc in docs})
        conn.execute(
            text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
            {"sid": rag_session_id, "paths": paths},
        )
        conn.execute(
            text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
            {"sid": rag_session_id, "paths": paths},
        )
        self.insert_documents(conn, rag_session_id, docs)

    def insert_documents(self, conn, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Insert each document as one rag_chunks row.

        Column values come from two places: the document record itself
        (path/layer/title/spans/...) and its metadata dict (symbol, doc and
        policy fields). When metadata carries both ``repo_id`` and
        ``blob_sha``, a rag_session_chunk_map row is also written so the
        chunk can be traced back to the blob cache.
        """
        for doc in docs:
            row = doc.to_record()
            metadata = row["metadata"]
            links = row["links"]
            emb = row["embedding"] or []
            # pgvector text literal "[v1,v2,...]"; NULL when no embedding.
            emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
            conn.execute(
                text(
                    """
                    INSERT INTO rag_chunks (
                        rag_session_id, path, chunk_index, content, embedding, artifact_type, section, doc_id,
                        doc_version, owner, system_component, last_modified, staleness_score, created_at, updated_at,
                        rag_doc_id, layer, lang, repo_id, commit_sha, title, metadata_json, links_json, span_start,
                        span_end, symbol_id, qname, kind, framework, entrypoint_type, module_id, section_path, doc_kind
                    )
                    VALUES (
                        :sid, :path, :chunk_index, :content, CAST(:emb AS vector), :artifact_type, :section, :doc_id,
                        :doc_version, :owner, :system_component, :last_modified, :staleness_score, CURRENT_TIMESTAMP,
                        CURRENT_TIMESTAMP, :rag_doc_id, :layer, :lang, :repo_id, :commit_sha, :title, :metadata_json,
                        :links_json, :span_start, :span_end, :symbol_id, :qname, :kind, :framework, :entrypoint_type,
                        :module_id, :section_path, :doc_kind
                    )
                    """
                ),
                {
                    "sid": rag_session_id,
                    "path": row["path"],
                    "chunk_index": int(metadata.get("chunk_index", 0)),
                    "content": row["text"],
                    "emb": emb_str,
                    "artifact_type": metadata.get("artifact_type"),
                    # Some producers write "section_title" instead of "section".
                    "section": metadata.get("section") or metadata.get("section_title"),
                    "doc_id": metadata.get("doc_id"),
                    "doc_version": metadata.get("doc_version"),
                    "owner": metadata.get("owner"),
                    "system_component": metadata.get("system_component"),
                    "last_modified": metadata.get("last_modified"),
                    "staleness_score": metadata.get("staleness_score"),
                    "rag_doc_id": row["doc_id"],
                    "layer": row["layer"],
                    "lang": row["lang"],
                    "repo_id": row["repo_id"],
                    "commit_sha": row["commit_sha"],
                    "title": row["title"],
                    "metadata_json": json.dumps(metadata, ensure_ascii=True),
                    "links_json": json.dumps(links, ensure_ascii=True),
                    "span_start": row["span_start"],
                    "span_end": row["span_end"],
                    "symbol_id": metadata.get("symbol_id"),
                    "qname": metadata.get("qname"),
                    # Legacy producers used "type" for what is now "kind".
                    "kind": metadata.get("kind") or metadata.get("type"),
                    "framework": metadata.get("framework"),
                    "entrypoint_type": metadata.get("entry_type") or metadata.get("entrypoint_type"),
                    "module_id": metadata.get("module_id") or metadata.get("policy_id"),
                    "section_path": metadata.get("section_path"),
                    "doc_kind": metadata.get("doc_kind"),
                },
            )
            repo_id = str(row["repo_id"] or "").strip()
            blob_sha = str(metadata.get("blob_sha") or "").strip()
            # Only map chunks that can be tied back to a cached blob.
            if repo_id and blob_sha:
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_session_chunk_map (
                            rag_session_id, repo_id, blob_sha, chunk_index, path
                        ) VALUES (:sid, :repo_id, :blob_sha, :chunk_index, :path)
                        """
                    ),
                    {
                        "sid": rag_session_id,
                        "repo_id": repo_id,
                        "blob_sha": blob_sha,
                        "chunk_index": int(metadata.get("chunk_index", 0)),
                        "path": row["path"],
                    },
                )
|
||||
95
app/modules/rag/persistence/job_repository.py
Normal file
95
app/modules/rag/persistence/job_repository.py
Normal file
@@ -0,0 +1,95 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.modules.shared.db import get_engine
|
||||
|
||||
|
||||
@dataclass
class RagJobRow:
    """One row of rag_index_jobs, as returned by RagJobRepository.get_job."""

    index_job_id: str
    rag_session_id: str
    status: str
    indexed_files: int
    failed_files: int
    # Files served from the blob cache vs. files that had to be (re)indexed.
    cache_hit_files: int
    cache_miss_files: int
    # Error triple is NULL/None while the job has not failed.
    error_code: str | None
    error_desc: str | None
    error_module: str | None
|
||||
|
||||
|
||||
class RagJobRepository:
    """Create, update and fetch rows of the rag_index_jobs table."""

    def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None:
        """Insert a new indexing-job row with its initial status."""
        insert_stmt = text(
            """
            INSERT INTO rag_index_jobs (index_job_id, rag_session_id, status)
            VALUES (:jid, :sid, :status)
            """
        )
        bind = {"jid": index_job_id, "sid": rag_session_id, "status": status}
        with get_engine().connect() as connection:
            connection.execute(insert_stmt, bind)
            connection.commit()

    def update_job(
        self,
        index_job_id: str,
        *,
        status: str,
        indexed_files: int,
        failed_files: int,
        cache_hit_files: int = 0,
        cache_miss_files: int = 0,
        error_code: str | None = None,
        error_desc: str | None = None,
        error_module: str | None = None,
    ) -> None:
        """Overwrite the job's status, progress counters and error fields."""
        update_stmt = text(
            """
            UPDATE rag_index_jobs
            SET status = :status,
                indexed_files = :indexed,
                failed_files = :failed,
                cache_hit_files = :cache_hit_files,
                cache_miss_files = :cache_miss_files,
                error_code = :ecode,
                error_desc = :edesc,
                error_module = :emodule,
                updated_at = CURRENT_TIMESTAMP
            WHERE index_job_id = :jid
            """
        )
        bind = {
            "jid": index_job_id,
            "status": status,
            "indexed": indexed_files,
            "failed": failed_files,
            "cache_hit_files": cache_hit_files,
            "cache_miss_files": cache_miss_files,
            "ecode": error_code,
            "edesc": error_desc,
            "emodule": error_module,
        }
        with get_engine().connect() as connection:
            connection.execute(update_stmt, bind)
            connection.commit()

    def get_job(self, index_job_id: str) -> RagJobRow | None:
        """Fetch a single job by id; ``None`` when no such job exists."""
        select_stmt = text(
            """
            SELECT index_job_id, rag_session_id, status, indexed_files, failed_files,
                   cache_hit_files, cache_miss_files, error_code, error_desc, error_module
            FROM rag_index_jobs
            WHERE index_job_id = :jid
            """
        )
        with get_engine().connect() as connection:
            record = connection.execute(select_stmt, {"jid": index_job_id}).mappings().fetchone()
        if record is None:
            return None
        return RagJobRow(**dict(record))
|
||||
111
app/modules/rag/persistence/query_repository.py
Normal file
111
app/modules/rag/persistence/query_repository.py
Normal file
@@ -0,0 +1,111 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.modules.rag.retrieval.query_terms import extract_query_terms
|
||||
from app.modules.shared.db import get_engine
|
||||
|
||||
|
||||
class RagQueryRepository:
    """Read-side retrieval over rag_chunks: vector similarity combined with
    a lexical pre-ranking of terms extracted from the query text."""

    def retrieve(
        self,
        rag_session_id: str,
        query_embedding: list[float],
        *,
        query_text: str = "",
        limit: int = 5,
        layers: list[str] | None = None,
        path_prefixes: list[str] | None = None,
        prefer_non_tests: bool = False,
    ) -> list[dict]:
        """Hybrid retrieval for one session.

        Rows are ordered by (1) best lexical rank of any query term against
        qname/symbol_id/title/path/content, (2) a test-file penalty when
        ``prefer_non_tests`` is set, (3) a per-layer priority, and finally
        (4) pgvector distance of the embedding.

        SQL fragments are assembled from fixed templates with generated bind
        parameter names; user-supplied values travel only through bind
        parameters, never via string interpolation.
        """
        # pgvector text literal for the CAST(:emb AS vector) comparisons below.
        emb = "[" + ",".join(str(x) for x in query_embedding) + "]"
        filters = ["rag_session_id = :sid"]
        params: dict = {"sid": rag_session_id, "emb": emb, "lim": limit}
        if layers:
            filters.append("layer = ANY(:layers)")
            params["layers"] = layers
        if path_prefixes:
            or_filters = []
            for idx, prefix in enumerate(path_prefixes):
                key = f"path_{idx}"
                params[key] = f"{prefix}%"
                or_filters.append(f"path LIKE :{key}")
            filters.append("(" + " OR ".join(or_filters) + ")")
        term_filters = []
        terms = extract_query_terms(query_text)
        for idx, term in enumerate(terms):
            exact_key = f"term_exact_{idx}"
            prefix_key = f"term_prefix_{idx}"
            contains_key = f"term_contains_{idx}"
            params[exact_key] = term
            params[prefix_key] = f"{term}%"
            params[contains_key] = f"%{term}%"
            # Lower value = better match; 100 means the term was not found.
            # NOTE(review): columns are lowercased before comparing — assumes
            # extract_query_terms yields lowercased terms; confirm at its definition.
            term_filters.append(
                "CASE "
                f"WHEN lower(COALESCE(qname, '')) = :{exact_key} THEN 0 "
                f"WHEN lower(COALESCE(symbol_id, '')) = :{exact_key} THEN 1 "
                f"WHEN lower(COALESCE(title, '')) = :{exact_key} THEN 2 "
                f"WHEN lower(COALESCE(qname, '')) LIKE :{prefix_key} THEN 3 "
                f"WHEN lower(COALESCE(title, '')) LIKE :{prefix_key} THEN 4 "
                f"WHEN lower(COALESCE(path, '')) LIKE :{contains_key} THEN 5 "
                f"WHEN lower(COALESCE(content, '')) LIKE :{contains_key} THEN 6 "
                "ELSE 100 END"
            )
        # Best (lowest) rank over all query terms; constant 100 when no terms.
        lexical_sql = "LEAST(" + ", ".join(term_filters) + ")" if term_filters else "100"
        test_penalty_sql = (
            "CASE "
            "WHEN lower(path) LIKE 'tests/%' OR lower(path) LIKE '%/tests/%' OR lower(path) LIKE 'test_%' OR lower(path) LIKE '%/test_%' "
            "THEN 1 ELSE 0 END"
            if prefer_non_tests
            else "0"
        )
        # Layer priorities: the code layers (C*) and doc layers (D*) each get
        # their own 0..3 ordering; unknown layers sort last (10).
        layer_rank_sql = (
            "CASE "
            "WHEN layer = 'C3_ENTRYPOINTS' THEN 0 "
            "WHEN layer = 'C1_SYMBOL_CATALOG' THEN 1 "
            "WHEN layer = 'C2_DEPENDENCY_GRAPH' THEN 2 "
            "WHEN layer = 'C0_SOURCE_CHUNKS' THEN 3 "
            "WHEN layer = 'D1_MODULE_CATALOG' THEN 0 "
            "WHEN layer = 'D2_FACT_INDEX' THEN 1 "
            "WHEN layer = 'D3_SECTION_INDEX' THEN 2 "
            "WHEN layer = 'D4_POLICY_INDEX' THEN 3 "
            "ELSE 10 END"
        )
        sql = f"""
            SELECT path, content, layer, title, metadata_json, span_start, span_end,
                   {lexical_sql} AS lexical_rank,
                   {test_penalty_sql} AS test_penalty,
                   {layer_rank_sql} AS layer_rank,
                   (embedding <=> CAST(:emb AS vector)) AS distance
            FROM rag_chunks
            WHERE {' AND '.join(filters)}
            ORDER BY lexical_rank ASC, test_penalty ASC, layer_rank ASC, embedding <=> CAST(:emb AS vector)
            LIMIT :lim
        """
        with get_engine().connect() as conn:
            rows = conn.execute(text(sql), params).mappings().fetchall()
        return [self._row_to_dict(row) for row in rows]

    def fallback_chunks(self, rag_session_id: str, *, limit: int = 5, layers: list[str] | None = None) -> list[dict]:
        """Most recently inserted chunks of a session — used when no embedding match is available."""
        filters = ["rag_session_id = :sid"]
        params: dict = {"sid": rag_session_id, "lim": limit}
        if layers:
            filters.append("layer = ANY(:layers)")
            params["layers"] = layers
        sql = f"""
            SELECT path, content, layer, title, metadata_json, span_start, span_end
            FROM rag_chunks
            WHERE {' AND '.join(filters)}
            ORDER BY id DESC
            LIMIT :lim
        """
        with get_engine().connect() as conn:
            rows = conn.execute(text(sql), params).mappings().fetchall()
        return [self._row_to_dict(row) for row in rows]

    def _row_to_dict(self, row) -> dict:
        """Convert a result row to a plain dict, decoding metadata_json into a 'metadata' key."""
        data = dict(row)
        data["metadata"] = json.loads(str(data.pop("metadata_json") or "{}"))
        return data
|
||||
82
app/modules/rag/persistence/repository.py
Normal file
82
app/modules/rag/persistence/repository.py
Normal file
@@ -0,0 +1,82 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.contracts import RagDocument
|
||||
from app.modules.rag.persistence.cache_repository import RagCacheRepository
|
||||
from app.modules.rag.persistence.document_repository import RagDocumentRepository
|
||||
from app.modules.rag.persistence.job_repository import RagJobRepository, RagJobRow
|
||||
from app.modules.rag.persistence.query_repository import RagQueryRepository
|
||||
from app.modules.rag.persistence.schema_repository import RagSchemaRepository
|
||||
from app.modules.rag.persistence.session_repository import RagSessionRepository
|
||||
from app.modules.shared.db import get_engine
|
||||
|
||||
|
||||
class RagRepository:
    """Facade over the RAG persistence sub-repositories (schema, sessions,
    jobs, documents, cache, queries) so callers depend on one entry point."""

    def __init__(self) -> None:
        self._schema = RagSchemaRepository()
        self._sessions = RagSessionRepository()
        self._jobs = RagJobRepository()
        self._documents = RagDocumentRepository()
        self._cache = RagCacheRepository()
        self._query = RagQueryRepository()

    def ensure_tables(self) -> None:
        """Create/upgrade all RAG tables, columns and indexes (idempotent)."""
        self._schema.ensure_tables()

    def upsert_session(self, rag_session_id: str, project_id: str) -> None:
        """Create or update the session row pointing at ``project_id``."""
        self._sessions.upsert_session(rag_session_id, project_id)

    def session_exists(self, rag_session_id: str) -> bool:
        """True when the session row exists."""
        return self._sessions.session_exists(rag_session_id)

    def get_session(self, rag_session_id: str) -> dict | None:
        """Return the session row as a dict, or ``None`` when absent."""
        return self._sessions.get_session(rag_session_id)

    def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None:
        """Insert a new indexing-job row."""
        self._jobs.create_job(index_job_id, rag_session_id, status)

    def update_job(self, index_job_id: str, **kwargs) -> None:
        """Update job status/counters; see RagJobRepository.update_job for keywords."""
        self._jobs.update_job(index_job_id, **kwargs)

    def get_job(self, index_job_id: str) -> RagJobRow | None:
        """Fetch one job row, or ``None`` when it does not exist."""
        return self._jobs.get_job(index_job_id)

    def replace_documents(self, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Replace all chunks of a session in one committed transaction."""
        with get_engine().connect() as conn:
            self._documents.replace_documents(conn, rag_session_id, docs)
            conn.commit()

    def apply_document_changes(self, rag_session_id: str, delete_paths: list[str], docs: list[RagDocument]) -> None:
        """Apply an incremental delete/re-insert in one committed transaction."""
        with get_engine().connect() as conn:
            self._documents.apply_document_changes(conn, rag_session_id, delete_paths, docs)
            conn.commit()

    def get_cached_documents(self, repo_id: str, blob_sha: str) -> list[RagDocument]:
        """Load cached chunks for one blob from the blob cache."""
        return self._cache.get_cached_documents(repo_id, blob_sha)

    def cache_documents(self, repo_id: str, path: str, blob_sha: str, docs: list[RagDocument]) -> None:
        """Write chunks for one blob into the blob cache."""
        self._cache.cache_documents(repo_id, path, blob_sha, docs)

    def record_repo_cache(self, **kwargs) -> None:
        """Record webhook-driven cache entries; see RagCacheRepository.record_repo_cache."""
        self._cache.record_repo_cache(**kwargs)

    def retrieve(
        self,
        rag_session_id: str,
        query_embedding: list[float],
        *,
        query_text: str = "",
        limit: int = 5,
        layers: list[str] | None = None,
        path_prefixes: list[str] | None = None,
        prefer_non_tests: bool = False,
    ) -> list[dict]:
        """Hybrid (lexical + vector) retrieval for a session.

        Fix: the underlying RagQueryRepository.retrieve supports filtering by
        ``path_prefixes``, but this facade previously dropped that capability.
        It is now exposed with a backward-compatible default of ``None``
        (no path filtering), so existing callers are unaffected.
        """
        return self._query.retrieve(
            rag_session_id,
            query_embedding,
            query_text=query_text,
            limit=limit,
            layers=layers,
            path_prefixes=path_prefixes,
            prefer_non_tests=prefer_non_tests,
        )

    def fallback_chunks(self, rag_session_id: str, limit: int = 5, layers: list[str] | None = None) -> list[dict]:
        """Most recent chunks of a session, used when retrieval has no query/embedding."""
        return self._query.fallback_chunks(rag_session_id, limit=limit, layers=layers)
|
||||
179
app/modules/rag/persistence/schema_repository.py
Normal file
179
app/modules/rag/persistence/schema_repository.py
Normal file
@@ -0,0 +1,179 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.modules.shared.db import get_engine
|
||||
|
||||
|
||||
class RagSchemaRepository:
    """Idempotent DDL for all RAG tables — safe to run on every startup.

    Base tables use ``CREATE TABLE IF NOT EXISTS``; later schema additions
    are applied as ``ADD COLUMN IF NOT EXISTS`` statements so that older
    databases are upgraded in place without a migration framework.
    """

    def ensure_tables(self) -> None:
        """Create the pgvector extension, all tables, extra columns and indexes, then commit."""
        engine = get_engine()
        with engine.connect() as conn:
            # Required for the "vector" column type and the <=> distance operator.
            conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_sessions (
                        rag_session_id VARCHAR(64) PRIMARY KEY,
                        project_id VARCHAR(512) NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_index_jobs (
                        index_job_id VARCHAR(64) PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        status VARCHAR(16) NOT NULL,
                        indexed_files INTEGER NOT NULL DEFAULT 0,
                        failed_files INTEGER NOT NULL DEFAULT 0,
                        cache_hit_files INTEGER NOT NULL DEFAULT 0,
                        cache_miss_files INTEGER NOT NULL DEFAULT 0,
                        error_code VARCHAR(128) NULL,
                        error_desc TEXT NULL,
                        error_module VARCHAR(64) NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            # Base shape only — the many retrieval columns are added in
            # _ensure_columns so pre-existing tables are upgraded too.
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_chunks (
                        id BIGSERIAL PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        path TEXT NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        content TEXT NOT NULL,
                        embedding vector NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_blob_cache (
                        id BIGSERIAL PRIMARY KEY,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        path TEXT NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        CONSTRAINT uq_rag_blob_cache UNIQUE (repo_id, blob_sha, path)
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_chunk_cache (
                        id BIGSERIAL PRIMARY KEY,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        content TEXT NOT NULL,
                        embedding vector NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        CONSTRAINT uq_rag_chunk_cache UNIQUE (repo_id, blob_sha, chunk_index)
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_session_chunk_map (
                        id BIGSERIAL PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        path TEXT NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            self._ensure_columns(conn)
            self._ensure_indexes(conn)
            conn.commit()

    def _ensure_columns(self, conn) -> None:
        """Apply additive column upgrades (ADD COLUMN IF NOT EXISTS) to all RAG tables."""
        for statement in (
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_id TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_version TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS owner TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS rag_doc_id VARCHAR(128) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS repo_id VARCHAR(512) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS commit_sha VARCHAR(128) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS title TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS links_json TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS span_start INTEGER NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS span_end INTEGER NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS symbol_id TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS qname TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS kind TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS framework TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS entrypoint_type TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS module_id TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section_path TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_kind TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS section TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS doc_id TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS doc_version TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS owner TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS system_component TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS section TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS path TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS title TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS links_json TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS span_start INTEGER NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS span_end INTEGER NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS commit_sha VARCHAR(128) NULL",
            "ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_hit_files INTEGER NOT NULL DEFAULT 0",
            "ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_miss_files INTEGER NOT NULL DEFAULT 0",
        ):
            conn.execute(text(statement))

    def _ensure_indexes(self, conn) -> None:
        """Create all lookup indexes used by retrieval and the cache (IF NOT EXISTS)."""
        for statement in (
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks (rag_session_id)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_layer ON rag_chunks (rag_session_id, layer)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_layer_path ON rag_chunks (rag_session_id, layer, path)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_qname ON rag_chunks (qname)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_symbol_id ON rag_chunks (symbol_id)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_module_id ON rag_chunks (module_id)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_doc_kind ON rag_chunks (doc_kind)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_entrypoint ON rag_chunks (entrypoint_type, framework)",
            "CREATE INDEX IF NOT EXISTS idx_rag_blob_cache_repo_blob ON rag_blob_cache (repo_id, blob_sha)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunk_cache_repo_blob ON rag_chunk_cache (repo_id, blob_sha, chunk_index)",
            "CREATE INDEX IF NOT EXISTS idx_rag_session_chunk_map_session ON rag_session_chunk_map (rag_session_id, created_at DESC)",
        ):
            conn.execute(text(statement))
|
||||
37
app/modules/rag/persistence/session_repository.py
Normal file
37
app/modules/rag/persistence/session_repository.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.modules.shared.db import get_engine
|
||||
|
||||
|
||||
class RagSessionRepository:
    """Lookup and upsert helpers for the rag_sessions table."""

    def upsert_session(self, rag_session_id: str, project_id: str) -> None:
        """Create the session row, or repoint an existing one at ``project_id``."""
        stmt = text(
            """
            INSERT INTO rag_sessions (rag_session_id, project_id)
            VALUES (:sid, :pid)
            ON CONFLICT (rag_session_id) DO UPDATE SET project_id = EXCLUDED.project_id
            """
        )
        with get_engine().connect() as connection:
            connection.execute(stmt, {"sid": rag_session_id, "pid": project_id})
            connection.commit()

    def session_exists(self, rag_session_id: str) -> bool:
        """Return True when a session row with this id exists."""
        with get_engine().connect() as connection:
            found = connection.execute(
                text("SELECT 1 FROM rag_sessions WHERE rag_session_id = :sid"),
                {"sid": rag_session_id},
            ).fetchone()
        return found is not None

    def get_session(self, rag_session_id: str) -> dict | None:
        """Return ``{rag_session_id, project_id}`` for the session, or ``None``."""
        with get_engine().connect() as connection:
            mapping = connection.execute(
                text("SELECT rag_session_id, project_id FROM rag_sessions WHERE rag_session_id = :sid"),
                {"sid": rag_session_id},
            ).mappings().fetchone()
        if mapping is None:
            return None
        return dict(mapping)
|
||||
Reference in New Issue
Block a user