Фиксация изменений

This commit is contained in:
2026-03-05 11:03:17 +03:00
parent 1ef0b4d68c
commit 417b8b6f72
261 changed files with 8215 additions and 332 deletions

View File

@@ -46,7 +46,6 @@ class RagDocumentRepository:
for doc in docs:
row = doc.to_record()
metadata = row["metadata"]
links = row["links"]
emb = row["embedding"] or []
emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
conn.execute(
@@ -55,15 +54,15 @@ class RagDocumentRepository:
INSERT INTO rag_chunks (
rag_session_id, path, chunk_index, content, embedding, artifact_type, section, doc_id,
doc_version, owner, system_component, last_modified, staleness_score, created_at, updated_at,
rag_doc_id, layer, lang, repo_id, commit_sha, title, metadata_json, links_json, span_start,
span_end, symbol_id, qname, kind, framework, entrypoint_type, module_id, section_path, doc_kind
layer, lang, repo_id, commit_sha, title, metadata_json, span_start, span_end, symbol_id,
qname, kind, framework, entrypoint_type, module_id, section_path, doc_kind
)
VALUES (
:sid, :path, :chunk_index, :content, CAST(:emb AS vector), :artifact_type, :section, :doc_id,
:doc_version, :owner, :system_component, :last_modified, :staleness_score, CURRENT_TIMESTAMP,
CURRENT_TIMESTAMP, :rag_doc_id, :layer, :lang, :repo_id, :commit_sha, :title, :metadata_json,
:links_json, :span_start, :span_end, :symbol_id, :qname, :kind, :framework, :entrypoint_type,
:module_id, :section_path, :doc_kind
CURRENT_TIMESTAMP, :layer, :lang, :repo_id, :commit_sha, :title, :metadata_json,
:span_start, :span_end, :symbol_id, :qname, :kind, :framework, :entrypoint_type, :module_id,
:section_path, :doc_kind
)
"""
),
@@ -81,14 +80,12 @@ class RagDocumentRepository:
"system_component": metadata.get("system_component"),
"last_modified": metadata.get("last_modified"),
"staleness_score": metadata.get("staleness_score"),
"rag_doc_id": row["doc_id"],
"layer": row["layer"],
"lang": row["lang"],
"repo_id": row["repo_id"],
"commit_sha": row["commit_sha"],
"title": row["title"],
"metadata_json": json.dumps(metadata, ensure_ascii=True),
"links_json": json.dumps(links, ensure_ascii=True),
"span_start": row["span_start"],
"span_end": row["span_end"],
"symbol_id": metadata.get("symbol_id"),

View File

@@ -4,11 +4,14 @@ import json
from sqlalchemy import text
from app.modules.rag.retrieval.query_terms import extract_query_terms
from app.modules.rag.persistence.retrieval_statement_builder import RetrievalStatementBuilder
from app.modules.shared.db import get_engine
class RagQueryRepository:
def __init__(self) -> None:
    # Single shared builder that assembles (sql, params) pairs for every
    # retrieval query issued by this repository.
    self._builder = RetrievalStatementBuilder()
def retrieve(
self,
rag_session_id: str,
@@ -18,89 +21,47 @@ class RagQueryRepository:
limit: int = 5,
layers: list[str] | None = None,
path_prefixes: list[str] | None = None,
exclude_path_prefixes: list[str] | None = None,
exclude_like_patterns: list[str] | None = None,
prefer_non_tests: bool = False,
) -> list[dict]:
emb = "[" + ",".join(str(x) for x in query_embedding) + "]"
filters = ["rag_session_id = :sid"]
params: dict = {"sid": rag_session_id, "emb": emb, "lim": limit}
if layers:
filters.append("layer = ANY(:layers)")
params["layers"] = layers
if path_prefixes:
or_filters = []
for idx, prefix in enumerate(path_prefixes):
key = f"path_{idx}"
params[key] = f"{prefix}%"
or_filters.append(f"path LIKE :{key}")
filters.append("(" + " OR ".join(or_filters) + ")")
term_filters = []
terms = extract_query_terms(query_text)
for idx, term in enumerate(terms):
exact_key = f"term_exact_{idx}"
prefix_key = f"term_prefix_{idx}"
contains_key = f"term_contains_{idx}"
params[exact_key] = term
params[prefix_key] = f"{term}%"
params[contains_key] = f"%{term}%"
term_filters.append(
"CASE "
f"WHEN lower(COALESCE(qname, '')) = :{exact_key} THEN 0 "
f"WHEN lower(COALESCE(symbol_id, '')) = :{exact_key} THEN 1 "
f"WHEN lower(COALESCE(title, '')) = :{exact_key} THEN 2 "
f"WHEN lower(COALESCE(qname, '')) LIKE :{prefix_key} THEN 3 "
f"WHEN lower(COALESCE(title, '')) LIKE :{prefix_key} THEN 4 "
f"WHEN lower(COALESCE(path, '')) LIKE :{contains_key} THEN 5 "
f"WHEN lower(COALESCE(content, '')) LIKE :{contains_key} THEN 6 "
"ELSE 100 END"
)
lexical_sql = "LEAST(" + ", ".join(term_filters) + ")" if term_filters else "100"
test_penalty_sql = (
"CASE "
"WHEN lower(path) LIKE 'tests/%' OR lower(path) LIKE '%/tests/%' OR lower(path) LIKE 'test_%' OR lower(path) LIKE '%/test_%' "
"THEN 1 ELSE 0 END"
if prefer_non_tests
else "0"
sql, params = self._builder.build_retrieve(
rag_session_id,
query_embedding,
query_text=query_text,
limit=limit,
layers=layers,
path_prefixes=path_prefixes,
exclude_path_prefixes=exclude_path_prefixes,
exclude_like_patterns=exclude_like_patterns,
prefer_non_tests=prefer_non_tests,
)
layer_rank_sql = (
"CASE "
"WHEN layer = 'C3_ENTRYPOINTS' THEN 0 "
"WHEN layer = 'C1_SYMBOL_CATALOG' THEN 1 "
"WHEN layer = 'C2_DEPENDENCY_GRAPH' THEN 2 "
"WHEN layer = 'C0_SOURCE_CHUNKS' THEN 3 "
"WHEN layer = 'D1_MODULE_CATALOG' THEN 0 "
"WHEN layer = 'D2_FACT_INDEX' THEN 1 "
"WHEN layer = 'D3_SECTION_INDEX' THEN 2 "
"WHEN layer = 'D4_POLICY_INDEX' THEN 3 "
"ELSE 10 END"
)
sql = f"""
SELECT path, content, layer, title, metadata_json, span_start, span_end,
{lexical_sql} AS lexical_rank,
{test_penalty_sql} AS test_penalty,
{layer_rank_sql} AS layer_rank,
(embedding <=> CAST(:emb AS vector)) AS distance
FROM rag_chunks
WHERE {' AND '.join(filters)}
ORDER BY lexical_rank ASC, test_penalty ASC, layer_rank ASC, embedding <=> CAST(:emb AS vector)
LIMIT :lim
"""
with get_engine().connect() as conn:
rows = conn.execute(text(sql), params).mappings().fetchall()
return [self._row_to_dict(row) for row in rows]
def fallback_chunks(self, rag_session_id: str, *, limit: int = 5, layers: list[str] | None = None) -> list[dict]:
filters = ["rag_session_id = :sid"]
params: dict = {"sid": rag_session_id, "lim": limit}
if layers:
filters.append("layer = ANY(:layers)")
params["layers"] = layers
sql = f"""
SELECT path, content, layer, title, metadata_json, span_start, span_end
FROM rag_chunks
WHERE {' AND '.join(filters)}
ORDER BY id DESC
LIMIT :lim
"""
def retrieve_lexical_code(
    self,
    rag_session_id: str,
    *,
    query_text: str,
    limit: int = 5,
    path_prefixes: list[str] | None = None,
    exclude_path_prefixes: list[str] | None = None,
    exclude_like_patterns: list[str] | None = None,
    prefer_non_tests: bool = False,
) -> list[dict]:
    """Run a purely lexical (non-vector) search over chunks for a session.

    SQL construction is delegated to RetrievalStatementBuilder.  The builder
    returns ``(None, {})`` when no usable terms can be extracted from
    ``query_text``; in that case an empty list is returned without opening a
    database connection.

    Args:
        rag_session_id: Session whose chunks are searched.
        query_text: Free-form query; terms are extracted by the builder.
        limit: Maximum number of rows returned.
        path_prefixes: Restrict results to paths with these prefixes.
        exclude_path_prefixes: Drop rows whose path starts with any of these.
        exclude_like_patterns: Drop rows whose path matches any LIKE pattern.
        prefer_non_tests: Rank excluded/test-like paths after other matches.

    Returns:
        A list of row dicts produced by ``_row_to_dict``.
    """
    sql, params = self._builder.build_lexical_code(
        rag_session_id,
        query_text=query_text,
        limit=limit,
        path_prefixes=path_prefixes,
        exclude_path_prefixes=exclude_path_prefixes,
        exclude_like_patterns=exclude_like_patterns,
        prefer_non_tests=prefer_non_tests,
    )
    if sql is None:
        # No extractable terms -> nothing to match lexically.
        return []
    with get_engine().connect() as conn:
        rows = conn.execute(text(sql), params).mappings().fetchall()
    return [self._row_to_dict(row) for row in rows]

View File

@@ -67,6 +67,9 @@ class RagRepository:
query_text: str = "",
limit: int = 5,
layers: list[str] | None = None,
path_prefixes: list[str] | None = None,
exclude_path_prefixes: list[str] | None = None,
exclude_like_patterns: list[str] | None = None,
prefer_non_tests: bool = False,
) -> list[dict]:
return self._query.retrieve(
@@ -75,8 +78,29 @@ class RagRepository:
query_text=query_text,
limit=limit,
layers=layers,
path_prefixes=path_prefixes,
exclude_path_prefixes=exclude_path_prefixes,
exclude_like_patterns=exclude_like_patterns,
prefer_non_tests=prefer_non_tests,
)
def fallback_chunks(self, rag_session_id: str, limit: int = 5, layers: list[str] | None = None) -> list[dict]:
    """Facade pass-through to the query repository's fallback chunk fetch."""
    return self._query.fallback_chunks(rag_session_id, limit=limit, layers=layers)
def retrieve_lexical_code(
    self,
    rag_session_id: str,
    query_text: str,
    *,
    limit: int = 5,
    path_prefixes: list[str] | None = None,
    exclude_path_prefixes: list[str] | None = None,
    exclude_like_patterns: list[str] | None = None,
    prefer_non_tests: bool = False,
) -> list[dict]:
    """Facade pass-through for lexical code retrieval.

    Forwards every argument unchanged to the query repository.  Note that
    ``query_text`` is positional here but forwarded as a keyword, matching
    the keyword-only signature of the underlying method.
    """
    return self._query.retrieve_lexical_code(
        rag_session_id,
        query_text=query_text,
        limit=limit,
        path_prefixes=path_prefixes,
        exclude_path_prefixes=exclude_path_prefixes,
        exclude_like_patterns=exclude_like_patterns,
        prefer_non_tests=prefer_non_tests,
    )

View File

@@ -0,0 +1,201 @@
from __future__ import annotations
from app.modules.rag.retrieval.query_terms import extract_query_terms
_LIKE_ESCAPE_SQL = " ESCAPE E'\\\\'"
class RetrievalStatementBuilder:
    """Assembles parameterized SQL statements for querying ``rag_chunks``.

    Each ``build_*`` method returns a ``(sql, params)`` pair intended to be
    executed through SQLAlchemy's ``text()``.  All caller-supplied values
    travel as bind parameters; any value embedded into a LIKE pattern is
    escaped first (see ``_escape_like_value``) so literal ``%``, ``_`` and
    ``\\`` characters cannot act as wildcards.
    """

    def build_retrieve(
        self,
        rag_session_id: str,
        query_embedding: list[float],
        *,
        query_text: str = "",
        limit: int = 5,
        layers: list[str] | None = None,
        path_prefixes: list[str] | None = None,
        exclude_path_prefixes: list[str] | None = None,
        exclude_like_patterns: list[str] | None = None,
        prefer_non_tests: bool = False,
    ) -> tuple[str, dict]:
        """Build the hybrid (lexical + vector) retrieval statement.

        Rows are ordered by lexical term rank, then a test-path penalty,
        then a fixed per-layer priority, and finally cosine distance of
        ``embedding`` to ``query_embedding`` (pgvector ``<=>`` operator).

        Returns:
            ``(sql, params)`` ready for ``conn.execute(text(sql), params)``.
        """
        # pgvector accepts the textual "[x1,x2,...]" form via CAST(:emb AS vector).
        emb = "[" + ",".join(str(x) for x in query_embedding) + "]"
        filters = ["rag_session_id = :sid"]
        params: dict = {"sid": rag_session_id, "emb": emb, "lim": limit}
        self._append_prefix_group(filters, params, "path", path_prefixes)
        self._append_prefix_group(filters, params, "exclude_prefix", exclude_path_prefixes, negate=True)
        self._append_like_group(filters, params, "exclude_like", exclude_like_patterns, negate=True)
        if layers:
            filters.append("layer = ANY(:layers)")
            params["layers"] = layers
        lexical_sql = self._lexical_rank_sql(query_text, params)
        test_penalty_sql = self._test_penalty_sql(
            prefer_non_tests,
            params,
            base_key="penalty",
            path_prefixes=exclude_path_prefixes,
            like_patterns=exclude_like_patterns,
        )
        # Lower numbers sort first; the code layers (C*) and the doc layers
        # (D*) each get their own priority ladder, unknown layers sink last.
        layer_rank_sql = (
            "CASE "
            "WHEN layer = 'C3_ENTRYPOINTS' THEN 0 "
            "WHEN layer = 'C1_SYMBOL_CATALOG' THEN 1 "
            "WHEN layer = 'C2_DEPENDENCY_GRAPH' THEN 2 "
            "WHEN layer = 'C0_SOURCE_CHUNKS' THEN 3 "
            "WHEN layer = 'D1_MODULE_CATALOG' THEN 0 "
            "WHEN layer = 'D2_FACT_INDEX' THEN 1 "
            "WHEN layer = 'D3_SECTION_INDEX' THEN 2 "
            "WHEN layer = 'D4_POLICY_INDEX' THEN 3 "
            "ELSE 10 END"
        )
        sql = f"""
            SELECT path, content, layer, title, metadata_json, span_start, span_end,
                   {lexical_sql} AS lexical_rank,
                   {test_penalty_sql} AS test_penalty,
                   {layer_rank_sql} AS layer_rank,
                   (embedding <=> CAST(:emb AS vector)) AS distance
            FROM rag_chunks
            WHERE {' AND '.join(filters)}
            ORDER BY lexical_rank ASC, test_penalty ASC, layer_rank ASC, embedding <=> CAST(:emb AS vector)
            LIMIT :lim
        """
        return sql, params

    def build_lexical_code(
        self,
        rag_session_id: str,
        *,
        query_text: str,
        limit: int = 5,
        path_prefixes: list[str] | None = None,
        exclude_path_prefixes: list[str] | None = None,
        exclude_like_patterns: list[str] | None = None,
        prefer_non_tests: bool = False,
    ) -> tuple[str | None, dict]:
        """Build a purely lexical statement over C0_SOURCE_CHUNKS rows.

        Returns ``(None, {})`` when ``query_text`` yields no usable terms so
        the caller can skip the query entirely.
        """
        terms = extract_query_terms(query_text)
        if not terms:
            return None, {}
        filters = ["rag_session_id = :sid", "layer = 'C0_SOURCE_CHUNKS'"]
        params: dict = {"sid": rag_session_id, "lim": limit}
        self._append_prefix_group(filters, params, "path", path_prefixes)
        self._append_prefix_group(filters, params, "exclude_prefix", exclude_path_prefixes, negate=True)
        self._append_like_group(filters, params, "exclude_like", exclude_like_patterns, negate=True)
        lexical_filters: list[str] = []
        lexical_ranks: list[str] = []
        for idx, term in enumerate(terms):
            exact_key = f"lex_exact_{idx}"
            prefix_key = f"lex_prefix_{idx}"
            contains_key = f"lex_contains_{idx}"
            # Equality comparisons use the raw term; LIKE patterns must be
            # escaped so %/_/\ inside a term match literally (consistent with
            # how path prefixes are handled in _append_prefix_group).
            escaped = self._escape_like_value(term)
            params[exact_key] = term
            params[prefix_key] = f"{escaped}%"
            params[contains_key] = f"%{escaped}%"
            lexical_filters.append(
                f"(lower(COALESCE(qname, '')) = :{exact_key} "
                f"OR lower(COALESCE(title, '')) = :{exact_key} "
                f"OR lower(COALESCE(path, '')) LIKE :{contains_key}{_LIKE_ESCAPE_SQL} "
                f"OR lower(COALESCE(title, '')) LIKE :{prefix_key}{_LIKE_ESCAPE_SQL} "
                f"OR lower(COALESCE(content, '')) LIKE :{contains_key}{_LIKE_ESCAPE_SQL})"
            )
            lexical_ranks.append(
                "CASE "
                f"WHEN lower(COALESCE(qname, '')) = :{exact_key} THEN 0 "
                f"WHEN lower(COALESCE(title, '')) = :{exact_key} THEN 1 "
                f"WHEN lower(COALESCE(title, '')) LIKE :{prefix_key}{_LIKE_ESCAPE_SQL} THEN 2 "
                f"WHEN lower(COALESCE(path, '')) LIKE :{contains_key}{_LIKE_ESCAPE_SQL} THEN 3 "
                f"WHEN lower(COALESCE(content, '')) LIKE :{contains_key}{_LIKE_ESCAPE_SQL} THEN 4 "
                "ELSE 100 END"
            )
        filters.append("(" + " OR ".join(lexical_filters) + ")")
        lexical_sql = "LEAST(" + ", ".join(lexical_ranks) + ")"
        test_penalty_sql = self._test_penalty_sql(
            prefer_non_tests,
            params,
            base_key="lex_penalty",
            path_prefixes=exclude_path_prefixes,
            like_patterns=exclude_like_patterns,
        )
        sql = f"""
            SELECT path, content, layer, title, metadata_json, span_start, span_end,
                   {lexical_sql} AS lexical_rank,
                   {test_penalty_sql} AS test_penalty
            FROM rag_chunks
            WHERE {' AND '.join(filters)}
            ORDER BY lexical_rank ASC, test_penalty ASC, path ASC, span_start ASC
            LIMIT :lim
        """
        return sql, params

    def _lexical_rank_sql(self, query_text: str, params: dict) -> str:
        """Return a SQL expression ranking rows by best term match (0 = best).

        Adds the term bind parameters to ``params`` as a side effect.
        Evaluates to the constant ``100`` when no terms are extracted.
        """
        # NOTE(review): terms are compared against lower()-ed columns; this
        # assumes extract_query_terms lowercases its output — confirm.
        term_filters: list[str] = []
        for idx, term in enumerate(extract_query_terms(query_text)):
            exact_key = f"term_exact_{idx}"
            prefix_key = f"term_prefix_{idx}"
            contains_key = f"term_contains_{idx}"
            # Raw term for equality; escaped term for LIKE patterns so that
            # literal %/_/\ cannot widen the match.
            escaped = self._escape_like_value(term)
            params[exact_key] = term
            params[prefix_key] = f"{escaped}%"
            params[contains_key] = f"%{escaped}%"
            term_filters.append(
                "CASE "
                f"WHEN lower(COALESCE(qname, '')) = :{exact_key} THEN 0 "
                f"WHEN lower(COALESCE(symbol_id, '')) = :{exact_key} THEN 1 "
                f"WHEN lower(COALESCE(title, '')) = :{exact_key} THEN 2 "
                f"WHEN lower(COALESCE(qname, '')) LIKE :{prefix_key}{_LIKE_ESCAPE_SQL} THEN 3 "
                f"WHEN lower(COALESCE(title, '')) LIKE :{prefix_key}{_LIKE_ESCAPE_SQL} THEN 4 "
                f"WHEN lower(COALESCE(path, '')) LIKE :{contains_key}{_LIKE_ESCAPE_SQL} THEN 5 "
                f"WHEN lower(COALESCE(content, '')) LIKE :{contains_key}{_LIKE_ESCAPE_SQL} THEN 6 "
                "ELSE 100 END"
            )
        return "LEAST(" + ", ".join(term_filters) + ")" if term_filters else "100"

    def _append_prefix_group(self, filters: list[str], params: dict, base_key: str, prefixes: list[str] | None, *, negate: bool = False) -> None:
        """Append a (possibly negated) OR-group of ``path LIKE 'prefix%'`` filters.

        Prefix values are escaped so literal wildcard characters match literally.
        """
        # NOTE(review): matches `path` case-sensitively, while _append_like_group
        # and _test_penalty_sql match lower(path) — confirm this asymmetry is
        # intentional before unifying.
        if not prefixes:
            return
        items: list[str] = []
        for idx, prefix in enumerate(prefixes):
            key = f"{base_key}_{idx}"
            params[key] = self._escape_like_value(prefix) + "%"
            items.append(f"path LIKE :{key}{_LIKE_ESCAPE_SQL}")
        self._append_group(filters, items, negate=negate)

    def _append_like_group(self, filters: list[str], params: dict, base_key: str, patterns: list[str] | None, *, negate: bool = False) -> None:
        """Append a (possibly negated) OR-group of caller-supplied LIKE patterns.

        Patterns are used verbatim (no escaping): callers are expected to
        supply deliberate wildcards here.
        """
        if not patterns:
            return
        items: list[str] = []
        for idx, pattern in enumerate(patterns):
            key = f"{base_key}_{idx}"
            params[key] = pattern
            items.append(f"lower(path) LIKE :{key}{_LIKE_ESCAPE_SQL}")
        self._append_group(filters, items, negate=negate)

    def _append_group(self, filters: list[str], parts: list[str], *, negate: bool) -> None:
        """OR-join ``parts`` into one parenthesized filter, negated on request."""
        if not parts:
            return
        joined = " OR ".join(parts)
        filters.append(f"NOT ({joined})" if negate else f"({joined})")

    def _test_penalty_sql(
        self,
        enabled: bool,
        params: dict,
        *,
        base_key: str,
        path_prefixes: list[str] | None,
        like_patterns: list[str] | None,
    ) -> str:
        """Return a 0/1 SQL penalty expression for de-prioritized paths.

        When disabled (or no prefixes/patterns given) this collapses to the
        constant ``"0"`` and adds no bind parameters.
        """
        if not enabled:
            return "0"
        parts: list[str] = []
        for idx, prefix in enumerate(path_prefixes or []):
            key = f"{base_key}_prefix_{idx}"
            params[key] = self._escape_like_value(prefix) + "%"
            parts.append(f"lower(path) LIKE :{key}{_LIKE_ESCAPE_SQL}")
        for idx, pattern in enumerate(like_patterns or []):
            key = f"{base_key}_like_{idx}"
            params[key] = pattern
            parts.append(f"lower(path) LIKE :{key}{_LIKE_ESCAPE_SQL}")
        if not parts:
            return "0"
        return "CASE WHEN " + " OR ".join(parts) + " THEN 1 ELSE 0 END"

    def _escape_like_value(self, value: str) -> str:
        """Escape ``\\``, ``%`` and ``_`` for use inside a LIKE pattern.

        Pairs with the ESCAPE clause in _LIKE_ESCAPE_SQL (backslash escape).
        Backslash must be doubled first so it does not re-escape the others.
        """
        return value.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

View File

@@ -106,6 +106,7 @@ class RagSchemaRepository:
)
self._ensure_columns(conn)
self._ensure_indexes(conn)
self._drop_unused_rag_chunk_columns(conn)
conn.commit()
def _ensure_columns(self, conn) -> None:
@@ -118,14 +119,12 @@ class RagSchemaRepository:
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS rag_doc_id VARCHAR(128) NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS repo_id VARCHAR(512) NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS commit_sha VARCHAR(128) NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS title TEXT NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS links_json TEXT NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS span_start INTEGER NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS span_end INTEGER NULL",
"ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS symbol_id TEXT NULL",
@@ -162,6 +161,13 @@ class RagSchemaRepository:
):
conn.execute(text(statement))
def _drop_unused_rag_chunk_columns(self, conn) -> None:
    """Drop legacy rag_chunks columns that are no longer written.

    Uses ``DROP COLUMN IF EXISTS`` so the statement is idempotent and safe
    to run on every schema-ensure pass.
    """
    for statement in (
        "ALTER TABLE rag_chunks DROP COLUMN IF EXISTS rag_doc_id",
        "ALTER TABLE rag_chunks DROP COLUMN IF EXISTS links_json",
    ):
        conn.execute(text(statement))
def _ensure_indexes(self, conn) -> None:
for statement in (
"CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks (rag_session_id)",