Новый раг
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,5 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
from app.modules.agent.engine.orchestrator.actions.common import ActionSupport
|
from app.modules.agent.engine.orchestrator.actions.common import ActionSupport
|
||||||
from app.modules.agent.engine.orchestrator.execution_context import ExecutionContext
|
from app.modules.agent.engine.orchestrator.execution_context import ExecutionContext
|
||||||
from app.modules.agent.engine.orchestrator.models import ArtifactType
|
from app.modules.agent.engine.orchestrator.models import ArtifactType
|
||||||
@@ -7,22 +9,28 @@ from app.modules.agent.engine.orchestrator.models import ArtifactType
|
|||||||
|
|
||||||
class ExplainActions(ActionSupport):
|
class ExplainActions(ActionSupport):
|
||||||
def collect_sources(self, ctx: ExecutionContext) -> list[str]:
|
def collect_sources(self, ctx: ExecutionContext) -> list[str]:
|
||||||
|
rag_items = list(ctx.task.metadata.get("rag_items", []) or [])
|
||||||
rag_context = str(ctx.task.metadata.get("rag_context", ""))
|
rag_context = str(ctx.task.metadata.get("rag_context", ""))
|
||||||
confluence_context = str(ctx.task.metadata.get("confluence_context", ""))
|
confluence_context = str(ctx.task.metadata.get("confluence_context", ""))
|
||||||
files_map = dict(ctx.task.metadata.get("files_map", {}) or {})
|
files_map = dict(ctx.task.metadata.get("files_map", {}) or {})
|
||||||
payload = {
|
payload = {
|
||||||
|
"rag_items": rag_items,
|
||||||
"rag_context": rag_context,
|
"rag_context": rag_context,
|
||||||
"confluence_context": confluence_context,
|
"confluence_context": confluence_context,
|
||||||
"files_count": len(files_map),
|
"files_count": len(files_map),
|
||||||
|
"source_profile": self._source_profile(rag_items),
|
||||||
}
|
}
|
||||||
evidence_ids: list[str] = []
|
evidence_ids: list[str] = []
|
||||||
if rag_context.strip():
|
for item in rag_items[:5]:
|
||||||
|
snippet = str(item.get("content", "") or "").strip()
|
||||||
|
if not snippet:
|
||||||
|
continue
|
||||||
evidence_ids.append(
|
evidence_ids.append(
|
||||||
self.add_evidence(
|
self.add_evidence(
|
||||||
ctx,
|
ctx,
|
||||||
source_type="rag_chunk",
|
source_type="rag_chunk",
|
||||||
source_ref=ctx.task.rag_session_id,
|
source_ref=str(item.get("source", ctx.task.rag_session_id)),
|
||||||
snippet=rag_context,
|
snippet=snippet,
|
||||||
score=0.9,
|
score=0.9,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -38,50 +46,214 @@ class ExplainActions(ActionSupport):
|
|||||||
def extract_logic(self, ctx: ExecutionContext) -> list[str]:
|
def extract_logic(self, ctx: ExecutionContext) -> list[str]:
|
||||||
sources = self.get(ctx, "sources", {}) or {}
|
sources = self.get(ctx, "sources", {}) or {}
|
||||||
message = ctx.task.user_message
|
message = ctx.task.user_message
|
||||||
|
profile = str(sources.get("source_profile", "docs"))
|
||||||
|
ru = self._is_russian(message)
|
||||||
|
notes = (
|
||||||
|
"Используй код как основной источник и ссылайся на конкретные файлы и слои."
|
||||||
|
if profile == "code" and ru
|
||||||
|
else "Use code as the primary source and cite concrete files/layers."
|
||||||
|
if profile == "code"
|
||||||
|
else "Используй требования и документацию как основной источник."
|
||||||
|
if ru
|
||||||
|
else "Use requirements/docs as primary source over code."
|
||||||
|
)
|
||||||
logic = {
|
logic = {
|
||||||
"request": message,
|
"request": message,
|
||||||
"assumptions": ["requirements-first"],
|
"assumptions": [f"{profile}-first"],
|
||||||
"notes": "Use requirements/docs as primary source over code.",
|
"notes": notes,
|
||||||
"source_summary": sources,
|
"source_summary": sources,
|
||||||
}
|
}
|
||||||
return [self.put(ctx, "logic_model", ArtifactType.STRUCTURED_JSON, logic)]
|
return [self.put(ctx, "logic_model", ArtifactType.STRUCTURED_JSON, logic)]
|
||||||
|
|
||||||
def build_sequence(self, ctx: ExecutionContext) -> list[str]:
|
|
||||||
message = ctx.task.user_message
|
|
||||||
mermaid = "\n".join(
|
|
||||||
[
|
|
||||||
"```mermaid",
|
|
||||||
"sequenceDiagram",
|
|
||||||
"participant User",
|
|
||||||
"participant Agent",
|
|
||||||
"participant Docs",
|
|
||||||
"User->>Agent: " + message[:80],
|
|
||||||
"Agent->>Docs: Find relevant requirements",
|
|
||||||
"Docs-->>Agent: Relevant context",
|
|
||||||
"Agent-->>User: Structured explanation",
|
|
||||||
"```",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
return [self.put(ctx, "sequence_diagram", ArtifactType.TEXT, mermaid)]
|
|
||||||
|
|
||||||
def build_use_cases(self, ctx: ExecutionContext) -> list[str]:
|
|
||||||
lines = [
|
|
||||||
"### Use Cases",
|
|
||||||
"- Analyze requirement fragments relevant to user question",
|
|
||||||
"- Reconstruct behavior flow and decision points",
|
|
||||||
"- Return user-focused explanation with constraints",
|
|
||||||
]
|
|
||||||
return [self.put(ctx, "use_cases", ArtifactType.TEXT, "\n".join(lines))]
|
|
||||||
|
|
||||||
def summarize(self, ctx: ExecutionContext) -> list[str]:
|
def summarize(self, ctx: ExecutionContext) -> list[str]:
|
||||||
sequence = str(self.get(ctx, "sequence_diagram", "") or "")
|
sources = self.get(ctx, "sources", {}) or {}
|
||||||
use_cases = str(self.get(ctx, "use_cases", "") or "")
|
profile = str(sources.get("source_profile", "docs"))
|
||||||
answer = "\n\n".join(
|
items = list(sources.get("rag_items", []) or [])
|
||||||
[
|
message = ctx.task.user_message
|
||||||
"## Summary",
|
ru = self._is_russian(message)
|
||||||
"The requested project part is explained from requirements/docs context.",
|
answer = self._code_answer(items, russian=ru) if profile == "code" else self._docs_answer(items, russian=ru)
|
||||||
sequence,
|
|
||||||
use_cases,
|
|
||||||
]
|
|
||||||
)
|
|
||||||
return [self.put(ctx, "final_answer", ArtifactType.TEXT, answer)]
|
return [self.put(ctx, "final_answer", ArtifactType.TEXT, answer)]
|
||||||
|
|
||||||
|
def _source_profile(self, items: list[dict]) -> str:
|
||||||
|
layers = [str(item.get("layer", "") or "") for item in items]
|
||||||
|
if any(layer.startswith("C") for layer in layers):
|
||||||
|
return "code"
|
||||||
|
return "docs"
|
||||||
|
|
||||||
|
def _is_russian(self, text: str) -> bool:
|
||||||
|
return any("а" <= ch.lower() <= "я" or ch.lower() == "ё" for ch in text)
|
||||||
|
|
||||||
|
def _code_answer(self, items: list[dict], *, russian: bool) -> str:
|
||||||
|
if not items:
|
||||||
|
return (
|
||||||
|
"Не удалось найти релевантный кодовый контекст по этому запросу."
|
||||||
|
if russian
|
||||||
|
else "No relevant code context was found for this request."
|
||||||
|
)
|
||||||
|
details = self._code_details(items, russian=russian)
|
||||||
|
refs = self._code_references(items, russian=russian)
|
||||||
|
parts = [
|
||||||
|
"## Кратко" if russian else "## Summary",
|
||||||
|
details,
|
||||||
|
]
|
||||||
|
if refs:
|
||||||
|
parts.append(refs)
|
||||||
|
return "\n\n".join(part for part in parts if part.strip())
|
||||||
|
|
||||||
|
def _docs_answer(self, items: list[dict], *, russian: bool) -> str:
|
||||||
|
return (
|
||||||
|
"Запрошенная часть проекта объяснена на основе требований и документации."
|
||||||
|
if russian
|
||||||
|
else "The requested project part is explained from requirements/docs context."
|
||||||
|
)
|
||||||
|
|
||||||
|
def _code_details(self, items: list[dict], *, russian: bool) -> str:
|
||||||
|
if not items:
|
||||||
|
return ""
|
||||||
|
symbol_items = [item for item in items if str(item.get("layer", "")) == "C1_SYMBOL_CATALOG"]
|
||||||
|
edge_items = [item for item in items if str(item.get("layer", "")) == "C2_DEPENDENCY_GRAPH"]
|
||||||
|
source_items = [item for item in items if str(item.get("layer", "")) == "C0_SOURCE_CHUNKS"]
|
||||||
|
|
||||||
|
lines = ["### Что видно по коду" if russian else "### What the code shows"]
|
||||||
|
alias = self._find_alias_symbol(symbol_items)
|
||||||
|
if alias:
|
||||||
|
imported_from = str(alias.get("metadata", {}).get("lang_payload", {}).get("imported_from", "")).strip()
|
||||||
|
if russian:
|
||||||
|
lines.append(f"- `ConfigManager` в проекте доступен как alias в `{alias.get('source', '')}` и указывает на `{imported_from}`.")
|
||||||
|
else:
|
||||||
|
lines.append(f"- `ConfigManager` is exposed as an alias in `{alias.get('source', '')}` and points to `{imported_from}`.")
|
||||||
|
|
||||||
|
management_hint = self._management_summary(symbol_items, edge_items, source_items, russian=russian)
|
||||||
|
if management_hint:
|
||||||
|
lines.extend(management_hint)
|
||||||
|
|
||||||
|
symbol_lines = 0
|
||||||
|
for item in symbol_items[:4]:
|
||||||
|
title = str(item.get("title", "") or "")
|
||||||
|
source = str(item.get("source", "") or "")
|
||||||
|
content = str(item.get("content", "") or "").strip()
|
||||||
|
summary = content.splitlines()[-1].strip() if content else ""
|
||||||
|
if not title:
|
||||||
|
continue
|
||||||
|
if self._is_test_path(source):
|
||||||
|
continue
|
||||||
|
if self._is_control_symbol(title):
|
||||||
|
continue
|
||||||
|
if russian:
|
||||||
|
lines.append(f"- Символ `{title}` из `{source}`: {summary}")
|
||||||
|
else:
|
||||||
|
lines.append(f"- Symbol `{title}` from `{source}`: {summary}")
|
||||||
|
symbol_lines += 1
|
||||||
|
if symbol_lines >= 2:
|
||||||
|
break
|
||||||
|
|
||||||
|
edge_map: dict[str, list[str]] = {}
|
||||||
|
for item in edge_items:
|
||||||
|
meta = item.get("metadata", {}) or {}
|
||||||
|
src_qname = str(meta.get("src_qname", "") or "").strip()
|
||||||
|
dst_ref = str(meta.get("dst_ref", "") or "").strip()
|
||||||
|
if not src_qname or not dst_ref:
|
||||||
|
continue
|
||||||
|
if self._is_test_path(str(item.get("source", "") or "")):
|
||||||
|
continue
|
||||||
|
edge_map.setdefault(src_qname, [])
|
||||||
|
if dst_ref not in edge_map[src_qname]:
|
||||||
|
edge_map[src_qname].append(dst_ref)
|
||||||
|
for src_qname, targets in list(edge_map.items())[:3]:
|
||||||
|
joined = ", ".join(targets[:4])
|
||||||
|
if russian:
|
||||||
|
lines.append(f"- `{src_qname}` вызывает или использует: {joined}.")
|
||||||
|
else:
|
||||||
|
lines.append(f"- `{src_qname}` calls or uses: {joined}.")
|
||||||
|
|
||||||
|
for item in source_items[:2]:
|
||||||
|
source = str(item.get("source", "") or "")
|
||||||
|
content = str(item.get("content", "") or "")
|
||||||
|
if self._is_test_path(source):
|
||||||
|
continue
|
||||||
|
if "management" in content.lower() or "control" in content.lower():
|
||||||
|
snippet = " ".join(content.splitlines()[:4]).strip()
|
||||||
|
if russian:
|
||||||
|
lines.append(f"- В `{source}` есть прямое указание на управление через конфиг/API: `{snippet[:220]}`")
|
||||||
|
else:
|
||||||
|
lines.append(f"- `{source}` directly mentions config/API control: `{snippet[:220]}`")
|
||||||
|
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
def _code_references(self, items: list[dict], *, russian: bool) -> str:
|
||||||
|
paths = [str(item.get("source", "") or "") for item in items if item.get("source") and not self._is_test_path(str(item.get("source", "") or ""))]
|
||||||
|
if not paths:
|
||||||
|
return ""
|
||||||
|
lines = ["### Где смотреть в проекте" if russian else "### Where to look in the project"]
|
||||||
|
for path, _count in Counter(paths).most_common(3):
|
||||||
|
lines.append(f"- `{path}`")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
def _find_alias_symbol(self, items: list[dict]) -> dict | None:
|
||||||
|
for item in items:
|
||||||
|
meta = item.get("metadata", {}) or {}
|
||||||
|
payload = meta.get("lang_payload", {}) or {}
|
||||||
|
qname = str(meta.get("qname", "") or "")
|
||||||
|
if qname == "ConfigManager" and payload.get("import_alias"):
|
||||||
|
return item
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _is_test_path(self, path: str) -> bool:
|
||||||
|
lowered = path.lower()
|
||||||
|
return lowered.startswith("tests/") or "/tests/" in lowered or lowered.startswith("test_") or "/test_" in lowered
|
||||||
|
|
||||||
|
def _is_control_symbol(self, title: str) -> bool:
|
||||||
|
lowered = title.lower()
|
||||||
|
return any(token in lowered for token in ("controlchannel", "controlchannelbridge", "on_start", "on_stop", "on_status"))
|
||||||
|
|
||||||
|
def _management_summary(
|
||||||
|
self,
|
||||||
|
symbol_items: list[dict],
|
||||||
|
edge_items: list[dict],
|
||||||
|
source_items: list[dict],
|
||||||
|
*,
|
||||||
|
russian: bool,
|
||||||
|
) -> list[str]:
|
||||||
|
qnames = {str((item.get("metadata", {}) or {}).get("qname", "") or ""): item for item in symbol_items if not self._is_test_path(str(item.get("source", "") or ""))}
|
||||||
|
source_texts = [str(item.get("content", "") or "") for item in source_items if not self._is_test_path(str(item.get("source", "") or ""))]
|
||||||
|
result: list[str] = []
|
||||||
|
|
||||||
|
if any("управление через api" in text.lower() or "section management" in text.lower() or "секция management" in text.lower() for text in source_texts):
|
||||||
|
result.append(
|
||||||
|
"- Для `ConfigManager` в коде предусмотрен отдельный интерфейс управления через API/конфиг: это прямо указано в публичной точке входа модуля."
|
||||||
|
if russian
|
||||||
|
else "- `ConfigManager` has a dedicated API/config-based management interface; this is stated in the module's public entrypoint."
|
||||||
|
)
|
||||||
|
|
||||||
|
has_control_channel = "ControlChannel" in qnames
|
||||||
|
has_bridge = "ControlChannelBridge" in qnames
|
||||||
|
if has_control_channel:
|
||||||
|
result.append(
|
||||||
|
"- Базовый контракт управления задает `ControlChannel`: он определяет команды `start` и `stop` для внешнего канала управления."
|
||||||
|
if russian
|
||||||
|
else "- The base management contract is `ControlChannel`, which defines external `start` and `stop` commands."
|
||||||
|
)
|
||||||
|
if has_bridge:
|
||||||
|
result.append(
|
||||||
|
"- `ControlChannelBridge` связывает внешний канал управления с lifecycle-методами менеджера: `on_start`, `on_stop`, `on_status`."
|
||||||
|
if russian
|
||||||
|
else "- `ControlChannelBridge` maps the external control channel to manager lifecycle methods: `on_start`, `on_stop`, `on_status`."
|
||||||
|
)
|
||||||
|
|
||||||
|
edge_refs = []
|
||||||
|
for item in edge_items:
|
||||||
|
if self._is_test_path(str(item.get("source", "") or "")):
|
||||||
|
continue
|
||||||
|
meta = item.get("metadata", {}) or {}
|
||||||
|
src = str(meta.get("src_qname", "") or "")
|
||||||
|
dst = str(meta.get("dst_ref", "") or "")
|
||||||
|
if src.startswith("ControlChannelBridge.") and dst in {"self._start_runtime", "self._stop_runtime", "self._get_status"}:
|
||||||
|
edge_refs.append((src, dst))
|
||||||
|
if edge_refs:
|
||||||
|
mappings = ", ".join(f"{src} -> {dst}" for src, dst in edge_refs[:3])
|
||||||
|
result.append(
|
||||||
|
f"- По связям в коде видно, что команды управления маршрутизируются так: {mappings}."
|
||||||
|
if russian
|
||||||
|
else f"- The code relationships show the management command routing: {mappings}."
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
|||||||
@@ -24,8 +24,6 @@ class StepRegistry:
|
|||||||
"finalize_graph_output": self._finalize_graph_output,
|
"finalize_graph_output": self._finalize_graph_output,
|
||||||
"collect_sources": explain.collect_sources,
|
"collect_sources": explain.collect_sources,
|
||||||
"extract_logic": explain.extract_logic,
|
"extract_logic": explain.extract_logic,
|
||||||
"build_sequence": explain.build_sequence,
|
|
||||||
"build_use_cases": explain.build_use_cases,
|
|
||||||
"summarize": explain.summarize,
|
"summarize": explain.summarize,
|
||||||
"fetch_source_doc": review.fetch_source_doc,
|
"fetch_source_doc": review.fetch_source_doc,
|
||||||
"normalize_document": review.normalize_document,
|
"normalize_document": review.normalize_document,
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ class TaskSpecBuilder:
|
|||||||
route: RoutingMeta,
|
route: RoutingMeta,
|
||||||
attachments: list[dict],
|
attachments: list[dict],
|
||||||
files: list[dict],
|
files: list[dict],
|
||||||
|
rag_items: list[dict],
|
||||||
rag_context: str,
|
rag_context: str,
|
||||||
confluence_context: str,
|
confluence_context: str,
|
||||||
files_map: dict[str, dict],
|
files_map: dict[str, dict],
|
||||||
@@ -32,6 +33,7 @@ class TaskSpecBuilder:
|
|||||||
output_contract = self._output_contract(scenario)
|
output_contract = self._output_contract(scenario)
|
||||||
constraints = self._constraints_for(scenario)
|
constraints = self._constraints_for(scenario)
|
||||||
metadata = {
|
metadata = {
|
||||||
|
"rag_items": rag_items,
|
||||||
"rag_context": rag_context,
|
"rag_context": rag_context,
|
||||||
"confluence_context": confluence_context,
|
"confluence_context": confluence_context,
|
||||||
"files_map": files_map,
|
"files_map": files_map,
|
||||||
@@ -124,14 +126,7 @@ class TaskSpecBuilder:
|
|||||||
|
|
||||||
def _output_contract(self, scenario: Scenario) -> OutputContract:
|
def _output_contract(self, scenario: Scenario) -> OutputContract:
|
||||||
if scenario == Scenario.EXPLAIN_PART:
|
if scenario == Scenario.EXPLAIN_PART:
|
||||||
return OutputContract(
|
return OutputContract(result_type="answer", sections=[OutputSection(name="summary", format="markdown")])
|
||||||
result_type="answer",
|
|
||||||
sections=[
|
|
||||||
OutputSection(name="sequence_diagram", format="mermaid"),
|
|
||||||
OutputSection(name="use_cases", format="markdown"),
|
|
||||||
OutputSection(name="summary", format="markdown"),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
if scenario == Scenario.ANALYTICS_REVIEW:
|
if scenario == Scenario.ANALYTICS_REVIEW:
|
||||||
return OutputContract(
|
return OutputContract(
|
||||||
result_type="review_report",
|
result_type="review_report",
|
||||||
|
|||||||
@@ -43,9 +43,7 @@ class ScenarioTemplateRegistry:
|
|||||||
steps = [
|
steps = [
|
||||||
self._step("collect_sources", "Collect sources", "collect_sources", outputs=[self._out("sources", ArtifactType.STRUCTURED_JSON)]),
|
self._step("collect_sources", "Collect sources", "collect_sources", outputs=[self._out("sources", ArtifactType.STRUCTURED_JSON)]),
|
||||||
self._step("extract_logic", "Extract logic", "extract_logic", depends_on=["collect_sources"], outputs=[self._out("logic_model", ArtifactType.STRUCTURED_JSON)]),
|
self._step("extract_logic", "Extract logic", "extract_logic", depends_on=["collect_sources"], outputs=[self._out("logic_model", ArtifactType.STRUCTURED_JSON)]),
|
||||||
self._step("build_sequence", "Build sequence", "build_sequence", depends_on=["extract_logic"], outputs=[self._out("sequence_diagram", ArtifactType.TEXT)]),
|
self._step("summarize", "Summarize", "summarize", depends_on=["extract_logic"], outputs=[self._out("final_answer", ArtifactType.TEXT)]),
|
||||||
self._step("build_use_cases", "Build use cases", "build_use_cases", depends_on=["extract_logic"], outputs=[self._out("use_cases", ArtifactType.TEXT)]),
|
|
||||||
self._step("summarize", "Summarize", "summarize", depends_on=["build_sequence", "build_use_cases"], outputs=[self._out("final_answer", ArtifactType.TEXT)]),
|
|
||||||
]
|
]
|
||||||
return self._plan(task, "explain_part_v1", steps, [self._gate("evidence_required"), self._gate("non_empty_answer_or_changeset")])
|
return self._plan(task, "explain_part_v1", steps, [self._gate("evidence_required"), self._gate("non_empty_answer_or_changeset")])
|
||||||
|
|
||||||
|
|||||||
@@ -117,6 +117,7 @@ class GraphAgentRuntime:
|
|||||||
route=route_meta,
|
route=route_meta,
|
||||||
attachments=attachments,
|
attachments=attachments,
|
||||||
files=files,
|
files=files,
|
||||||
|
rag_items=rag_ctx,
|
||||||
rag_context=self._format_rag(rag_ctx),
|
rag_context=self._format_rag(rag_ctx),
|
||||||
confluence_context=self._format_confluence(conf_pages),
|
confluence_context=self._format_confluence(conf_pages),
|
||||||
files_map=files_map,
|
files_map=files_map,
|
||||||
@@ -368,7 +369,33 @@ class GraphAgentRuntime:
|
|||||||
return pages
|
return pages
|
||||||
|
|
||||||
def _format_rag(self, items: list[dict]) -> str:
|
def _format_rag(self, items: list[dict]) -> str:
|
||||||
return "\n".join(str(x.get("content", "")) for x in items)
|
blocks: list[str] = []
|
||||||
|
for item in items:
|
||||||
|
source = str(item.get("source", "") or item.get("path", "") or "")
|
||||||
|
layer = str(item.get("layer", "") or "").strip()
|
||||||
|
title = str(item.get("title", "") or "").strip()
|
||||||
|
metadata = item.get("metadata", {}) or {}
|
||||||
|
lines = []
|
||||||
|
if source:
|
||||||
|
lines.append(f"Source: {source}")
|
||||||
|
if layer:
|
||||||
|
lines.append(f"Layer: {layer}")
|
||||||
|
if title:
|
||||||
|
lines.append(f"Title: {title}")
|
||||||
|
if metadata:
|
||||||
|
hints = []
|
||||||
|
for key in ("module_id", "qname", "predicate", "entry_type", "framework", "section_path"):
|
||||||
|
value = metadata.get(key)
|
||||||
|
if value:
|
||||||
|
hints.append(f"{key}={value}")
|
||||||
|
if hints:
|
||||||
|
lines.append("Meta: " + ", ".join(hints))
|
||||||
|
content = str(item.get("content", "")).strip()
|
||||||
|
if content:
|
||||||
|
lines.append(content)
|
||||||
|
if lines:
|
||||||
|
blocks.append("\n".join(lines))
|
||||||
|
return "\n\n".join(blocks)
|
||||||
|
|
||||||
def _format_confluence(self, pages: list[dict]) -> str:
|
def _format_confluence(self, pages: list[dict]) -> str:
|
||||||
return "\n".join(str(x.get("content_markdown", "")) for x in pages)
|
return "\n".join(str(x.get("content_markdown", "")) for x in pages)
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ from app.modules.agent.repository import AgentRepository
|
|||||||
from app.modules.agent.story_context_repository import StoryContextRepository, StoryContextSchemaRepository
|
from app.modules.agent.story_context_repository import StoryContextRepository, StoryContextSchemaRepository
|
||||||
from app.modules.chat.repository import ChatRepository
|
from app.modules.chat.repository import ChatRepository
|
||||||
from app.modules.chat.module import ChatModule
|
from app.modules.chat.module import ChatModule
|
||||||
from app.modules.rag_session.repository import RagRepository
|
from app.modules.rag.persistence.repository import RagRepository
|
||||||
from app.modules.rag_session.module import RagModule
|
from app.modules.rag_session.module import RagModule
|
||||||
from app.modules.rag_repo.module import RagRepoModule
|
from app.modules.rag_repo.module import RagRepoModule
|
||||||
from app.modules.shared.bootstrap import bootstrap_database
|
from app.modules.shared.bootstrap import bootstrap_database
|
||||||
|
|||||||
300
app/modules/rag/README.md
Normal file
300
app/modules/rag/README.md
Normal file
@@ -0,0 +1,300 @@
|
|||||||
|
# Модуль rag
|
||||||
|
|
||||||
|
## 1. Функции модуля
|
||||||
|
- Единое ядро RAG для индексации и retrieval по документации и коду проекта.
|
||||||
|
- Поддержка двух семейств индексации: `DOCS` и `CODE`, с разными слоями и разными pipeline.
|
||||||
|
- Хранение `rag_session`, index-jobs, многослойных документов, cache-слоев и retrieval-запросов.
|
||||||
|
- Поддержка индексации snapshot и changes с переиспользованием cache по `blob_sha`.
|
||||||
|
- Предоставление контекста для agent/chat, где `DOCS` используется по умолчанию, а `CODE` включается для явных вопросов по реализации.
|
||||||
|
|
||||||
|
## 2. Диаграмма классов и взаимосвязей
|
||||||
|
```mermaid
|
||||||
|
classDiagram
|
||||||
|
class RagService
|
||||||
|
class RagRepository
|
||||||
|
class RagSchemaRepository
|
||||||
|
class RagDocumentUpserter
|
||||||
|
class DocsIndexingPipeline
|
||||||
|
class CodeIndexingPipeline
|
||||||
|
class RagQueryRouter
|
||||||
|
class GigaChatEmbedder
|
||||||
|
|
||||||
|
RagService --> RagRepository
|
||||||
|
RagService --> DocsIndexingPipeline
|
||||||
|
RagService --> CodeIndexingPipeline
|
||||||
|
RagService --> RagQueryRouter
|
||||||
|
RagService --> GigaChatEmbedder
|
||||||
|
RagRepository --> RagSchemaRepository
|
||||||
|
RagService --> RagDocumentUpserter
|
||||||
|
```
|
||||||
|
|
||||||
|
## 3. Описание классов
|
||||||
|
- `RagService`: основной application-service модуля.
|
||||||
|
Методы: `index_snapshot` — индексирует полный набор файлов; `index_changes` — применяет инкрементальные изменения; `retrieve` — возвращает релевантный контекст из `DOCS` или `CODE`.
|
||||||
|
- `RagRepository`: фасад persistence-слоя RAG.
|
||||||
|
Методы: `ensure_tables` — создает/обновляет схему; `upsert_session/get_session/session_exists` — операции по `rag_session`; `create_job/update_job/get_job` — операции по index jobs; `replace_documents/apply_document_changes` — операции по документам; `get_cached_documents/cache_documents` — работа с cache; `retrieve/fallback_chunks` — retrieval.
|
||||||
|
- `RagSchemaRepository`: управление схемой БД для RAG.
|
||||||
|
Методы: `ensure_tables` — создает таблицы и индексы; `_ensure_columns` — добавляет новые поля; `_ensure_indexes` — поддерживает индексы для retrieval и фильтрации.
|
||||||
|
- `RagDocumentUpserter`: батчевый writer многослойных `RagDocument`.
|
||||||
|
Методы: `replace` — полностью заменяет документы сессии; `apply_changes` — применяет upsert/delete по измененным путям.
|
||||||
|
- `DocsIndexingPipeline`: pipeline индексации документации.
|
||||||
|
Методы: `supports` — определяет, относится ли файл к docs; `index_file` — строит документы слоев `D1-D4` для одного файла.
|
||||||
|
- `CodeIndexingPipeline`: pipeline индексации Python-кода.
|
||||||
|
Методы: `supports` — определяет, относится ли файл к code; `index_file` — строит документы слоев `C0-C3` для одного файла.
|
||||||
|
- `RagQueryRouter`: выбирает retrieval mode и активные слои.
|
||||||
|
Методы: `resolve_mode` — определяет `docs` или `code`; `layers_for_mode` — возвращает набор слоев для retrieval.
|
||||||
|
- `GigaChatEmbedder`: адаптер embeddings-модели.
|
||||||
|
Методы: `embed` — возвращает embeddings для списка текстов.
|
||||||
|
|
||||||
|
## 4. Сиквенс-диаграммы API и выполнения
|
||||||
|
|
||||||
|
### Индексация snapshot через текущий `rag_session` facade
|
||||||
|
Назначение: создать/обновить `rag_session` и построить многослойный индекс по переданным файлам проекта.
|
||||||
|
```mermaid
|
||||||
|
sequenceDiagram
|
||||||
|
participant Router as RagModule.APIRouter
|
||||||
|
participant Sessions as RagSessionStore
|
||||||
|
participant Indexing as IndexingOrchestrator
|
||||||
|
participant Rag as RagService
|
||||||
|
participant Docs as DocsIndexingPipeline
|
||||||
|
participant Code as CodeIndexingPipeline
|
||||||
|
participant Repo as RagRepository
|
||||||
|
|
||||||
|
Router->>Sessions: create(project_id)
|
||||||
|
Sessions-->>Router: rag_session_id
|
||||||
|
Router->>Indexing: enqueue_snapshot(rag_session_id, files)
|
||||||
|
Indexing->>Rag: index_snapshot(rag_session_id, files)
|
||||||
|
loop for each file
|
||||||
|
Rag->>Docs: supports/index_file
|
||||||
|
Rag->>Code: supports/index_file
|
||||||
|
Rag->>Repo: cache_documents(...)
|
||||||
|
end
|
||||||
|
Rag->>Repo: replace_documents(...)
|
||||||
|
Indexing-->>Router: index_job_id,status
|
||||||
|
```
|
||||||
|
|
||||||
|
### Retrieval для agent/chat
|
||||||
|
Назначение: вернуть релевантный контекст из нужного семейства слоев.
|
||||||
|
```mermaid
|
||||||
|
sequenceDiagram
|
||||||
|
participant Agent as GraphAgentRuntime
|
||||||
|
participant Rag as RagService
|
||||||
|
participant Router as RagQueryRouter
|
||||||
|
participant Repo as RagRepository
|
||||||
|
|
||||||
|
Agent->>Rag: retrieve(rag_session_id, query)
|
||||||
|
Rag->>Router: resolve_mode(query)
|
||||||
|
Router-->>Rag: docs|code + layers
|
||||||
|
Rag->>Repo: retrieve(query_embedding, query_text, layers)
|
||||||
|
Repo-->>Rag: ranked items
|
||||||
|
Rag-->>Agent: items
|
||||||
|
```
|
||||||
|
|
||||||
|
## 5. Слои, фиксируемые в RAG
|
||||||
|
|
||||||
|
### 5.1. Слои DOCS
|
||||||
|
|
||||||
|
#### `D1_MODULE_CATALOG`
|
||||||
|
Назначение: каталог модулей документации и граф связей между ними.
|
||||||
|
|
||||||
|
Основные атрибуты:
|
||||||
|
- `module_id`
|
||||||
|
- `type`
|
||||||
|
- `domain`
|
||||||
|
- `title`
|
||||||
|
- `status`
|
||||||
|
- `version`
|
||||||
|
- `tags`
|
||||||
|
- `owners`
|
||||||
|
- `links`
|
||||||
|
- `calls_api`
|
||||||
|
- `called_by`
|
||||||
|
- `uses_logic`
|
||||||
|
- `used_by`
|
||||||
|
- `reads_db`
|
||||||
|
- `writes_db`
|
||||||
|
- `integrates_with`
|
||||||
|
- `emits_events`
|
||||||
|
- `consumes_events`
|
||||||
|
- `source_path`
|
||||||
|
- `summary_text`
|
||||||
|
|
||||||
|
#### `D2_FACT_INDEX`
|
||||||
|
Назначение: атомарные факты `subject-predicate-object` с evidence.
|
||||||
|
|
||||||
|
Основные атрибуты:
|
||||||
|
- `fact_id`
|
||||||
|
- `subject_id`
|
||||||
|
- `predicate`
|
||||||
|
- `object`
|
||||||
|
- `object_ref`
|
||||||
|
- `source_path`
|
||||||
|
- `anchor`
|
||||||
|
- `line_start`
|
||||||
|
- `line_end`
|
||||||
|
- `confidence`
|
||||||
|
- `tags`
|
||||||
|
|
||||||
|
#### `D3_SECTION_INDEX`
|
||||||
|
Назначение: семантические секции документации, нарезанные по заголовкам.
|
||||||
|
|
||||||
|
Основные атрибуты:
|
||||||
|
- `chunk_id`
|
||||||
|
- `module_id`
|
||||||
|
- `section_path`
|
||||||
|
- `section_title`
|
||||||
|
- `content`
|
||||||
|
- `source_path`
|
||||||
|
- `order`
|
||||||
|
- `tags`
|
||||||
|
- `domain`
|
||||||
|
- `type`
|
||||||
|
- `embedding`
|
||||||
|
|
||||||
|
#### `D4_POLICY_INDEX`
|
||||||
|
Назначение: глобальные правила и конвенции проекта.
|
||||||
|
|
||||||
|
Основные атрибуты:
|
||||||
|
- `policy_id`
|
||||||
|
- `applies_to`
|
||||||
|
- `rules`
|
||||||
|
- `default_behaviors`
|
||||||
|
- `source_path`
|
||||||
|
|
||||||
|
### 5.2. Слои CODE
|
||||||
|
|
||||||
|
#### `C0_SOURCE_CHUNKS`
|
||||||
|
Назначение: сырой код как источник истины для цитирования и evidence.
|
||||||
|
|
||||||
|
Основные атрибуты:
|
||||||
|
- `lang`
|
||||||
|
- `repo_id`
|
||||||
|
- `commit_sha`
|
||||||
|
- `path`
|
||||||
|
- `span`
|
||||||
|
- `title`
|
||||||
|
- `text`
|
||||||
|
- `module_or_unit`
|
||||||
|
- `chunk_type`
|
||||||
|
- `symbol_id`
|
||||||
|
- `hash`
|
||||||
|
|
||||||
|
#### `C1_SYMBOL_CATALOG`
|
||||||
|
Назначение: каталог символов кода и их деклараций.
|
||||||
|
|
||||||
|
Основные атрибуты:
|
||||||
|
- `lang`
|
||||||
|
- `repo_id`
|
||||||
|
- `commit_sha`
|
||||||
|
- `symbol_id`
|
||||||
|
- `qname`
|
||||||
|
- `kind`
|
||||||
|
- `decl.path`
|
||||||
|
- `decl.start_line`
|
||||||
|
- `decl.end_line`
|
||||||
|
- `text`
|
||||||
|
- `visibility`
|
||||||
|
- `signature`
|
||||||
|
- `decorators_or_annotations`
|
||||||
|
- `docstring_or_javadoc`
|
||||||
|
- `parent_symbol_id`
|
||||||
|
- `package_or_module`
|
||||||
|
- `is_entry_candidate`
|
||||||
|
- `lang_payload`
|
||||||
|
|
||||||
|
#### `C2_DEPENDENCY_GRAPH`
|
||||||
|
Назначение: связи между сущностями кода.
|
||||||
|
|
||||||
|
Основные атрибуты:
|
||||||
|
- `lang`
|
||||||
|
- `repo_id`
|
||||||
|
- `commit_sha`
|
||||||
|
- `edge_id`
|
||||||
|
- `edge_type`
|
||||||
|
- `src_symbol_id`
|
||||||
|
- `dst_symbol_id`
|
||||||
|
- `dst_ref`
|
||||||
|
- `evidence.path`
|
||||||
|
- `evidence.start_line`
|
||||||
|
- `evidence.end_line`
|
||||||
|
- `text`
|
||||||
|
- `resolution`
|
||||||
|
- `callsite_kind`
|
||||||
|
- `lang_payload`
|
||||||
|
|
||||||
|
#### `C3_ENTRYPOINTS`
|
||||||
|
Назначение: точки входа приложения и их обработчики.
|
||||||
|
|
||||||
|
Основные атрибуты:
|
||||||
|
- `lang`
|
||||||
|
- `repo_id`
|
||||||
|
- `commit_sha`
|
||||||
|
- `entry_id`
|
||||||
|
- `entry_type`
|
||||||
|
- `framework`
|
||||||
|
- `route_or_command`
|
||||||
|
- `handler_symbol_id`
|
||||||
|
- `evidence.path`
|
||||||
|
- `evidence.start_line`
|
||||||
|
- `evidence.end_line`
|
||||||
|
- `text`
|
||||||
|
- `http.methods`
|
||||||
|
- `http.auth`
|
||||||
|
- `request_model`
|
||||||
|
- `response_model`
|
||||||
|
- `cli.args_schema`
|
||||||
|
- `task.queue`
|
||||||
|
- `task.cron`
|
||||||
|
- `tags`
|
||||||
|
- `lang_payload`
|
||||||
|
|
||||||
|
#### `C4_PUBLIC_API`
|
||||||
|
Назначение: публичная поверхность API/экспортируемых символов.
|
||||||
|
|
||||||
|
Основные атрибуты:
|
||||||
|
- `api_id`
|
||||||
|
- `symbol_id`
|
||||||
|
- `stability`
|
||||||
|
- `source_of_truth`
|
||||||
|
- `versioning_tags`
|
||||||
|
- `lang_payload`
|
||||||
|
|
||||||
|
#### `C5_BEHAVIOR_SUMMARIES`
|
||||||
|
Назначение: поведенческие summary с обязательными evidence links.
|
||||||
|
|
||||||
|
Основные атрибуты:
|
||||||
|
- `target_type`
|
||||||
|
- `target_id`
|
||||||
|
- `text`
|
||||||
|
- `claims`
|
||||||
|
- `evidence_links`
|
||||||
|
- `confidence`
|
||||||
|
- `generated_by`
|
||||||
|
- `generated_at`
|
||||||
|
|
||||||
|
#### `C6_RUNTIME_TRACES`
|
||||||
|
Назначение: runtime/trace слой для связи кода и реального исполнения.
|
||||||
|
|
||||||
|
Основные атрибуты:
|
||||||
|
- `env`
|
||||||
|
- `trace_id`
|
||||||
|
- `span_id`
|
||||||
|
- `symbol_id`
|
||||||
|
- `entry_id`
|
||||||
|
- `text`
|
||||||
|
- `timings`
|
||||||
|
- `service`
|
||||||
|
- `host`
|
||||||
|
- `labels`
|
||||||
|
|
||||||
|
## 6. Правила retrieval
|
||||||
|
- По умолчанию retrieval идет в `DOCS`.
|
||||||
|
- `CODE` используется только для явных вопросов по реализации, устройству кода, endpoint'ам, handler'ам и документации “из кода”.
|
||||||
|
- Для `DOCS` приоритет слоев: `D1 -> D2 -> D3 -> D4`.
|
||||||
|
- Для `CODE` приоритет слоев: `C3 -> C1 -> C2 -> C0`.
|
||||||
|
|
||||||
|
## 7. Текущий статус реализации
|
||||||
|
- В первой итерации реализованы `DOCS D1-D4`.
|
||||||
|
- В первой итерации реализованы `CODE C0-C3`.
|
||||||
|
- `C4-C6` зафиксированы в контракте и зарезервированы под следующие этапы.
|
||||||
|
- Текущие `rag_session` и `rag_repo` работают как facade/adapter поверх нового пакета `rag`.
|
||||||
25
app/modules/rag/__init__.py
Normal file
25
app/modules/rag/__init__.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
from app.modules.rag.contracts import (
|
||||||
|
DocKind,
|
||||||
|
EvidenceLink,
|
||||||
|
EvidenceType,
|
||||||
|
RagDocument,
|
||||||
|
RagLayer,
|
||||||
|
RagSource,
|
||||||
|
RagSpan,
|
||||||
|
RetrievalItem,
|
||||||
|
RetrievalMode,
|
||||||
|
RetrievalQuery,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"DocKind",
|
||||||
|
"EvidenceLink",
|
||||||
|
"EvidenceType",
|
||||||
|
"RagDocument",
|
||||||
|
"RagLayer",
|
||||||
|
"RagSource",
|
||||||
|
"RagSpan",
|
||||||
|
"RetrievalItem",
|
||||||
|
"RetrievalMode",
|
||||||
|
"RetrievalQuery",
|
||||||
|
]
|
||||||
BIN
app/modules/rag/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
app/modules/rag/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
17
app/modules/rag/contracts/__init__.py
Normal file
17
app/modules/rag/contracts/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
from app.modules.rag.contracts.documents import RagDocument, RagSource, RagSpan
|
||||||
|
from app.modules.rag.contracts.enums import DocKind, EvidenceType, RagLayer, RetrievalMode
|
||||||
|
from app.modules.rag.contracts.evidence import EvidenceLink
|
||||||
|
from app.modules.rag.contracts.retrieval import RetrievalItem, RetrievalQuery
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"DocKind",
|
||||||
|
"EvidenceLink",
|
||||||
|
"EvidenceType",
|
||||||
|
"RagDocument",
|
||||||
|
"RagLayer",
|
||||||
|
"RagSource",
|
||||||
|
"RagSpan",
|
||||||
|
"RetrievalItem",
|
||||||
|
"RetrievalMode",
|
||||||
|
"RetrievalQuery",
|
||||||
|
]
|
||||||
BIN
app/modules/rag/contracts/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
app/modules/rag/contracts/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
app/modules/rag/contracts/__pycache__/documents.cpython-312.pyc
Normal file
BIN
app/modules/rag/contracts/__pycache__/documents.cpython-312.pyc
Normal file
Binary file not shown.
BIN
app/modules/rag/contracts/__pycache__/enums.cpython-312.pyc
Normal file
BIN
app/modules/rag/contracts/__pycache__/enums.cpython-312.pyc
Normal file
Binary file not shown.
BIN
app/modules/rag/contracts/__pycache__/evidence.cpython-312.pyc
Normal file
BIN
app/modules/rag/contracts/__pycache__/evidence.cpython-312.pyc
Normal file
Binary file not shown.
BIN
app/modules/rag/contracts/__pycache__/retrieval.cpython-312.pyc
Normal file
BIN
app/modules/rag/contracts/__pycache__/retrieval.cpython-312.pyc
Normal file
Binary file not shown.
77
app/modules/rag/contracts/documents.py
Normal file
77
app/modules/rag/contracts/documents.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import asdict, dataclass, field
|
||||||
|
from hashlib import sha256
|
||||||
|
|
||||||
|
from app.modules.rag.contracts.evidence import EvidenceLink
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
class RagSource:
    """Provenance of an indexed document: repository, commit and file path."""

    repo_id: str
    commit_sha: str | None  # None when the content is not pinned to a specific commit
    path: str

    def to_dict(self) -> dict:
        """Serialize to a plain dict via dataclasses.asdict."""
        return asdict(self)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
class RagSpan:
    """A line range inside a source file; either end may be unknown."""

    start_line: int | None = None
    end_line: int | None = None

    def to_dict(self) -> dict:
        """Serialize to a plain dict via dataclasses.asdict."""
        return asdict(self)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
class RagDocument:
    """One retrievable unit of the RAG index: text plus provenance, layer and metadata."""

    layer: str
    source: RagSource
    title: str
    text: str
    metadata: dict = field(default_factory=dict)
    links: list[EvidenceLink] = field(default_factory=list)
    span: RagSpan | None = None
    doc_id: str | None = None  # lazily computed content hash; see ensure_doc_id
    lang: str | None = None
    embedding: list[float] | None = None

    def ensure_doc_id(self) -> str:
        """Return a stable identity hash for this document, computing and caching it on first use.

        The id is a sha256 over layer, language, source coordinates, the
        symbol/module id from metadata, the title and the span — so the same
        logical document always hashes to the same id.
        """
        if self.doc_id:
            return self.doc_id
        span_suffix = "" if self.span is None else f":{self.span.start_line}:{self.span.end_line}"
        identity_parts = (
            self.layer,
            self.lang or "",
            self.source.repo_id,
            self.source.commit_sha or "",
            self.source.path,
            self.metadata.get("symbol_id", "") or self.metadata.get("module_id", ""),
            self.title,
            span_suffix,
        )
        self.doc_id = sha256("|".join(identity_parts).encode("utf-8")).hexdigest()
        return self.doc_id

    def to_record(self) -> dict:
        """Flatten the document into a storage-ready record dict (copies metadata)."""
        current_span = self.span
        record = {
            "doc_id": self.ensure_doc_id(),
            "layer": self.layer,
            "lang": self.lang,
            "repo_id": self.source.repo_id,
            "commit_sha": self.source.commit_sha,
            "path": self.source.path,
            "title": self.title,
            "text": self.text,
            "metadata": dict(self.metadata),
            "links": [link.to_dict() for link in self.links],
            "span_start": current_span.start_line if current_span else None,
            "span_end": current_span.end_line if current_span else None,
            "embedding": self.embedding or [],
        }
        return record
|
||||||
35
app/modules/rag/contracts/enums.py
Normal file
35
app/modules/rag/contracts/enums.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
||||||
|
class RagLayer:
    """String names of the retrieval layers: DOCS (D1-D4) and CODE (C0-C6)."""

    DOCS_MODULE_CATALOG = "D1_MODULE_CATALOG"
    DOCS_FACT_INDEX = "D2_FACT_INDEX"
    DOCS_SECTION_INDEX = "D3_SECTION_INDEX"
    DOCS_POLICY_INDEX = "D4_POLICY_INDEX"
    CODE_SOURCE_CHUNKS = "C0_SOURCE_CHUNKS"
    CODE_SYMBOL_CATALOG = "C1_SYMBOL_CATALOG"
    CODE_DEPENDENCY_GRAPH = "C2_DEPENDENCY_GRAPH"
    CODE_ENTRYPOINTS = "C3_ENTRYPOINTS"
    # C4-C6 are reserved in the contract; indexing for them is a later stage.
    CODE_PUBLIC_API = "C4_PUBLIC_API"
    CODE_BEHAVIOR_SUMMARIES = "C5_BEHAVIOR_SUMMARIES"
    CODE_RUNTIME_TRACES = "C6_RUNTIME_TRACES"
|
||||||
|
|
||||||
|
|
||||||
|
class RetrievalMode:
    """Which corpus a retrieval request targets."""

    DOCS = "docs"  # default retrieval target
    CODE = "code"  # explicit implementation/code questions only
|
||||||
|
|
||||||
|
|
||||||
|
class DocKind:
    """Coarse classification of a documentation source."""

    SPEC = "spec"
    RUNBOOK = "runbook"
    README = "readme"
    MISC = "misc"
|
||||||
|
|
||||||
|
|
||||||
|
class EvidenceType:
    """What kind of artifact an EvidenceLink points at."""

    CODE_SPAN = "code_span"
    SYMBOL = "symbol"
    EDGE = "edge"
    DOC_SECTION = "doc_section"
    DOC_FACT = "doc_fact"
|
||||||
16
app/modules/rag/contracts/evidence.py
Normal file
16
app/modules/rag/contracts/evidence.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
class EvidenceLink:
    """A pointer from a document to the concrete evidence backing it."""

    type: str  # an EvidenceType value (e.g. code_span, edge) — set by the document builders
    target_id: str
    path: str | None = None
    start_line: int | None = None
    end_line: int | None = None
    note: str | None = None

    def to_dict(self) -> dict:
        """Serialize to a plain dict via dataclasses.asdict."""
        return asdict(self)
|
||||||
23
app/modules/rag/contracts/retrieval.py
Normal file
23
app/modules/rag/contracts/retrieval.py
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
class RetrievalQuery:
    """Parameters of one retrieval request."""

    text: str
    mode: str  # a RetrievalMode value ("docs" or "code")
    limit: int = 5  # maximum number of items requested
    layers: list[str] = field(default_factory=list)  # requested RagLayer values; empty means no restriction — TODO confirm against the retriever
    path_prefixes: list[str] = field(default_factory=list)  # optional path filters
    doc_kind: str | None = None  # optional DocKind filter
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
class RetrievalItem:
    """One retrieval hit returned to the caller."""

    content: str
    path: str
    layer: str  # RagLayer value the hit came from
    title: str
    score: float | None = None  # relevance score when the backend provides one
    metadata: dict | None = None
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
57
app/modules/rag/indexing/code/code_text/chunker.py
Normal file
57
app/modules/rag/indexing/code/code_text/chunker.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import ast
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
|
||||||
|
class CodeChunk:
|
||||||
|
title: str
|
||||||
|
text: str
|
||||||
|
start_line: int
|
||||||
|
end_line: int
|
||||||
|
chunk_type: str
|
||||||
|
|
||||||
|
|
||||||
|
class CodeTextChunker:
|
||||||
|
def chunk(self, path: str, text: str) -> list[CodeChunk]:
|
||||||
|
try:
|
||||||
|
tree = ast.parse(text)
|
||||||
|
except SyntaxError:
|
||||||
|
return self._window_chunks(path, text)
|
||||||
|
chunks: list[CodeChunk] = []
|
||||||
|
lines = text.splitlines()
|
||||||
|
for node in tree.body:
|
||||||
|
if not isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
|
||||||
|
continue
|
||||||
|
start = int(getattr(node, "lineno", 1))
|
||||||
|
end = int(getattr(node, "end_lineno", start))
|
||||||
|
body = "\n".join(lines[start - 1 : end]).strip()
|
||||||
|
if not body:
|
||||||
|
continue
|
||||||
|
chunks.append(
|
||||||
|
CodeChunk(
|
||||||
|
title=f"{path}:{getattr(node, 'name', 'block')}",
|
||||||
|
text=body,
|
||||||
|
start_line=start,
|
||||||
|
end_line=end,
|
||||||
|
chunk_type="symbol_block",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return chunks or self._window_chunks(path, text)
|
||||||
|
|
||||||
|
def _window_chunks(self, path: str, text: str) -> list[CodeChunk]:
|
||||||
|
lines = text.splitlines()
|
||||||
|
chunks: list[CodeChunk] = []
|
||||||
|
size = 80
|
||||||
|
overlap = 15
|
||||||
|
start = 0
|
||||||
|
while start < len(lines):
|
||||||
|
end = min(len(lines), start + size)
|
||||||
|
body = "\n".join(lines[start:end]).strip()
|
||||||
|
if body:
|
||||||
|
chunks.append(CodeChunk(f"{path}:{start + 1}-{end}", body, start + 1, end, "window"))
|
||||||
|
if end >= len(lines):
|
||||||
|
break
|
||||||
|
start = max(0, end - overlap)
|
||||||
|
return chunks
|
||||||
22
app/modules/rag/indexing/code/code_text/document_builder.py
Normal file
22
app/modules/rag/indexing/code/code_text/document_builder.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
|
||||||
|
from app.modules.rag.indexing.code.code_text.chunker import CodeChunk
|
||||||
|
|
||||||
|
|
||||||
|
class CodeTextDocumentBuilder:
    """Wraps a CodeChunk into a C0_SOURCE_CHUNKS RagDocument."""

    def build(self, source: RagSource, chunk: CodeChunk, *, chunk_index: int) -> RagDocument:
        """Build the layer-C0 document for one code chunk of *source*."""
        meta = {
            "chunk_index": chunk_index,
            "chunk_type": chunk.chunk_type,
            # dotted module path derived from the file path, e.g. "app/x.py" -> "app.x"
            "module_or_unit": source.path.replace("/", ".").removesuffix(".py"),
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_SOURCE_CHUNKS,
            lang="python",
            source=source,
            title=chunk.title,
            text=chunk.text,
            span=RagSpan(chunk.start_line, chunk.end_line),
            metadata=meta,
        )
|
||||||
Binary file not shown.
Binary file not shown.
29
app/modules/rag/indexing/code/edges/document_builder.py
Normal file
29
app/modules/rag/indexing/code/edges/document_builder.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
|
||||||
|
from app.modules.rag.indexing.code.edges.extractor import PyEdge
|
||||||
|
|
||||||
|
|
||||||
|
class EdgeDocumentBuilder:
    """Wraps a PyEdge into a C2_DEPENDENCY_GRAPH RagDocument."""

    def build(self, source: RagSource, edge: PyEdge) -> RagDocument:
        """Build the layer-C2 document for one dependency edge."""
        destination = edge.dst_ref or edge.dst_symbol_id or "unknown"
        meta = {
            "edge_id": edge.edge_id,
            "edge_type": edge.edge_type,
            "src_symbol_id": edge.src_symbol_id,
            "src_qname": edge.src_qname,
            "dst_symbol_id": edge.dst_symbol_id,
            "dst_ref": edge.dst_ref,
            "resolution": edge.resolution,
            "lang_payload": edge.metadata,
            "artifact_type": "CODE",
        }
        evidence = EvidenceLink(
            type=EvidenceType.EDGE,
            target_id=edge.edge_id,
            path=source.path,
            start_line=edge.start_line,
            end_line=edge.end_line,
        )
        return RagDocument(
            layer=RagLayer.CODE_DEPENDENCY_GRAPH,
            lang="python",
            source=source,
            title=f"{edge.src_qname}:{edge.edge_type}",
            text=f"{edge.src_qname} {edge.edge_type} {destination}",
            span=RagSpan(edge.start_line, edge.end_line),
            metadata=meta,
            links=[evidence],
        )
|
||||||
114
app/modules/rag/indexing/code/edges/extractor.py
Normal file
114
app/modules/rag/indexing/code/edges/extractor.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import ast
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from hashlib import sha256
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
|
||||||
|
class PyEdge:
|
||||||
|
edge_id: str
|
||||||
|
edge_type: str
|
||||||
|
src_symbol_id: str
|
||||||
|
src_qname: str
|
||||||
|
dst_symbol_id: str | None
|
||||||
|
dst_ref: str | None
|
||||||
|
path: str
|
||||||
|
start_line: int
|
||||||
|
end_line: int
|
||||||
|
resolution: str = "partial"
|
||||||
|
metadata: dict = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class EdgeExtractor:
|
||||||
|
def extract(self, path: str, ast_tree: ast.AST | None, symbols: list) -> list[PyEdge]:
|
||||||
|
if ast_tree is None:
|
||||||
|
return []
|
||||||
|
qname_map = {symbol.qname: symbol.symbol_id for symbol in symbols}
|
||||||
|
visitor = _EdgeVisitor(path, qname_map)
|
||||||
|
visitor.visit(ast_tree)
|
||||||
|
return visitor.edges
|
||||||
|
|
||||||
|
|
||||||
|
class _EdgeVisitor(ast.NodeVisitor):
|
||||||
|
def __init__(self, path: str, qname_map: dict[str, str]) -> None:
|
||||||
|
self._path = path
|
||||||
|
self._qname_map = qname_map
|
||||||
|
self._scope: list[str] = []
|
||||||
|
self.edges: list[PyEdge] = []
|
||||||
|
|
||||||
|
def visit_ClassDef(self, node: ast.ClassDef) -> None:
|
||||||
|
current = self._enter(node.name)
|
||||||
|
for base in node.bases:
|
||||||
|
self._add_edge("inherits", current, self._name(base), base)
|
||||||
|
self.generic_visit(node)
|
||||||
|
self._scope.pop()
|
||||||
|
|
||||||
|
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
|
||||||
|
self._visit_function(node)
|
||||||
|
|
||||||
|
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
|
||||||
|
self._visit_function(node)
|
||||||
|
|
||||||
|
def visit_Import(self, node: ast.Import) -> None:
|
||||||
|
current = self._current_qname()
|
||||||
|
if not current:
|
||||||
|
return
|
||||||
|
for item in node.names:
|
||||||
|
self._add_edge("imports", current, item.name, node)
|
||||||
|
|
||||||
|
def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
|
||||||
|
current = self._current_qname()
|
||||||
|
if not current:
|
||||||
|
return
|
||||||
|
module = node.module or ""
|
||||||
|
for item in node.names:
|
||||||
|
self._add_edge("imports", current, f"{module}.{item.name}".strip("."), node)
|
||||||
|
|
||||||
|
def _visit_function(self, node) -> None:
|
||||||
|
current = self._enter(node.name)
|
||||||
|
for inner in ast.walk(node):
|
||||||
|
if isinstance(inner, ast.Call):
|
||||||
|
self._add_edge("calls", current, self._name(inner.func), inner, {"callsite_kind": "function_call"})
|
||||||
|
self.generic_visit(node)
|
||||||
|
self._scope.pop()
|
||||||
|
|
||||||
|
def _enter(self, name: str) -> str:
|
||||||
|
self._scope.append(name)
|
||||||
|
return self._current_qname() or name
|
||||||
|
|
||||||
|
def _current_qname(self) -> str | None:
|
||||||
|
if not self._scope:
|
||||||
|
return None
|
||||||
|
return ".".join(self._scope)
|
||||||
|
|
||||||
|
def _add_edge(self, edge_type: str, src_qname: str, dst_ref: str, node, extra: dict | None = None) -> None:
|
||||||
|
if not dst_ref:
|
||||||
|
return
|
||||||
|
src_symbol_id = self._qname_map.get(src_qname, sha256(src_qname.encode("utf-8")).hexdigest())
|
||||||
|
dst_symbol_id = self._qname_map.get(dst_ref)
|
||||||
|
edge_id = sha256(f"{self._path}|{src_qname}|{edge_type}|{dst_ref}|{getattr(node, 'lineno', 1)}".encode("utf-8")).hexdigest()
|
||||||
|
self.edges.append(
|
||||||
|
PyEdge(
|
||||||
|
edge_id=edge_id,
|
||||||
|
edge_type=edge_type,
|
||||||
|
src_symbol_id=src_symbol_id,
|
||||||
|
src_qname=src_qname,
|
||||||
|
dst_symbol_id=dst_symbol_id,
|
||||||
|
dst_ref=dst_ref,
|
||||||
|
path=self._path,
|
||||||
|
start_line=int(getattr(node, "lineno", 1)),
|
||||||
|
end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
|
||||||
|
resolution="resolved" if dst_symbol_id else "partial",
|
||||||
|
metadata=extra or {},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
def _name(self, node) -> str:
|
||||||
|
if isinstance(node, ast.Name):
|
||||||
|
return node.id
|
||||||
|
if isinstance(node, ast.Attribute):
|
||||||
|
return f"{self._name(node.value)}.{node.attr}"
|
||||||
|
if isinstance(node, ast.Call):
|
||||||
|
return self._name(node.func)
|
||||||
|
return ""
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,26 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
|
||||||
|
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
|
||||||
|
|
||||||
|
|
||||||
|
class EntrypointDocumentBuilder:
    """Wraps a detected Entrypoint into a C3_ENTRYPOINTS RagDocument."""

    def build(self, source: RagSource, entrypoint: Entrypoint) -> RagDocument:
        """Build the layer-C3 document for one entrypoint."""
        meta = {
            "entry_id": entrypoint.entry_id,
            "entry_type": entrypoint.entry_type,
            "framework": entrypoint.framework,
            "route_or_command": entrypoint.route_or_command,
            "handler_symbol_id": entrypoint.handler_symbol_id,
            "lang_payload": entrypoint.metadata,
            "artifact_type": "CODE",
        }
        evidence = EvidenceLink(
            type=EvidenceType.CODE_SPAN,
            target_id=entrypoint.entry_id,
            path=source.path,
            start_line=entrypoint.start_line,
            end_line=entrypoint.end_line,
        )
        return RagDocument(
            layer=RagLayer.CODE_ENTRYPOINTS,
            lang="python",
            source=source,
            title=entrypoint.route_or_command,
            text=f"{entrypoint.framework} {entrypoint.entry_type} {entrypoint.route_or_command}",
            span=RagSpan(entrypoint.start_line, entrypoint.end_line),
            metadata=meta,
            links=[evidence],
        )
|
||||||
@@ -0,0 +1,34 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from hashlib import sha256
|
||||||
|
|
||||||
|
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
|
||||||
|
|
||||||
|
|
||||||
|
class FastApiEntrypointDetector:
    """Detects FastAPI HTTP handlers by their route decorators on extracted symbols."""

    _METHODS = {"get", "post", "put", "patch", "delete"}

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one http Entrypoint per FastAPI-style route decorator on *symbols*."""
        items: list[Entrypoint] = []
        for symbol in symbols:
            decorators = symbol.decorators or []
            for decorator in decorators:
                name = decorator.lower()
                # BUG FIX: the HTTP-method tail must come from the decorator's
                # dotted name only. Previously the call arguments were kept
                # ('app.get("/x")' -> tail 'get("/x")'), so method decorators
                # recorded with their argument list never matched _METHODS.
                tail = name.split("(", 1)[0].split(".")[-1]
                if tail not in self._METHODS and ".route" not in name:
                    continue
                # Best-effort route string: the decorator's argument list when present.
                route = decorator.split("(")[-1].rstrip(")") if "(" in decorator else decorator
                items.append(
                    Entrypoint(
                        entry_id=sha256(f"{path}|fastapi|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(),
                        entry_type="http",
                        framework="fastapi",
                        route_or_command=route,
                        handler_symbol_id=symbol.symbol_id,
                        path=path,
                        start_line=symbol.start_line,
                        end_line=symbol.end_line,
                        metadata={"methods": [tail.upper()] if tail in self._METHODS else []},
                    )
                )
        return items
|
||||||
28
app/modules/rag/indexing/code/entrypoints/flask_detector.py
Normal file
28
app/modules/rag/indexing/code/entrypoints/flask_detector.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from hashlib import sha256
|
||||||
|
|
||||||
|
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
|
||||||
|
|
||||||
|
|
||||||
|
class FlaskEntrypointDetector:
    """Detects Flask HTTP handlers via `.route` decorators on extracted symbols."""

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one http Entrypoint per `.route` decorator on *symbols*."""
        found: list[Entrypoint] = []
        for symbol in symbols:
            decorators = symbol.decorators or []
            for decorator in (d for d in decorators if ".route" in d.lower()):
                raw_id = f"{path}|flask|{symbol.symbol_id}|{decorator}"
                found.append(
                    Entrypoint(
                        entry_id=sha256(raw_id.encode("utf-8")).hexdigest(),
                        entry_type="http",
                        framework="flask",
                        route_or_command=decorator,
                        handler_symbol_id=symbol.symbol_id,
                        path=path,
                        start_line=symbol.start_line,
                        end_line=symbol.end_line,
                    )
                )
        return found
|
||||||
27
app/modules/rag/indexing/code/entrypoints/registry.py
Normal file
27
app/modules/rag/indexing/code/entrypoints/registry.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
|
||||||
|
class Entrypoint:
|
||||||
|
entry_id: str
|
||||||
|
entry_type: str
|
||||||
|
framework: str
|
||||||
|
route_or_command: str
|
||||||
|
handler_symbol_id: str
|
||||||
|
path: str
|
||||||
|
start_line: int
|
||||||
|
end_line: int
|
||||||
|
metadata: dict = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class EntrypointDetectorRegistry:
|
||||||
|
def __init__(self, detectors: list) -> None:
|
||||||
|
self._detectors = detectors
|
||||||
|
|
||||||
|
def detect_all(self, *, path: str, symbols: list) -> list[Entrypoint]:
|
||||||
|
items: list[Entrypoint] = []
|
||||||
|
for detector in self._detectors:
|
||||||
|
items.extend(detector.detect(path=path, symbols=symbols))
|
||||||
|
return items
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from hashlib import sha256
|
||||||
|
|
||||||
|
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
|
||||||
|
|
||||||
|
|
||||||
|
class TyperClickEntrypointDetector:
    """Detects Typer/Click CLI commands via `.command`/`.callback` decorators."""

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one cli Entrypoint per Typer/Click decorator on *symbols*."""
        found: list[Entrypoint] = []
        for symbol in symbols:
            for decorator in symbol.decorators or []:
                lowered = decorator.lower()
                is_cli = ".command" in lowered or ".callback" in lowered
                if not is_cli:
                    continue
                # "typer" anywhere in the decorator wins; everything else is treated as click.
                framework = "typer" if "typer" in lowered else "click"
                raw_id = f"{path}|{framework}|{symbol.symbol_id}|{decorator}"
                found.append(
                    Entrypoint(
                        entry_id=sha256(raw_id.encode("utf-8")).hexdigest(),
                        entry_type="cli",
                        framework=framework,
                        route_or_command=decorator,
                        handler_symbol_id=symbol.symbol_id,
                        path=path,
                        start_line=symbol.start_line,
                        end_line=symbol.end_line,
                    )
                )
        return found
|
||||||
13
app/modules/rag/indexing/code/file_filter.py
Normal file
13
app/modules/rag/indexing/code/file_filter.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import PurePosixPath
|
||||||
|
|
||||||
|
|
||||||
|
class PythonFileFilter:
    """Decides which repository paths are worth indexing as Python sources."""

    _EXCLUDE_PARTS = {"venv", ".venv", "__pycache__", "node_modules", ".git", "dist", "build"}

    def should_index(self, path: str) -> bool:
        """True for .py files not located under a vendored/generated directory."""
        candidate = PurePosixPath(path)
        if candidate.suffix.lower() != ".py":
            return False
        return self._EXCLUDE_PARTS.isdisjoint(candidate.parts)
|
||||||
52
app/modules/rag/indexing/code/pipeline.py
Normal file
52
app/modules/rag/indexing/code/pipeline.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import RagDocument, RagSource
|
||||||
|
from app.modules.rag.indexing.code.code_text.chunker import CodeTextChunker
|
||||||
|
from app.modules.rag.indexing.code.code_text.document_builder import CodeTextDocumentBuilder
|
||||||
|
from app.modules.rag.indexing.code.edges.document_builder import EdgeDocumentBuilder
|
||||||
|
from app.modules.rag.indexing.code.edges.extractor import EdgeExtractor
|
||||||
|
from app.modules.rag.indexing.code.entrypoints.document_builder import EntrypointDocumentBuilder
|
||||||
|
from app.modules.rag.indexing.code.entrypoints.fastapi_detector import FastApiEntrypointDetector
|
||||||
|
from app.modules.rag.indexing.code.entrypoints.flask_detector import FlaskEntrypointDetector
|
||||||
|
from app.modules.rag.indexing.code.entrypoints.registry import EntrypointDetectorRegistry
|
||||||
|
from app.modules.rag.indexing.code.entrypoints.typer_click_detector import TyperClickEntrypointDetector
|
||||||
|
from app.modules.rag.indexing.code.file_filter import PythonFileFilter
|
||||||
|
from app.modules.rag.indexing.code.symbols.ast_parser import PythonAstParser
|
||||||
|
from app.modules.rag.indexing.code.symbols.document_builder import SymbolDocumentBuilder
|
||||||
|
from app.modules.rag.indexing.code.symbols.extractor import SymbolExtractor
|
||||||
|
|
||||||
|
|
||||||
|
class CodeIndexingPipeline:
    """End-to-end CODE indexing for one Python file: chunks (C0), symbols (C1),
    edges (C2) and entrypoints (C3)."""

    def __init__(self) -> None:
        self._filter = PythonFileFilter()
        self._chunker = CodeTextChunker()
        self._code_builder = CodeTextDocumentBuilder()
        self._parser = PythonAstParser()
        self._symbols = SymbolExtractor()
        self._symbol_builder = SymbolDocumentBuilder()
        self._edges = EdgeExtractor()
        self._edge_builder = EdgeDocumentBuilder()
        detectors = [FastApiEntrypointDetector(), FlaskEntrypointDetector(), TyperClickEntrypointDetector()]
        self._entrypoints = EntrypointDetectorRegistry(detectors)
        self._entrypoint_builder = EntrypointDocumentBuilder()

    def supports(self, path: str) -> bool:
        """True when *path* passes the Python file filter."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Produce every C0-C3 RagDocument for one file's content."""
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        docs: list[RagDocument] = [
            self._code_builder.build(source, chunk, chunk_index=index)
            for index, chunk in enumerate(self._chunker.chunk(path, content))
        ]
        # The AST is parsed once and shared by symbol and edge extraction.
        tree = self._parser.parse_module(content)
        symbols = self._symbols.extract(path, content, tree)
        docs.extend(self._symbol_builder.build(source, symbol) for symbol in symbols)
        docs.extend(self._edge_builder.build(source, edge) for edge in self._edges.extract(path, tree, symbols))
        docs.extend(
            self._entrypoint_builder.build(source, entrypoint)
            for entrypoint in self._entrypoints.detect_all(path=path, symbols=symbols)
        )
        return docs
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
11
app/modules/rag/indexing/code/symbols/ast_parser.py
Normal file
11
app/modules/rag/indexing/code/symbols/ast_parser.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import ast
|
||||||
|
|
||||||
|
|
||||||
|
class PythonAstParser:
|
||||||
|
def parse_module(self, text: str) -> ast.AST | None:
|
||||||
|
try:
|
||||||
|
return ast.parse(text)
|
||||||
|
except SyntaxError:
|
||||||
|
return None
|
||||||
32
app/modules/rag/indexing/code/symbols/document_builder.py
Normal file
32
app/modules/rag/indexing/code/symbols/document_builder.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
|
||||||
|
from app.modules.rag.indexing.code.symbols.extractor import PySymbol
|
||||||
|
|
||||||
|
|
||||||
|
class SymbolDocumentBuilder:
    """Wraps an extracted PySymbol into a C1_SYMBOL_CATALOG RagDocument."""

    def build(self, source: RagSource, symbol: PySymbol) -> RagDocument:
        """Build the layer-C1 document for one symbol."""
        # Document text: "<kind> <qname>", the signature, then the docstring when present.
        parts = [f"{symbol.kind} {symbol.qname}", symbol.signature]
        if symbol.docstring:
            parts.append(symbol.docstring.strip())
        text = "\n".join(part for part in parts if part)
        meta = {
            "symbol_id": symbol.symbol_id,
            "qname": symbol.qname,
            "kind": symbol.kind,
            "signature": symbol.signature,
            "decorators_or_annotations": symbol.decorators,
            "docstring_or_javadoc": symbol.docstring,
            "parent_symbol_id": symbol.parent_symbol_id,
            "package_or_module": source.path.replace("/", ".").removesuffix(".py"),
            # Any decorated symbol is a candidate handler for entrypoint detection.
            "is_entry_candidate": bool(symbol.decorators),
            "lang_payload": symbol.lang_payload,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_SYMBOL_CATALOG,
            lang="python",
            source=source,
            title=symbol.qname,
            text=text,
            span=RagSpan(symbol.start_line, symbol.end_line),
            metadata=meta,
        )
|
||||||
130
app/modules/rag/indexing/code/symbols/extractor.py
Normal file
130
app/modules/rag/indexing/code/symbols/extractor.py
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import ast
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from hashlib import sha256
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
class PySymbol:
    """One extracted Python symbol (class, function, method, or import alias)."""

    symbol_id: str  # sha256 over path|qname|kind
    qname: str  # dotted name within the module
    kind: str  # "class" | "function" | "method" | "const" (used for import aliases)
    path: str
    start_line: int
    end_line: int
    signature: str
    decorators: list[str] = field(default_factory=list)
    docstring: str | None = None
    parent_symbol_id: str | None = None
    lang_payload: dict = field(default_factory=dict)  # language-specific extras (bases, async, import info)
|
||||||
|
|
||||||
|
|
||||||
|
class SymbolExtractor:
    """Extracts PySymbol records from a parsed Python module."""

    def extract(self, path: str, text: str, ast_tree: ast.AST | None) -> list[PySymbol]:
        """Return the symbols of *path*; empty when the module failed to parse."""
        if ast_tree is None:
            return []
        visitor = _SymbolVisitor(path)
        visitor.visit(ast_tree)
        return visitor.symbols
|
||||||
|
|
||||||
|
|
||||||
|
class _SymbolVisitor(ast.NodeVisitor):
    """Collects module-level imports plus class/function/method symbols.

    A scope stack of ``(kind, qname, symbol_id)`` tracks the enclosing
    definitions so nested symbols receive dotted qualified names and a
    reference to their parent symbol.
    """

    def __init__(self, path: str) -> None:
        self._path = path
        # (kind, qname, symbol_id) for each enclosing definition, outermost first.
        self._stack: list[tuple[str, str, str]] = []
        self.symbols: list[PySymbol] = []

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        """Record ``from x import y`` aliases; only at module level."""
        if self._stack:
            return
        module = node.module or ""
        for item in node.names:
            local_name = item.asname or item.name
            # strip(".") covers relative imports where node.module is None.
            imported_name = f"{module}.{item.name}".strip(".")
            self.symbols.append(
                PySymbol(
                    symbol_id=sha256(f"{self._path}|{local_name}|import_alias".encode("utf-8")).hexdigest(),
                    qname=local_name,
                    kind="const",
                    path=self._path,
                    start_line=int(getattr(node, "lineno", 1)),
                    end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
                    signature=f"{local_name} = {imported_name}",
                    lang_payload={"imported_from": imported_name, "import_alias": True},
                )
            )
        self.generic_visit(node)

    def visit_Import(self, node: ast.Import) -> None:
        """Record ``import x`` statements; only at module level."""
        if self._stack:
            return
        for item in node.names:
            # BUG FIX: "import a.b" binds the top-level name "a" (not "a.b");
            # only an explicit "as" alias binds a different name.
            local_name = item.asname or item.name.split(".")[0]
            self.symbols.append(
                PySymbol(
                    symbol_id=sha256(f"{self._path}|{local_name}|import".encode("utf-8")).hexdigest(),
                    qname=local_name,
                    kind="const",
                    path=self._path,
                    start_line=int(getattr(node, "lineno", 1)),
                    end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
                    signature=f"import {item.name}",
                    lang_payload={"imported_from": item.name, "import_alias": bool(item.asname)},
                )
            )
        self.generic_visit(node)

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        self._add_symbol(node, "class", {"bases": [self._expr_name(base) for base in node.bases]})
        self.generic_visit(node)
        self._stack.pop()

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        self._add_function(node, is_async=False)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        self._add_function(node, is_async=True)

    def _add_function(self, node, *, is_async: bool) -> None:
        # A def nested directly inside a class body is a method.
        kind = "method" if self._stack and self._stack[-1][0] == "class" else "function"
        self._add_symbol(node, kind, {"async": is_async})
        self.generic_visit(node)
        self._stack.pop()

    def _add_symbol(self, node, kind: str, lang_payload: dict) -> None:
        """Record a symbol for *node* and push its scope onto the stack."""
        # BUG FIX: the stack stores full dotted qnames, so joining every frame
        # duplicated prefixes for doubly-nested defs ("A.A.f.g"); derive the
        # qname from the innermost enclosing scope only.
        qname = f"{self._stack[-1][1]}.{node.name}" if self._stack else node.name
        symbol_id = sha256(f"{self._path}|{qname}|{kind}".encode("utf-8")).hexdigest()
        signature = self._signature(node)
        symbol = PySymbol(
            symbol_id=symbol_id,
            qname=qname,
            kind=kind,
            path=self._path,
            start_line=int(getattr(node, "lineno", 1)),
            end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
            signature=signature,
            decorators=[self._expr_name(item) for item in getattr(node, "decorator_list", [])],
            docstring=ast.get_docstring(node),
            # BUG FIX: this field previously held the parent's qname even
            # though it is declared as a symbol_id reference.
            parent_symbol_id=self._stack[-1][2] if self._stack else None,
            lang_payload=lang_payload,
        )
        self.symbols.append(symbol)
        self._stack.append((kind, qname, symbol_id))

    def _signature(self, node) -> str:
        """Human-readable one-line signature for a class or function node."""
        if isinstance(node, ast.ClassDef):
            bases = ", ".join(self._expr_name(base) for base in node.bases)
            return f"{node.name}({bases})" if bases else node.name
        # NOTE(review): positional-only, *args/**kwargs and keyword-only args
        # are not included here — this mirrors the original behavior.
        args = [arg.arg for arg in getattr(node.args, "args", [])]
        return f"{node.name}({', '.join(args)})"

    def _expr_name(self, node) -> str:
        """Best-effort dotted-name rendering of an expression (bases/decorators)."""
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            return f"{self._expr_name(node.value)}.{node.attr}"
        if isinstance(node, ast.Call):
            return self._expr_name(node.func)
        # Fallback: a stable textual dump for expressions we don't unpack.
        return ast.dump(node, include_attributes=False)
|
||||||
15
app/modules/rag/indexing/common/document_upserter.py
Normal file
15
app/modules/rag/indexing/common/document_upserter.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import RagDocument
|
||||||
|
from app.modules.rag.persistence.repository import RagRepository
|
||||||
|
|
||||||
|
|
||||||
|
class RagDocumentUpserter:
    """Thin write facade over ``RagRepository`` for one RAG session's documents."""

    def __init__(self, repository: RagRepository) -> None:
        self._repository = repository

    def replace(self, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Drop every stored document of the session and store *docs* instead."""
        self._repository.replace_documents(rag_session_id, docs)

    def apply_changes(self, rag_session_id: str, delete_paths: list[str], docs: list[RagDocument]) -> None:
        """Incrementally remove *delete_paths* and upsert *docs* for the session."""
        self._repository.apply_document_changes(rag_session_id, delete_paths, docs)
|
||||||
21
app/modules/rag/indexing/common/report.py
Normal file
21
app/modules/rag/indexing/common/report.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
|
||||||
|
class IndexReport:
|
||||||
|
indexed_files: int = 0
|
||||||
|
failed_files: int = 0
|
||||||
|
cache_hit_files: int = 0
|
||||||
|
cache_miss_files: int = 0
|
||||||
|
documents: int = 0
|
||||||
|
warnings: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
def as_tuple(self) -> tuple[int, int, int, int]:
|
||||||
|
return (
|
||||||
|
self.indexed_files,
|
||||||
|
self.failed_files,
|
||||||
|
self.cache_hit_files,
|
||||||
|
self.cache_miss_files,
|
||||||
|
)
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
77
app/modules/rag/indexing/docs/chunkers/markdown_chunker.py
Normal file
77
app/modules/rag/indexing/docs/chunkers/markdown_chunker.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
|
||||||
|
class SectionChunk:
|
||||||
|
section_path: str
|
||||||
|
section_title: str
|
||||||
|
content: str
|
||||||
|
order: int
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownDocChunker:
    """Split markdown text into per-heading sections.

    Heading nesting is tracked so each section carries a breadcrumb path
    ("H1 > H2 > ..."). A document with no headings at all falls back to the
    plain text chunker, producing "Document" sections.
    """

    def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
        self._fallback = text_chunker or DocTextChunker()

    def chunk(self, text: str) -> list[SectionChunk]:
        """Return the document's sections, or fallback chunks when it has none."""
        collected: list[SectionChunk] = []
        breadcrumbs: list[tuple[int, str]] = []
        active_title = "Document"
        buffer: list[str] = []
        position = 0
        for raw_line in text.splitlines():
            parsed = self._heading(raw_line)
            if parsed is None:
                buffer.append(raw_line)
                continue
            # A new heading closes the section accumulated so far.
            self._flush_section(collected, breadcrumbs, active_title, buffer, position)
            position += 1
            level, heading_title = parsed
            # Drop breadcrumbs at the same or deeper level before descending.
            breadcrumbs = [entry for entry in breadcrumbs if entry[0] < level]
            breadcrumbs.append((level, heading_title))
            active_title = heading_title
            buffer = []
        self._flush_section(collected, breadcrumbs, active_title, buffer, position)
        if collected:
            return collected
        return [
            SectionChunk(section_path="Document", section_title="Document", content=piece, order=index)
            for index, piece in enumerate(self._fallback.split(text))
        ]

    def _flush_section(
        self,
        sections: list[SectionChunk],
        stack: list[tuple[int, str]],
        current_title: str,
        current_lines: list[str],
        order: int,
    ) -> None:
        """Append a SectionChunk for the buffered lines; empty bodies are dropped."""
        body = "\n".join(current_lines).strip()
        if not body:
            return
        crumbs = [name for _, name in stack] or [current_title]
        sections.append(
            SectionChunk(
                section_path=" > ".join(crumbs),
                section_title=crumbs[-1],
                content=body,
                order=order,
            )
        )

    def _heading(self, line: str) -> tuple[int, str] | None:
        """Return ``(level, title)`` for an ATX-style heading line, else None."""
        candidate = line.strip()
        if not candidate.startswith("#"):
            return None
        depth = len(candidate) - len(candidate.lstrip("#"))
        title = candidate[depth:].strip()
        if not title:
            return None
        return depth, title
|
||||||
21
app/modules/rag/indexing/docs/chunkers/text_chunker.py
Normal file
21
app/modules/rag/indexing/docs/chunkers/text_chunker.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
||||||
|
class DocTextChunker:
|
||||||
|
def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None:
|
||||||
|
self._max_chars = max_chars
|
||||||
|
self._overlap_chars = overlap_chars
|
||||||
|
|
||||||
|
def split(self, text: str) -> list[str]:
|
||||||
|
cleaned = text.strip()
|
||||||
|
if not cleaned:
|
||||||
|
return []
|
||||||
|
chunks: list[str] = []
|
||||||
|
start = 0
|
||||||
|
while start < len(cleaned):
|
||||||
|
end = min(len(cleaned), start + self._max_chars)
|
||||||
|
chunks.append(cleaned[start:end].strip())
|
||||||
|
if end >= len(cleaned):
|
||||||
|
break
|
||||||
|
start = max(0, end - self._overlap_chars)
|
||||||
|
return [chunk for chunk in chunks if chunk]
|
||||||
18
app/modules/rag/indexing/docs/classifier.py
Normal file
18
app/modules/rag/indexing/docs/classifier.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import PurePosixPath
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import DocKind
|
||||||
|
|
||||||
|
|
||||||
|
class DocsClassifier:
    """Heuristically map a documentation file path to a ``DocKind`` value."""

    def classify(self, path: str) -> str:
        """Classify *path* by filename prefix and path substring.

        Precedence matters: runbooks beat READMEs, which beat specs;
        everything else is MISC.
        """
        filename_upper = PurePosixPath(path).name.upper()
        path_lower = path.lower()
        if "runbook" in path_lower or filename_upper.startswith("RUNBOOK"):
            return DocKind.RUNBOOK
        if filename_upper.startswith("README"):
            return DocKind.README
        if "spec" in path_lower or "architecture" in path_lower:
            return DocKind.SPEC
        return DocKind.MISC
|
||||||
115
app/modules/rag/indexing/docs/document_builder.py
Normal file
115
app/modules/rag/indexing/docs/document_builder.py
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from hashlib import sha256
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource
|
||||||
|
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import SectionChunk
|
||||||
|
|
||||||
|
|
||||||
|
class DocsDocumentBuilder:
    """Builds RagDocument payloads for the documentation indexing layers.

    Each ``build_*`` method targets one RagLayer. Free text stored in metadata
    or document bodies is truncated to 4000 characters throughout.
    """

    def build_module_catalog(self, source: RagSource, frontmatter: dict, summary_text: str, doc_kind: str) -> RagDocument | None:
        """Build a DOCS_MODULE_CATALOG document.

        Returns None unless the frontmatter provides all of id/type/domain —
        i.e. only properly annotated module docs enter the catalog.
        """
        module_id = str(frontmatter.get("id") or "").strip()
        module_type = str(frontmatter.get("type") or "").strip()
        domain = str(frontmatter.get("domain") or "").strip()
        if not module_id or not module_type or not domain:
            return None
        links = frontmatter.get("links") or {}
        metadata = {
            "module_id": module_id,
            "type": module_type,
            "domain": domain,
            "status": frontmatter.get("status"),
            "version": frontmatter.get("version"),
            "tags": frontmatter.get("tags") or [],
            "owners": frontmatter.get("owners") or [],
            "links": links,
            "source_path": source.path,
            "summary_text": summary_text[:4000],
            "doc_kind": doc_kind,
        }
        # Flatten the well-known link kinds to top-level metadata keys;
        # kinds absent from the frontmatter become empty lists.
        metadata.update({name: links.get(name, []) for name in (
            "calls_api", "called_by", "uses_logic", "used_by", "reads_db", "writes_db",
            "integrates_with", "emits_events", "consumes_events",
        )})
        return RagDocument(
            layer=RagLayer.DOCS_MODULE_CATALOG,
            source=source,
            title=module_id,
            # Fall back to the module id so the document body is never empty.
            text=summary_text[:4000] or module_id,
            metadata=metadata,
        )

    def build_section(self, source: RagSource, chunk: SectionChunk, frontmatter: dict, doc_kind: str) -> RagDocument:
        """Build one DOCS_SECTION_INDEX document for a heading section."""
        # Unlike the catalog, a missing frontmatter id falls back to the path.
        module_id = str(frontmatter.get("id") or source.path)
        metadata = {
            "module_id": module_id,
            "type": frontmatter.get("type"),
            "domain": frontmatter.get("domain"),
            "tags": frontmatter.get("tags") or [],
            "section_path": chunk.section_path,
            "section_title": chunk.section_title,
            "order": chunk.order,
            "doc_kind": doc_kind,
            "source_path": source.path,
            "artifact_type": "DOCS",
        }
        return RagDocument(
            layer=RagLayer.DOCS_SECTION_INDEX,
            source=source,
            title=f"{module_id}:{chunk.section_title}",
            text=chunk.content,
            metadata=metadata,
        )

    def build_policy(self, source: RagSource, frontmatter: dict, chunk: SectionChunk, doc_kind: str) -> RagDocument | None:
        """Build a DOCS_POLICY_INDEX document; None when the frontmatter has no id."""
        policy_id = str(frontmatter.get("id") or "").strip()
        # "applies_to" may be declared explicitly or inherited from "type".
        applies_to = frontmatter.get("applies_to") or frontmatter.get("type")
        if not policy_id:
            return None
        metadata = {
            "policy_id": policy_id,
            "applies_to": applies_to,
            "rules": chunk.content[:4000],
            "default_behaviors": frontmatter.get("default_behaviors") or [],
            "doc_kind": doc_kind,
            "section_path": chunk.section_path,
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_POLICY_INDEX,
            source=source,
            title=policy_id,
            text=chunk.content[:4000],
            metadata=metadata,
        )

    def build_fact(
        self,
        source: RagSource,
        *,
        subject_id: str,
        predicate: str,
        obj: str,
        object_ref: str | None,
        anchor: str,
        tags: list[str] | None = None,
    ) -> RagDocument:
        """Build a DOCS_FACT_INDEX document for one (subject, predicate, object) triple.

        The fact id hashes the full triple plus its location so the same fact
        stated in two places yields two distinct documents.
        """
        fact_id = sha256(f"{subject_id}|{predicate}|{obj}|{source.path}|{anchor}".encode("utf-8")).hexdigest()
        metadata = {
            "fact_id": fact_id,
            "subject_id": subject_id,
            "predicate": predicate,
            "object": obj,
            "object_ref": object_ref,
            "anchor": anchor,
            "tags": tags or [],
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_FACT_INDEX,
            source=source,
            title=f"{subject_id}:{predicate}",
            text=f"{subject_id} {predicate} {obj}".strip(),
            metadata=metadata,
            # Evidence link lets retrieval trace the fact back to its doc line.
            links=[EvidenceLink(type=EvidenceType.DOC_FACT, target_id=fact_id, path=source.path, note=anchor)],
        )
|
||||||
21
app/modules/rag/indexing/docs/file_filter.py
Normal file
21
app/modules/rag/indexing/docs/file_filter.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import PurePosixPath
|
||||||
|
|
||||||
|
|
||||||
|
class DocsFileFilter:
    """Decides whether a repository path looks like human documentation."""

    _EXTENSIONS = {".md", ".rst", ".txt", ".adoc"}
    _NAMES = ("README", "CHANGELOG", "CONTRIBUTING", "ARCHITECTURE", "SPEC", "RUNBOOK")
    _EXCLUDE_PARTS = {"node_modules", ".git", "vendor", "dist", "build", "target", "__pycache__"}

    def should_index(self, path: str) -> bool:
        """True when *path* is documentation by extension, well-known name, or folder.

        Anything below a vendored/build directory is rejected outright.
        """
        candidate = PurePosixPath(path)
        if any(segment in self._EXCLUDE_PARTS for segment in candidate.parts):
            return False
        if candidate.suffix.lower() in self._EXTENSIONS:
            return True
        # str.startswith accepts a tuple of prefixes, so one call covers all names.
        if candidate.name.upper().startswith(self._NAMES):
            return True
        lowered = "/".join(candidate.parts).lower()
        return any(marker in lowered for marker in ("docs/", "doc/", "documentation/"))
|
||||||
135
app/modules/rag/indexing/docs/pipeline.py
Normal file
135
app/modules/rag/indexing/docs/pipeline.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import RagDocument, RagSource
|
||||||
|
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker
|
||||||
|
from app.modules.rag.indexing.docs.classifier import DocsClassifier
|
||||||
|
from app.modules.rag.indexing.docs.document_builder import DocsDocumentBuilder
|
||||||
|
from app.modules.rag.indexing.docs.file_filter import DocsFileFilter
|
||||||
|
|
||||||
|
|
||||||
|
class DocsIndexingPipeline:
    """Turns one documentation file into layered RAG documents.

    Per file it produces, in order: an optional module-catalog document (when
    frontmatter identifies a module), one document per markdown section, an
    optional policy document (policy-typed docs only), and extracted facts.
    """

    def __init__(self) -> None:
        self._filter = DocsFileFilter()
        self._classifier = DocsClassifier()
        self._chunker = MarkdownDocChunker()
        self._builder = DocsDocumentBuilder()

    def supports(self, path: str) -> bool:
        """True when *path* passes the documentation file filter."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Build every RAG document for one file's *content*."""
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        frontmatter, body = self._split_frontmatter(content)
        doc_kind = self._classifier.classify(path)
        sections = self._chunker.chunk(body)
        summary_text = self._summary_from_sections(sections)
        docs: list[RagDocument] = []
        module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind)
        if module_doc is not None:
            docs.append(module_doc)
        for section in sections:
            docs.append(self._builder.build_section(source, section, frontmatter, doc_kind))
        if str(frontmatter.get("type") or "").strip() == "policy":
            # Only the first section feeds the policy index.
            for section in sections[:1]:
                policy = self._builder.build_policy(source, frontmatter, section, doc_kind)
                if policy is not None:
                    docs.append(policy)
        docs.extend(self._extract_facts(source, frontmatter, sections))
        return docs

    def _split_frontmatter(self, content: str) -> tuple[dict, str]:
        """Split leading YAML frontmatter from the body; ``({}, content)`` when absent.

        BUG FIX: a document that opens with "---" but never closes the
        frontmatter block used to raise ValueError while unpacking the split
        result; it is now treated as having no frontmatter.
        """
        if not content.startswith("---\n"):
            return {}, content
        parts = content.split("---", 2)
        if len(parts) < 3:
            return {}, content
        _, raw, body = parts
        payload = yaml.safe_load(raw) or {}
        return (payload if isinstance(payload, dict) else {}), body.strip()

    def _summary_from_sections(self, sections) -> str:
        """Concatenate the first two sections as the module summary (<= 4000 chars)."""
        text = "\n\n".join(section.content for section in sections[:2]).strip()
        return text[:4000]

    def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]:
        """Extract fact documents from frontmatter links, tables, and bullet lists."""
        subject_id = str(frontmatter.get("id") or source.path)
        docs: list[RagDocument] = []
        links = frontmatter.get("links") or {}
        for predicate, values in links.items():
            for value in values or []:
                docs.append(
                    self._builder.build_fact(
                        source,
                        subject_id=subject_id,
                        predicate=predicate,
                        obj=str(value),
                        object_ref=str(value),
                        anchor="frontmatter.links",
                    )
                )
        for section in sections:
            docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content))
            docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content))
        return docs

    def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Parse a markdown table (header, separator, rows) into fact documents."""
        lines = [line.strip() for line in content.splitlines() if line.strip()]
        # Need at least header + separator + one data row, and a pipe in the header.
        if len(lines) < 3 or "|" not in lines[0]:
            return []
        headers = [part.strip().lower() for part in lines[0].strip("|").split("|")]
        if not all(headers):
            return []
        docs: list[RagDocument] = []
        for row in lines[2:]:
            if "|" not in row:
                continue
            values = [part.strip() for part in row.strip("|").split("|")]
            # Skip malformed rows whose cell count doesn't match the header.
            if len(values) != len(headers):
                continue
            payload = dict(zip(headers, values))
            docs.extend(self._facts_from_row(source, subject_id, title, payload))
        return docs

    def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]:
        """Derive facts from one table row, keyed on the section title's topic."""
        docs: list[RagDocument] = []
        name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code")
        if "request" in title.lower() or "response" in title.lower():
            if name:
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_field", obj=name, object_ref=None, anchor=title))
                if payload.get("required"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_required", obj=f"{name}:{payload['required']}", object_ref=None, anchor=title))
                if payload.get("type"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_type", obj=f"{name}:{payload['type']}", object_ref=None, anchor=title))
                if payload.get("validation"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_validation", obj=f"{name}:{payload['validation']}", object_ref=None, anchor=title))
        if "error" in title.lower():
            if payload.get("status"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_status", obj=payload["status"], object_ref=None, anchor=title))
            if payload.get("error") or payload.get("code"):
                error_value = payload.get("error") or payload.get("code")
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_error", obj=error_value, object_ref=None, anchor=title))
            if payload.get("client action"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="client_action", obj=payload["client action"], object_ref=None, anchor=title))
        if "constraint" in title.lower() and name:
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_constraint", obj=name, object_ref=None, anchor=title))
        return docs

    def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Derive facts from bullet/numbered list items in a section."""
        docs: list[RagDocument] = []
        for line in content.splitlines():
            item = line.strip()
            if not item.startswith(("-", "*", "1.", "2.", "3.")):
                continue
            # Strip the list marker; predicate is inferred from a "kind:" prefix.
            normalized = re.sub(r"^[-*0-9. ]+", "", item).strip()
            lowered = normalized.lower()
            if lowered.startswith("metric:"):
                predicate = "emits_metric"
            elif lowered.startswith("event:"):
                predicate = "emits_analytics_event"
            elif lowered.startswith("log:"):
                predicate = "logs_event"
            else:
                predicate = "validates_rule" if "rule" in title.lower() else "client_action"
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title))
        return docs
|
||||||
Binary file not shown.
Binary file not shown.
189
app/modules/rag/persistence/cache_repository.py
Normal file
189
app/modules/rag/persistence/cache_repository.py
Normal file
@@ -0,0 +1,189 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import EvidenceLink, RagDocument, RagSource, RagSpan
|
||||||
|
from app.modules.rag.contracts.enums import RagLayer
|
||||||
|
|
||||||
|
|
||||||
|
class RagCacheRepository:
    """Blob-level cache of RAG documents keyed by (repo_id, blob_sha).

    Two tables back the cache: ``rag_blob_cache`` holds one row of file-level
    metadata per blob, ``rag_chunk_cache`` holds the ordered chunks (content,
    embedding, metadata) for that blob.
    """

    def get_cached_documents(self, repo_id: str, blob_sha: str) -> list[RagDocument]:
        """Load all cached chunks for a blob, in chunk order, as RagDocuments.

        Returns an empty list on a cache miss. The pgvector embedding column is
        cast to text in SQL and re-parsed client-side.
        """
        with self._engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT layer, lang, path, title, content, metadata_json, links_json, span_start, span_end,
                           repo_id, commit_sha, embedding::text AS embedding_txt
                    FROM rag_chunk_cache
                    WHERE repo_id = :repo_id AND blob_sha = :blob_sha
                    ORDER BY chunk_index ASC
                    """
                ),
                {"repo_id": repo_id, "blob_sha": blob_sha},
            ).mappings().fetchall()
        docs: list[RagDocument] = []
        for row in rows:
            metadata = self._loads(row.get("metadata_json"))
            docs.append(
                RagDocument(
                    layer=str(row["layer"]),
                    lang=row.get("lang"),
                    source=RagSource(
                        repo_id=str(row["repo_id"]),
                        commit_sha=row.get("commit_sha"),
                        path=str(row["path"]),
                    ),
                    # Fall back to the path when no title was cached.
                    title=str(row["title"] or row["path"]),
                    text=str(row["content"] or ""),
                    metadata=metadata,
                    links=[EvidenceLink(**item) for item in self._loads(row.get("links_json"), default=[])],
                    span=RagSpan(row.get("span_start"), row.get("span_end")),
                    embedding=self._parse_vector(str(row["embedding_txt"] or "")),
                )
            )
        return docs

    def cache_documents(self, repo_id: str, path: str, blob_sha: str, docs: list[RagDocument]) -> None:
        """Store *docs* as the cached chunks for one blob.

        Upserts the blob-level row (metadata taken from the FIRST document),
        then fully replaces the blob's chunk rows. No-op for an empty list.
        All statements run on one connection and are committed together.
        """
        if not docs:
            return
        with self._engine().connect() as conn:
            first = docs[0].to_record()
            first_meta = first["metadata"]
            conn.execute(
                text(
                    """
                    INSERT INTO rag_blob_cache (
                        repo_id, blob_sha, path, artifact_type, section, doc_id, doc_version, owner,
                        system_component, last_modified, staleness_score, layer, lang, metadata_json
                    )
                    VALUES (
                        :repo_id, :blob_sha, :path, :artifact_type, :section, :doc_id, :doc_version, :owner,
                        :system_component, :last_modified, :staleness_score, :layer, :lang, :metadata_json
                    )
                    ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
                        artifact_type = EXCLUDED.artifact_type,
                        section = EXCLUDED.section,
                        doc_id = EXCLUDED.doc_id,
                        doc_version = EXCLUDED.doc_version,
                        owner = EXCLUDED.owner,
                        system_component = EXCLUDED.system_component,
                        last_modified = EXCLUDED.last_modified,
                        staleness_score = EXCLUDED.staleness_score,
                        layer = EXCLUDED.layer,
                        lang = EXCLUDED.lang,
                        metadata_json = EXCLUDED.metadata_json,
                        updated_at = CURRENT_TIMESTAMP
                    """
                ),
                {
                    "repo_id": repo_id,
                    "blob_sha": blob_sha,
                    "path": path,
                    "artifact_type": first_meta.get("artifact_type"),
                    "section": first_meta.get("section") or first_meta.get("section_title"),
                    "doc_id": first_meta.get("doc_id"),
                    "doc_version": first_meta.get("doc_version"),
                    "owner": first_meta.get("owner"),
                    "system_component": first_meta.get("system_component"),
                    "last_modified": first_meta.get("last_modified"),
                    "staleness_score": first_meta.get("staleness_score"),
                    "layer": first["layer"],
                    "lang": first["lang"],
                    "metadata_json": json.dumps(first_meta, ensure_ascii=True),
                },
            )
            # Replace semantics: drop the old chunk rows before re-inserting.
            conn.execute(
                text("DELETE FROM rag_chunk_cache WHERE repo_id = :repo_id AND blob_sha = :blob_sha"),
                {"repo_id": repo_id, "blob_sha": blob_sha},
            )
            for idx, doc in enumerate(docs):
                row = doc.to_record()
                metadata = row["metadata"]
                emb = row["embedding"] or []
                # pgvector accepts the "[x,y,...]" text form via CAST(... AS vector).
                emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_chunk_cache (
                            repo_id, blob_sha, chunk_index, content, embedding, section, layer, lang, path, title,
                            metadata_json, links_json, span_start, span_end, commit_sha
                        )
                        VALUES (
                            :repo_id, :blob_sha, :chunk_index, :content, CAST(:embedding AS vector), :section, :layer,
                            :lang, :path, :title, :metadata_json, :links_json, :span_start, :span_end, :commit_sha
                        )
                        """
                    ),
                    {
                        "repo_id": repo_id,
                        "blob_sha": blob_sha,
                        "chunk_index": idx,
                        "content": row["text"],
                        "embedding": emb_str,
                        "section": metadata.get("section") or metadata.get("section_title"),
                        "layer": row["layer"],
                        "lang": row["lang"],
                        "path": row["path"],
                        "title": row["title"],
                        "metadata_json": json.dumps(metadata, ensure_ascii=True),
                        "links_json": json.dumps(row["links"], ensure_ascii=True),
                        "span_start": row["span_start"],
                        "span_end": row["span_end"],
                        "commit_sha": row["commit_sha"],
                    },
                )
            conn.commit()

    def record_repo_cache(
        self,
        *,
        project_id: str,
        commit_sha: str | None,
        changed_files: list[str],
        summary: str,
    ) -> None:
        """Cache one marker document per changed file after a repo webhook.

        The blob sha is synthesized from (commit, path) — see ``_blob_sha`` —
        since the real git blob sha is not available here.
        """
        docs: list[RagDocument] = []
        for idx, path in enumerate(changed_files):
            docs.append(
                RagDocument(
                    layer=RagLayer.CODE_SOURCE_CHUNKS,
                    lang="python" if path.endswith(".py") else None,
                    source=RagSource(project_id, commit_sha, path),
                    title=path,
                    text=f"repo_webhook:{path}:{summary[:300]}",
                    metadata={"chunk_index": idx, "artifact_type": "CODE", "section": "repo_webhook"},
                )
            )
        for doc in docs:
            blob_sha = self._blob_sha(commit_sha, doc.source.path)
            doc.metadata["blob_sha"] = blob_sha
            # One cache_documents call (and one transaction) per file.
            self.cache_documents(project_id, doc.source.path, blob_sha, [doc])

    def _blob_sha(self, commit_sha: str | None, path: str) -> str:
        """Deterministic stand-in blob sha derived from commit + path."""
        from hashlib import sha256

        return sha256(f"{commit_sha or 'no-commit'}:{path}".encode("utf-8")).hexdigest()

    def _engine(self):
        """Lazily resolve the shared SQLAlchemy engine (local import avoids cycles)."""
        from app.modules.shared.db import get_engine

        return get_engine()

    def _loads(self, value, default=None):
        """json.loads with a fallback for NULL/empty columns ({} unless overridden)."""
        if default is None:
            default = {}
        if not value:
            return default
        return json.loads(str(value))

    def _parse_vector(self, value: str) -> list[float]:
        """Parse pgvector's textual "[1,2,...]" form into a list of floats."""
        text_value = value.strip()
        if not text_value:
            return []
        if text_value.startswith("[") and text_value.endswith("]"):
            text_value = text_value[1:-1]
        if not text_value:
            return []
        return [float(part.strip()) for part in text_value.split(",") if part.strip()]
|
||||||
122
app/modules/rag/persistence/document_repository.py
Normal file
122
app/modules/rag/persistence/document_repository.py
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import RagDocument
|
||||||
|
|
||||||
|
|
||||||
|
class RagDocumentRepository:
    """Writes RAG chunk rows and their session->blob mapping rows.

    All methods operate on an externally supplied connection; callers own
    transaction boundaries (no commits happen here).
    """

    def replace_documents(self, conn, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Drop every chunk and mapping row for the session, then insert *docs*."""
        conn.execute(text("DELETE FROM rag_chunks WHERE rag_session_id = :sid"), {"sid": rag_session_id})
        conn.execute(text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid"), {"sid": rag_session_id})
        self.insert_documents(conn, rag_session_id, docs)

    def apply_document_changes(
        self,
        conn,
        rag_session_id: str,
        delete_paths: list[str],
        docs: list[RagDocument],
    ) -> None:
        """Incrementally update a session: remove *delete_paths*, upsert *docs*.

        Paths present in *docs* are deleted first so re-inserting them cannot
        leave stale chunks behind (delete-then-insert upsert).
        """
        if delete_paths:
            conn.execute(
                text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                {"sid": rag_session_id, "paths": delete_paths},
            )
            conn.execute(
                text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                {"sid": rag_session_id, "paths": delete_paths},
            )
        if not docs:
            return
        # Deduplicate the affected paths before clearing old rows for them.
        paths = sorted({doc.source.path for doc in docs})
        conn.execute(
            text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
            {"sid": rag_session_id, "paths": paths},
        )
        conn.execute(
            text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
            {"sid": rag_session_id, "paths": paths},
        )
        self.insert_documents(conn, rag_session_id, docs)

    def insert_documents(self, conn, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Insert one rag_chunks row per document, plus a cache-mapping row.

        The mapping row into rag_session_chunk_map is only written when both
        a repo id and a blob SHA are known, i.e. when the chunk is traceable
        back to the blob cache.
        """
        for doc in docs:
            row = doc.to_record()
            metadata = row["metadata"]
            links = row["links"]
            emb = row["embedding"] or []
            # pgvector text literal ("[x,y,...]"); NULL embedding when absent.
            emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
            conn.execute(
                text(
                    """
                    INSERT INTO rag_chunks (
                        rag_session_id, path, chunk_index, content, embedding, artifact_type, section, doc_id,
                        doc_version, owner, system_component, last_modified, staleness_score, created_at, updated_at,
                        rag_doc_id, layer, lang, repo_id, commit_sha, title, metadata_json, links_json, span_start,
                        span_end, symbol_id, qname, kind, framework, entrypoint_type, module_id, section_path, doc_kind
                    )
                    VALUES (
                        :sid, :path, :chunk_index, :content, CAST(:emb AS vector), :artifact_type, :section, :doc_id,
                        :doc_version, :owner, :system_component, :last_modified, :staleness_score, CURRENT_TIMESTAMP,
                        CURRENT_TIMESTAMP, :rag_doc_id, :layer, :lang, :repo_id, :commit_sha, :title, :metadata_json,
                        :links_json, :span_start, :span_end, :symbol_id, :qname, :kind, :framework, :entrypoint_type,
                        :module_id, :section_path, :doc_kind
                    )
                    """
                ),
                {
                    "sid": rag_session_id,
                    "path": row["path"],
                    "chunk_index": int(metadata.get("chunk_index", 0)),
                    "content": row["text"],
                    "emb": emb_str,
                    "artifact_type": metadata.get("artifact_type"),
                    # Older producers used "section_title"; accept either key.
                    "section": metadata.get("section") or metadata.get("section_title"),
                    "doc_id": metadata.get("doc_id"),
                    "doc_version": metadata.get("doc_version"),
                    "owner": metadata.get("owner"),
                    "system_component": metadata.get("system_component"),
                    "last_modified": metadata.get("last_modified"),
                    "staleness_score": metadata.get("staleness_score"),
                    "rag_doc_id": row["doc_id"],
                    "layer": row["layer"],
                    "lang": row["lang"],
                    "repo_id": row["repo_id"],
                    "commit_sha": row["commit_sha"],
                    "title": row["title"],
                    "metadata_json": json.dumps(metadata, ensure_ascii=True),
                    "links_json": json.dumps(links, ensure_ascii=True),
                    "span_start": row["span_start"],
                    "span_end": row["span_end"],
                    "symbol_id": metadata.get("symbol_id"),
                    "qname": metadata.get("qname"),
                    # Legacy key fallbacks for older metadata producers.
                    "kind": metadata.get("kind") or metadata.get("type"),
                    "framework": metadata.get("framework"),
                    "entrypoint_type": metadata.get("entry_type") or metadata.get("entrypoint_type"),
                    "module_id": metadata.get("module_id") or metadata.get("policy_id"),
                    "section_path": metadata.get("section_path"),
                    "doc_kind": metadata.get("doc_kind"),
                },
            )
            repo_id = str(row["repo_id"] or "").strip()
            blob_sha = str(metadata.get("blob_sha") or "").strip()
            if repo_id and blob_sha:
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_session_chunk_map (
                            rag_session_id, repo_id, blob_sha, chunk_index, path
                        ) VALUES (:sid, :repo_id, :blob_sha, :chunk_index, :path)
                        """
                    ),
                    {
                        "sid": rag_session_id,
                        "repo_id": repo_id,
                        "blob_sha": blob_sha,
                        "chunk_index": int(metadata.get("chunk_index", 0)),
                        "path": row["path"],
                    },
                )
|
||||||
95
app/modules/rag/persistence/job_repository.py
Normal file
95
app/modules/rag/persistence/job_repository.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
from app.modules.shared.db import get_engine
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RagJobRow:
    """One row of rag_index_jobs.

    Field order mirrors the SELECT in ``RagJobRepository.get_job`` so the row
    mapping can be splatted straight into the constructor.
    """

    index_job_id: str
    rag_session_id: str
    status: str
    # Progress counters maintained by the indexing pipeline.
    indexed_files: int
    failed_files: int
    cache_hit_files: int
    cache_miss_files: int
    # Error triple; all None while the job is healthy.
    error_code: str | None
    error_desc: str | None
    error_module: str | None
|
||||||
|
|
||||||
|
|
||||||
|
class RagJobRepository:
    """CRUD for rag_index_jobs rows; each method opens and commits its own connection."""

    def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None:
        """Insert a fresh job row with zeroed counters (column defaults)."""
        with get_engine().connect() as conn:
            conn.execute(
                text(
                    """
                    INSERT INTO rag_index_jobs (index_job_id, rag_session_id, status)
                    VALUES (:jid, :sid, :status)
                    """
                ),
                {"jid": index_job_id, "sid": rag_session_id, "status": status},
            )
            conn.commit()

    def update_job(
        self,
        index_job_id: str,
        *,
        status: str,
        indexed_files: int,
        failed_files: int,
        cache_hit_files: int = 0,
        cache_miss_files: int = 0,
        error_code: str | None = None,
        error_desc: str | None = None,
        error_module: str | None = None,
    ) -> None:
        """Overwrite job progress/status and bump updated_at.

        Note: all counter/error fields are written unconditionally, so callers
        must pass the full current state (omitted errors are reset to NULL).
        """
        with get_engine().connect() as conn:
            conn.execute(
                text(
                    """
                    UPDATE rag_index_jobs
                    SET status = :status,
                        indexed_files = :indexed,
                        failed_files = :failed,
                        cache_hit_files = :cache_hit_files,
                        cache_miss_files = :cache_miss_files,
                        error_code = :ecode,
                        error_desc = :edesc,
                        error_module = :emodule,
                        updated_at = CURRENT_TIMESTAMP
                    WHERE index_job_id = :jid
                    """
                ),
                {
                    "jid": index_job_id,
                    "status": status,
                    "indexed": indexed_files,
                    "failed": failed_files,
                    "cache_hit_files": cache_hit_files,
                    "cache_miss_files": cache_miss_files,
                    "ecode": error_code,
                    "edesc": error_desc,
                    "emodule": error_module,
                },
            )
            conn.commit()

    def get_job(self, index_job_id: str) -> RagJobRow | None:
        """Fetch a job as a RagJobRow, or None when the id is unknown."""
        with get_engine().connect() as conn:
            row = conn.execute(
                text(
                    """
                    SELECT index_job_id, rag_session_id, status, indexed_files, failed_files,
                           cache_hit_files, cache_miss_files, error_code, error_desc, error_module
                    FROM rag_index_jobs
                    WHERE index_job_id = :jid
                    """
                ),
                {"jid": index_job_id},
            ).mappings().fetchone()
            # Column names match RagJobRow fields one-to-one.
            return RagJobRow(**dict(row)) if row else None
|
||||||
111
app/modules/rag/persistence/query_repository.py
Normal file
111
app/modules/rag/persistence/query_repository.py
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
from app.modules.rag.retrieval.query_terms import extract_query_terms
|
||||||
|
from app.modules.shared.db import get_engine
|
||||||
|
|
||||||
|
|
||||||
|
class RagQueryRepository:
    """Hybrid retrieval over rag_chunks: lexical rank first, then vector distance."""

    def retrieve(
        self,
        rag_session_id: str,
        query_embedding: list[float],
        *,
        query_text: str = "",
        limit: int = 5,
        layers: list[str] | None = None,
        path_prefixes: list[str] | None = None,
        prefer_non_tests: bool = False,
    ) -> list[dict]:
        """Rank chunks for a query within one session.

        Ordering is: best lexical match (exact qname/symbol/title beats prefix
        beats substring), then a test-file penalty (if requested), then a
        fixed per-layer priority, then pgvector cosine distance. Only the
        ranking expressions are interpolated into the SQL; every user-derived
        value is bound as a parameter.
        """
        # pgvector text literal for the query embedding.
        emb = "[" + ",".join(str(x) for x in query_embedding) + "]"
        filters = ["rag_session_id = :sid"]
        params: dict = {"sid": rag_session_id, "emb": emb, "lim": limit}
        if layers:
            filters.append("layer = ANY(:layers)")
            params["layers"] = layers
        if path_prefixes:
            # OR together one LIKE per prefix, each as its own bound parameter.
            or_filters = []
            for idx, prefix in enumerate(path_prefixes):
                key = f"path_{idx}"
                params[key] = f"{prefix}%"
                or_filters.append(f"path LIKE :{key}")
            filters.append("(" + " OR ".join(or_filters) + ")")
        # One CASE per extracted query term; lower value = stronger match.
        term_filters = []
        terms = extract_query_terms(query_text)
        for idx, term in enumerate(terms):
            exact_key = f"term_exact_{idx}"
            prefix_key = f"term_prefix_{idx}"
            contains_key = f"term_contains_{idx}"
            params[exact_key] = term
            params[prefix_key] = f"{term}%"
            params[contains_key] = f"%{term}%"
            term_filters.append(
                "CASE "
                f"WHEN lower(COALESCE(qname, '')) = :{exact_key} THEN 0 "
                f"WHEN lower(COALESCE(symbol_id, '')) = :{exact_key} THEN 1 "
                f"WHEN lower(COALESCE(title, '')) = :{exact_key} THEN 2 "
                f"WHEN lower(COALESCE(qname, '')) LIKE :{prefix_key} THEN 3 "
                f"WHEN lower(COALESCE(title, '')) LIKE :{prefix_key} THEN 4 "
                f"WHEN lower(COALESCE(path, '')) LIKE :{contains_key} THEN 5 "
                f"WHEN lower(COALESCE(content, '')) LIKE :{contains_key} THEN 6 "
                "ELSE 100 END"
            )
        # Best (lowest) rank across all terms; 100 = no lexical signal.
        lexical_sql = "LEAST(" + ", ".join(term_filters) + ")" if term_filters else "100"
        test_penalty_sql = (
            "CASE "
            "WHEN lower(path) LIKE 'tests/%' OR lower(path) LIKE '%/tests/%' OR lower(path) LIKE 'test_%' OR lower(path) LIKE '%/test_%' "
            "THEN 1 ELSE 0 END"
            if prefer_non_tests
            else "0"
        )
        # Fixed priority within the code (C*) and docs (D*) layer families.
        layer_rank_sql = (
            "CASE "
            "WHEN layer = 'C3_ENTRYPOINTS' THEN 0 "
            "WHEN layer = 'C1_SYMBOL_CATALOG' THEN 1 "
            "WHEN layer = 'C2_DEPENDENCY_GRAPH' THEN 2 "
            "WHEN layer = 'C0_SOURCE_CHUNKS' THEN 3 "
            "WHEN layer = 'D1_MODULE_CATALOG' THEN 0 "
            "WHEN layer = 'D2_FACT_INDEX' THEN 1 "
            "WHEN layer = 'D3_SECTION_INDEX' THEN 2 "
            "WHEN layer = 'D4_POLICY_INDEX' THEN 3 "
            "ELSE 10 END"
        )
        sql = f"""
            SELECT path, content, layer, title, metadata_json, span_start, span_end,
                   {lexical_sql} AS lexical_rank,
                   {test_penalty_sql} AS test_penalty,
                   {layer_rank_sql} AS layer_rank,
                   (embedding <=> CAST(:emb AS vector)) AS distance
            FROM rag_chunks
            WHERE {' AND '.join(filters)}
            ORDER BY lexical_rank ASC, test_penalty ASC, layer_rank ASC, embedding <=> CAST(:emb AS vector)
            LIMIT :lim
        """
        with get_engine().connect() as conn:
            rows = conn.execute(text(sql), params).mappings().fetchall()
            return [self._row_to_dict(row) for row in rows]

    def fallback_chunks(self, rag_session_id: str, *, limit: int = 5, layers: list[str] | None = None) -> list[dict]:
        """Return the newest chunks for a session (no ranking) — used when retrieval has no query signal."""
        filters = ["rag_session_id = :sid"]
        params: dict = {"sid": rag_session_id, "lim": limit}
        if layers:
            filters.append("layer = ANY(:layers)")
            params["layers"] = layers
        sql = f"""
            SELECT path, content, layer, title, metadata_json, span_start, span_end
            FROM rag_chunks
            WHERE {' AND '.join(filters)}
            ORDER BY id DESC
            LIMIT :lim
        """
        with get_engine().connect() as conn:
            rows = conn.execute(text(sql), params).mappings().fetchall()
            return [self._row_to_dict(row) for row in rows]

    def _row_to_dict(self, row) -> dict:
        """Convert a row mapping to a plain dict, decoding metadata_json into ``metadata``."""
        data = dict(row)
        data["metadata"] = json.loads(str(data.pop("metadata_json") or "{}"))
        return data
|
||||||
82
app/modules/rag/persistence/repository.py
Normal file
82
app/modules/rag/persistence/repository.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import RagDocument
|
||||||
|
from app.modules.rag.persistence.cache_repository import RagCacheRepository
|
||||||
|
from app.modules.rag.persistence.document_repository import RagDocumentRepository
|
||||||
|
from app.modules.rag.persistence.job_repository import RagJobRepository, RagJobRow
|
||||||
|
from app.modules.rag.persistence.query_repository import RagQueryRepository
|
||||||
|
from app.modules.rag.persistence.schema_repository import RagSchemaRepository
|
||||||
|
from app.modules.rag.persistence.session_repository import RagSessionRepository
|
||||||
|
from app.modules.shared.db import get_engine
|
||||||
|
|
||||||
|
|
||||||
|
class RagRepository:
    """Facade composing the specialized RAG repositories behind one API.

    Owns connection/transaction handling for the document-write paths; the
    sub-repositories for sessions, jobs, cache and queries manage their own
    connections internally.
    """

    def __init__(self) -> None:
        self._schema = RagSchemaRepository()
        self._sessions = RagSessionRepository()
        self._jobs = RagJobRepository()
        self._documents = RagDocumentRepository()
        self._cache = RagCacheRepository()
        self._query = RagQueryRepository()

    def ensure_tables(self) -> None:
        """Create/upgrade all RAG tables, columns and indexes (idempotent)."""
        self._schema.ensure_tables()

    def upsert_session(self, rag_session_id: str, project_id: str) -> None:
        """Create or update a RAG session record."""
        self._sessions.upsert_session(rag_session_id, project_id)

    def session_exists(self, rag_session_id: str) -> bool:
        """Return True when the session id is known."""
        return self._sessions.session_exists(rag_session_id)

    def get_session(self, rag_session_id: str) -> dict | None:
        """Return the session record as a dict, or None when absent."""
        return self._sessions.get_session(rag_session_id)

    def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None:
        """Insert a new indexing-job row."""
        self._jobs.create_job(index_job_id, rag_session_id, status)

    def update_job(self, index_job_id: str, **kwargs) -> None:
        """Update job progress/status; kwargs mirror RagJobRepository.update_job."""
        self._jobs.update_job(index_job_id, **kwargs)

    def get_job(self, index_job_id: str) -> RagJobRow | None:
        """Fetch an indexing job, or None when the id is unknown."""
        return self._jobs.get_job(index_job_id)

    def replace_documents(self, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Atomically replace all of a session's documents in one transaction."""
        with get_engine().connect() as conn:
            self._documents.replace_documents(conn, rag_session_id, docs)
            conn.commit()

    def apply_document_changes(self, rag_session_id: str, delete_paths: list[str], docs: list[RagDocument]) -> None:
        """Atomically apply an incremental delete/upsert change set."""
        with get_engine().connect() as conn:
            self._documents.apply_document_changes(conn, rag_session_id, delete_paths, docs)
            conn.commit()

    def get_cached_documents(self, repo_id: str, blob_sha: str) -> list[RagDocument]:
        """Return cached documents for a repo blob (empty list on cache miss)."""
        return self._cache.get_cached_documents(repo_id, blob_sha)

    def cache_documents(self, repo_id: str, path: str, blob_sha: str, docs: list[RagDocument]) -> None:
        """Store documents in the blob cache keyed by (repo, path, blob SHA)."""
        self._cache.cache_documents(repo_id, path, blob_sha, docs)

    def record_repo_cache(self, **kwargs) -> None:
        """Record repo-level cache bookkeeping; kwargs pass through to the cache repo."""
        self._cache.record_repo_cache(**kwargs)

    def retrieve(
        self,
        rag_session_id: str,
        query_embedding: list[float],
        *,
        query_text: str = "",
        limit: int = 5,
        layers: list[str] | None = None,
        path_prefixes: list[str] | None = None,
        prefer_non_tests: bool = False,
    ) -> list[dict]:
        """Run hybrid retrieval for a session.

        Fix: previously this facade silently dropped the ``path_prefixes``
        filter that RagQueryRepository.retrieve supports; it is now exposed as
        a backward-compatible keyword (default None = no path filtering).
        """
        return self._query.retrieve(
            rag_session_id,
            query_embedding,
            query_text=query_text,
            limit=limit,
            layers=layers,
            path_prefixes=path_prefixes,
            prefer_non_tests=prefer_non_tests,
        )

    def fallback_chunks(self, rag_session_id: str, limit: int = 5, layers: list[str] | None = None) -> list[dict]:
        """Return the newest chunks for a session when no ranking signal exists."""
        return self._query.fallback_chunks(rag_session_id, limit=limit, layers=layers)
|
||||||
179
app/modules/rag/persistence/schema_repository.py
Normal file
179
app/modules/rag/persistence/schema_repository.py
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
from app.modules.shared.db import get_engine
|
||||||
|
|
||||||
|
|
||||||
|
class RagSchemaRepository:
    """Idempotent DDL for the RAG storage schema (tables, columns, indexes).

    Uses IF NOT EXISTS / ADD COLUMN IF NOT EXISTS throughout so it can run on
    every startup, doubling as a poor-man's migration layer.
    """

    def ensure_tables(self) -> None:
        """Create the pgvector extension, all base tables, then late-added columns and indexes."""
        engine = get_engine()
        with engine.connect() as conn:
            # Requires the pgvector extension for the "vector" column type.
            conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_sessions (
                        rag_session_id VARCHAR(64) PRIMARY KEY,
                        project_id VARCHAR(512) NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_index_jobs (
                        index_job_id VARCHAR(64) PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        status VARCHAR(16) NOT NULL,
                        indexed_files INTEGER NOT NULL DEFAULT 0,
                        failed_files INTEGER NOT NULL DEFAULT 0,
                        cache_hit_files INTEGER NOT NULL DEFAULT 0,
                        cache_miss_files INTEGER NOT NULL DEFAULT 0,
                        error_code VARCHAR(128) NULL,
                        error_desc TEXT NULL,
                        error_module VARCHAR(64) NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_chunks (
                        id BIGSERIAL PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        path TEXT NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        content TEXT NOT NULL,
                        embedding vector NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_blob_cache (
                        id BIGSERIAL PRIMARY KEY,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        path TEXT NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        CONSTRAINT uq_rag_blob_cache UNIQUE (repo_id, blob_sha, path)
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_chunk_cache (
                        id BIGSERIAL PRIMARY KEY,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        content TEXT NOT NULL,
                        embedding vector NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        CONSTRAINT uq_rag_chunk_cache UNIQUE (repo_id, blob_sha, chunk_index)
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_session_chunk_map (
                        id BIGSERIAL PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        path TEXT NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            self._ensure_columns(conn)
            self._ensure_indexes(conn)
            conn.commit()

    def _ensure_columns(self, conn) -> None:
        """Add columns introduced after the initial schema (ADD COLUMN IF NOT EXISTS)."""
        for statement in (
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_id TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_version TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS owner TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS rag_doc_id VARCHAR(128) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS repo_id VARCHAR(512) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS commit_sha VARCHAR(128) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS title TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS links_json TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS span_start INTEGER NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS span_end INTEGER NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS symbol_id TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS qname TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS kind TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS framework TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS entrypoint_type TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS module_id TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section_path TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_kind TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS section TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS doc_id TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS doc_version TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS owner TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS system_component TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS section TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS path TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS title TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS links_json TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS span_start INTEGER NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS span_end INTEGER NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS commit_sha VARCHAR(128) NULL",
            "ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_hit_files INTEGER NOT NULL DEFAULT 0",
            "ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_miss_files INTEGER NOT NULL DEFAULT 0",
        ):
            conn.execute(text(statement))

    def _ensure_indexes(self, conn) -> None:
        """Create lookup indexes used by the query and cache paths (idempotent)."""
        for statement in (
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks (rag_session_id)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_layer ON rag_chunks (rag_session_id, layer)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_layer_path ON rag_chunks (rag_session_id, layer, path)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_qname ON rag_chunks (qname)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_symbol_id ON rag_chunks (symbol_id)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_module_id ON rag_chunks (module_id)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_doc_kind ON rag_chunks (doc_kind)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_entrypoint ON rag_chunks (entrypoint_type, framework)",
            "CREATE INDEX IF NOT EXISTS idx_rag_blob_cache_repo_blob ON rag_blob_cache (repo_id, blob_sha)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunk_cache_repo_blob ON rag_chunk_cache (repo_id, blob_sha, chunk_index)",
            "CREATE INDEX IF NOT EXISTS idx_rag_session_chunk_map_session ON rag_session_chunk_map (rag_session_id, created_at DESC)",
        ):
            conn.execute(text(statement))
|
||||||
37
app/modules/rag/persistence/session_repository.py
Normal file
37
app/modules/rag/persistence/session_repository.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
from app.modules.shared.db import get_engine
|
||||||
|
|
||||||
|
|
||||||
|
class RagSessionRepository:
    """CRUD for rag_sessions rows; each method opens and commits its own connection."""

    def upsert_session(self, rag_session_id: str, project_id: str) -> None:
        """Insert the session, or re-point an existing session at *project_id*."""
        with get_engine().connect() as conn:
            conn.execute(
                text(
                    """
                    INSERT INTO rag_sessions (rag_session_id, project_id)
                    VALUES (:sid, :pid)
                    ON CONFLICT (rag_session_id) DO UPDATE SET project_id = EXCLUDED.project_id
                    """
                ),
                {"sid": rag_session_id, "pid": project_id},
            )
            conn.commit()

    def session_exists(self, rag_session_id: str) -> bool:
        """Return True when a session with this id exists."""
        with get_engine().connect() as conn:
            row = conn.execute(
                text("SELECT 1 FROM rag_sessions WHERE rag_session_id = :sid"),
                {"sid": rag_session_id},
            ).fetchone()
            return bool(row)

    def get_session(self, rag_session_id: str) -> dict | None:
        """Return the session as a dict (id + project_id), or None when absent."""
        with get_engine().connect() as conn:
            row = conn.execute(
                text("SELECT rag_session_id, project_id FROM rag_sessions WHERE rag_session_id = :sid"),
                {"sid": rag_session_id},
            ).mappings().fetchone()
            return dict(row) if row else None
|
||||||
Binary file not shown.
Binary file not shown.
43
app/modules/rag/retrieval/query_router.py
Normal file
43
app/modules/rag/retrieval/query_router.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import RagLayer, RetrievalMode
|
||||||
|
|
||||||
|
|
||||||
|
class RagQueryRouter:
    """Routes a free-text query to code-oriented or docs-oriented retrieval layers.

    A query containing any code-intent phrase (EN/RU) is routed to CODE;
    everything else defaults to DOCS.
    """

    # Substrings (lowercase) whose presence marks a code-oriented query.
    _CODE_HINTS = (
        "как работает код",
        "explain code",
        "explain the code",
        "по коду",
        "из кода",
        "построй документацию по коду",
        "документацию по коду",
        "where is implemented",
        "где реализовано",
        "endpoint",
        "handler",
        "symbol",
        "function",
        "class",
        "method",
    )

    # Layer priority lists for each retrieval mode.
    _DOCS_LAYERS = [
        RagLayer.DOCS_MODULE_CATALOG,
        RagLayer.DOCS_FACT_INDEX,
        RagLayer.DOCS_SECTION_INDEX,
        RagLayer.DOCS_POLICY_INDEX,
    ]
    _CODE_LAYERS = [
        RagLayer.CODE_ENTRYPOINTS,
        RagLayer.CODE_SYMBOL_CATALOG,
        RagLayer.CODE_DEPENDENCY_GRAPH,
        RagLayer.CODE_SOURCE_CHUNKS,
    ]

    def resolve_mode(self, query: str) -> str:
        """Return RetrievalMode.CODE when *query* matches a code hint, else DOCS."""
        normalized = query.lower()
        for hint in self._CODE_HINTS:
            if hint in normalized:
                return RetrievalMode.CODE
        return RetrievalMode.DOCS

    def layers_for_mode(self, mode: str) -> list[str]:
        """Return a fresh copy of the layer list appropriate for *mode*."""
        if mode == RetrievalMode.CODE:
            return list(self._CODE_LAYERS)
        return list(self._DOCS_LAYERS)
|
||||||
45
app/modules/rag/retrieval/query_terms.py
Normal file
45
app/modules/rag/retrieval/query_terms.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def extract_query_terms(query_text: str) -> list[str]:
    """Extract up to six normalized identifier-like search terms from *query_text*.

    Identifier tokens (ASCII, length >= 3) are expanded into their spelling
    variants, intent-derived keywords are appended, and the combined list is
    deduplicated preserving first-seen order.
    """
    identifiers = re.findall(r"[A-Za-z_][A-Za-z0-9_]{2,}", query_text or "")
    candidates: list[str] = []
    for identifier in identifiers:
        candidates.extend(_identifier_variants(identifier))
    candidates.extend(_intent_variants(query_text))
    # dict.fromkeys keeps insertion order while dropping duplicates.
    return list(dict.fromkeys(candidates))[:6]
|
||||||
|
|
||||||
|
|
||||||
|
def _identifier_variants(term: str) -> list[str]:
    """Return spelling variants of an identifier, deduplicated in order.

    Produces the lowercase form, its snake_case form, a ``*_manager`` split
    for names ending in "manager", and the underscore-free compact form.
    """
    lowered = term.lower()
    variants = [lowered]

    def add(candidate: str) -> None:
        # Skip empties and anything already collected (order-preserving dedupe).
        if candidate and candidate not in variants:
            variants.append(candidate)

    snake = _camel_to_snake(term)
    add(snake)
    suffix = "manager"
    if lowered.endswith(suffix) and len(lowered) > len(suffix):
        add(lowered[: -len(suffix)] + "_manager")
    add(snake.replace("_", "") if snake else "")
    return variants
|
||||||
|
|
||||||
|
|
||||||
|
def _camel_to_snake(term: str) -> str:
|
||||||
|
first = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", term)
|
||||||
|
return re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", first).lower()
|
||||||
|
|
||||||
|
|
||||||
|
def _intent_variants(query_text: str) -> list[str]:
|
||||||
|
lowered = (query_text or "").lower()
|
||||||
|
variants: list[str] = []
|
||||||
|
if any(token in lowered for token in ("управ", "control", "manage", "management")):
|
||||||
|
variants.extend(["control", "management", "start", "stop", "status"])
|
||||||
|
return variants
|
||||||
197
app/modules/rag/services/rag_service.py
Normal file
197
app/modules/rag/services/rag_service.py
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import hashlib
|
||||||
|
import os
|
||||||
|
from collections.abc import Awaitable, Callable
|
||||||
|
from inspect import isawaitable
|
||||||
|
|
||||||
|
from app.modules.rag.contracts import RagDocument
|
||||||
|
from app.modules.rag.indexing.code.pipeline import CodeIndexingPipeline
|
||||||
|
from app.modules.rag.indexing.common.report import IndexReport
|
||||||
|
from app.modules.rag.indexing.docs.pipeline import DocsIndexingPipeline
|
||||||
|
from app.modules.rag.persistence.repository import RagRepository
|
||||||
|
from app.modules.rag.retrieval.query_router import RagQueryRouter
|
||||||
|
from app.modules.rag_session.embedding.gigachat_embedder import GigaChatEmbedder
|
||||||
|
|
||||||
|
|
||||||
|
class RagService:
    """High-level RAG facade: indexes file snapshots/deltas and retrieves chunks.

    Indexing is cache-aware (per repo/blob_sha); blocking repository and
    embedding calls are offloaded to threads from the async entry points.
    """

    def __init__(
        self,
        embedder: GigaChatEmbedder,
        repository: RagRepository,
        chunker=None,
    ) -> None:
        # `chunker` is accepted but unused — presumably kept for backward
        # compatibility with older call sites; chunking now happens inside
        # the docs/code pipelines.  TODO(review): confirm and drop if unused.
        self._embedder = embedder
        self._repo = repository
        self._docs = DocsIndexingPipeline()
        self._code = CodeIndexingPipeline()
        self._queries = RagQueryRouter()

    async def index_snapshot(
        self,
        rag_session_id: str,
        files: list[dict],
        progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None,
    ) -> tuple[int, int, int, int]:
        """Index a full snapshot, replacing every stored document for the session.

        Returns ``report.as_tuple()`` — four counters (presumably indexed,
        failed, cache-hit, cache-miss files; confirm against IndexReport).
        """
        report = await self._index_files(rag_session_id, files, progress_cb=progress_cb)
        self._repo.replace_documents(rag_session_id, report.documents_list)
        return report.as_tuple()

    async def index_changes(
        self,
        rag_session_id: str,
        changed_files: list[dict],
        progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None,
    ) -> tuple[int, int, int, int]:
        """Apply an incremental change set: delete removed paths, re-index the rest."""
        delete_paths: list[str] = []
        upserts: list[dict] = []
        for item in changed_files:
            # Entries with op == "delete" are removed; everything else is upserted.
            if str(item.get("op")) == "delete":
                delete_paths.append(str(item.get("path", "")))
            else:
                upserts.append(item)
        report = await self._index_files(rag_session_id, upserts, progress_cb=progress_cb)
        self._repo.apply_document_changes(rag_session_id, delete_paths, report.documents_list)
        return report.as_tuple()

    async def retrieve(self, rag_session_id: str, query: str) -> list[dict]:
        """Return up to 8 chunks relevant to *query* as plain dicts.

        Falls back to recently stored chunks if embedding/vector search fails,
        and retries against the docs layers when the first pass is empty.
        """
        mode = self._queries.resolve_mode(query)
        layers = self._queries.layers_for_mode(mode)
        # For code-mode queries that do not mention tests (EN "test" / RU "тест"),
        # ask the repository to de-prioritise test files.
        prefer_non_tests = mode == "code" and "test" not in query.lower() and "тест" not in query.lower()
        try:
            query_embedding = self._embedder.embed([query])[0]
            rows = self._repo.retrieve(
                rag_session_id,
                query_embedding,
                query_text=query,
                limit=8,
                layers=layers,
                prefer_non_tests=prefer_non_tests,
            )
        except Exception:
            # Deliberate best-effort: embedding or vector search failure must
            # not break retrieval — serve the most recent chunks instead.
            rows = self._repo.fallback_chunks(rag_session_id, limit=8, layers=layers)
        if not rows and mode != "docs":
            # Last resort: widen the search to the documentation layers.
            rows = self._repo.fallback_chunks(rag_session_id, limit=8, layers=self._queries.layers_for_mode("docs"))
        return [
            {
                "source": row["path"],
                "content": row["content"],
                "layer": row.get("layer"),
                "title": row.get("title"),
                "metadata": row.get("metadata", {}),
                "score": row.get("distance"),
            }
            for row in rows
        ]

    async def _index_files(
        self,
        rag_session_id: str,
        files: list[dict],
        progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None,
    ) -> "_PipelineReport":
        """Index *files* one by one, reusing per-blob cached documents when present.

        Per-file failures are recorded in ``report.warnings`` and do not abort
        the batch; the progress callback is notified after every file.
        """
        total_files = len(files)
        report = _PipelineReport()
        repo_id = self._resolve_repo_id(rag_session_id)
        for index, file in enumerate(files, start=1):
            path = str(file.get("path", ""))
            try:
                blob_sha = self._blob_sha(file)
                # Repository and embedding calls are blocking — run in threads.
                cached = await asyncio.to_thread(self._repo.get_cached_documents, repo_id, blob_sha)
                if cached:
                    report.documents_list.extend(self._with_file_metadata(cached, file, repo_id, blob_sha))
                    report.cache_hit_files += 1
                else:
                    built = self._build_documents(repo_id, path, file)
                    embedded = await asyncio.to_thread(self._embed_documents, built, file, repo_id, blob_sha)
                    report.documents_list.extend(embedded)
                    await asyncio.to_thread(self._repo.cache_documents, repo_id, path, blob_sha, embedded)
                    report.cache_miss_files += 1
                report.indexed_files += 1
            except Exception as exc:
                # Keep going: a single bad file must not fail the whole batch.
                report.failed_files += 1
                report.warnings.append(f"{path}: {exc}")
            await self._notify_progress(progress_cb, index, total_files, path)
        report.documents = len(report.documents_list)
        return report

    def _build_documents(self, repo_id: str, path: str, file: dict) -> list[RagDocument]:
        """Run the docs and/or code pipeline for *path*; unknown types default to docs."""
        content = str(file.get("content") or "")
        commit_sha = file.get("commit_sha")
        docs: list[RagDocument] = []
        if self._docs.supports(path):
            docs.extend(self._docs.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content))
        if self._code.supports(path):
            docs.extend(self._code.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content))
        if not docs:
            # Neither pipeline claimed the file — index it as documentation anyway.
            docs.extend(self._docs.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content))
        return docs

    def _embed_documents(self, docs: list[RagDocument], file: dict, repo_id: str, blob_sha: str) -> list[RagDocument]:
        """Attach file metadata and batch-embed *docs* in place; returns the same list.

        Batch size comes from RAG_EMBED_BATCH_SIZE (default 16, clamped to >= 1).
        """
        if not docs:
            return []
        batch_size = max(1, int(os.getenv("RAG_EMBED_BATCH_SIZE", "16")))
        metadata = self._document_metadata(file, repo_id, blob_sha)
        for doc in docs:
            doc.metadata.update(metadata)
        for start in range(0, len(docs), batch_size):
            batch = docs[start : start + batch_size]
            vectors = self._embedder.embed([doc.text for doc in batch])
            for doc, vector in zip(batch, vectors):
                doc.embedding = vector
        return docs

    def _with_file_metadata(self, docs: list[RagDocument], file: dict, repo_id: str, blob_sha: str) -> list[RagDocument]:
        """Refresh metadata and source fields on cached documents for the current file."""
        metadata = self._document_metadata(file, repo_id, blob_sha)
        for doc in docs:
            doc.metadata.update(metadata)
            # Cached docs may have been produced under another session/path.
            doc.source.repo_id = repo_id
            doc.source.path = str(file.get("path", doc.source.path))
        return docs

    def _document_metadata(self, file: dict, repo_id: str, blob_sha: str) -> dict:
        """Build the metadata dict shared by every document produced from one file."""
        return {
            "blob_sha": blob_sha,
            "repo_id": repo_id,
            "artifact_type": file.get("artifact_type"),
            "section": file.get("section"),
            "doc_id": file.get("doc_id"),
            "doc_version": file.get("doc_version"),
            "owner": file.get("owner"),
            "system_component": file.get("system_component"),
            "last_modified": file.get("last_modified"),
            "staleness_score": file.get("staleness_score"),
        }

    def _resolve_repo_id(self, rag_session_id: str) -> str:
        """Map a session to its project id; fall back to the session id itself."""
        session = self._repo.get_session(rag_session_id)
        if not session:
            return rag_session_id
        return str(session.get("project_id") or rag_session_id)

    def _blob_sha(self, file: dict) -> str:
        """Prefer the caller-supplied content_hash; otherwise SHA-256 the content."""
        raw = str(file.get("content_hash") or "").strip()
        if raw:
            return raw
        content = str(file.get("content") or "")
        return hashlib.sha256(content.encode("utf-8")).hexdigest()

    async def _notify_progress(
        self,
        progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None,
        current_file_index: int,
        total_files: int,
        current_file_name: str,
    ) -> None:
        """Invoke the optional progress callback, awaiting it when it returns an awaitable."""
        if not progress_cb:
            return
        result = progress_cb(current_file_index, total_files, current_file_name)
        if isawaitable(result):
            await result
|
class _PipelineReport(IndexReport):
    """IndexReport extended with the concrete RagDocument objects produced during indexing."""

    def __init__(self) -> None:
        super().__init__()
        # Documents accumulated across all files; the inherited `documents`
        # counter is set from len(documents_list) at the end of _index_files.
        self.documents_list: list[RagDocument] = []
||||||
@@ -3,8 +3,8 @@ from __future__ import annotations
|
|||||||
from fastapi import APIRouter, Request
|
from fastapi import APIRouter, Request
|
||||||
|
|
||||||
from app.modules.agent.story_context_repository import StoryContextRepository
|
from app.modules.agent.story_context_repository import StoryContextRepository
|
||||||
|
from app.modules.rag.persistence.repository import RagRepository
|
||||||
from app.modules.rag_repo.webhook_service import RepoWebhookService
|
from app.modules.rag_repo.webhook_service import RepoWebhookService
|
||||||
from app.modules.rag_session.repository import RagRepository
|
|
||||||
|
|
||||||
|
|
||||||
class RagRepoModule:
|
class RagRepoModule:
|
||||||
|
|||||||
@@ -1,660 +1,4 @@
|
|||||||
from __future__ import annotations
|
from app.modules.rag.persistence.job_repository import RagJobRow
|
||||||
|
from app.modules.rag.persistence.repository import RagRepository
|
||||||
|
|
||||||
from dataclasses import dataclass
|
__all__ = ["RagJobRow", "RagRepository"]
|
||||||
import hashlib
|
|
||||||
from sqlalchemy import text
|
|
||||||
|
|
||||||
from app.modules.shared.db import get_engine
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class RagJobRow:
|
|
||||||
index_job_id: str
|
|
||||||
rag_session_id: str
|
|
||||||
status: str
|
|
||||||
indexed_files: int
|
|
||||||
failed_files: int
|
|
||||||
cache_hit_files: int
|
|
||||||
cache_miss_files: int
|
|
||||||
error_code: str | None
|
|
||||||
error_desc: str | None
|
|
||||||
error_module: str | None
|
|
||||||
|
|
||||||
|
|
||||||
class RagRepository:
|
|
||||||
def ensure_tables(self) -> None:
|
|
||||||
engine = get_engine()
|
|
||||||
with engine.connect() as conn:
|
|
||||||
conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
CREATE TABLE IF NOT EXISTS rag_sessions (
|
|
||||||
rag_session_id VARCHAR(64) PRIMARY KEY,
|
|
||||||
project_id VARCHAR(512) NOT NULL,
|
|
||||||
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
)
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
CREATE TABLE IF NOT EXISTS rag_index_jobs (
|
|
||||||
index_job_id VARCHAR(64) PRIMARY KEY,
|
|
||||||
rag_session_id VARCHAR(64) NOT NULL,
|
|
||||||
status VARCHAR(16) NOT NULL,
|
|
||||||
indexed_files INTEGER NOT NULL DEFAULT 0,
|
|
||||||
failed_files INTEGER NOT NULL DEFAULT 0,
|
|
||||||
cache_hit_files INTEGER NOT NULL DEFAULT 0,
|
|
||||||
cache_miss_files INTEGER NOT NULL DEFAULT 0,
|
|
||||||
error_code VARCHAR(128) NULL,
|
|
||||||
error_desc TEXT NULL,
|
|
||||||
error_module VARCHAR(64) NULL,
|
|
||||||
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
)
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
CREATE TABLE IF NOT EXISTS rag_chunks (
|
|
||||||
id BIGSERIAL PRIMARY KEY,
|
|
||||||
rag_session_id VARCHAR(64) NOT NULL,
|
|
||||||
path TEXT NOT NULL,
|
|
||||||
chunk_index INTEGER NOT NULL,
|
|
||||||
content TEXT NOT NULL,
|
|
||||||
embedding vector NULL,
|
|
||||||
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
)
|
|
||||||
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL"))
|
|
||||||
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section TEXT NULL"))
|
|
||||||
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_id TEXT NULL"))
|
|
||||||
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_version TEXT NULL"))
|
|
||||||
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS owner TEXT NULL"))
|
|
||||||
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL"))
|
|
||||||
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL"))
|
|
||||||
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL"))
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
ALTER TABLE rag_chunks
|
|
||||||
ADD COLUMN IF NOT EXISTS created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
)
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
ALTER TABLE rag_chunks
|
|
||||||
ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
)
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
CREATE TABLE IF NOT EXISTS rag_blob_cache (
|
|
||||||
id BIGSERIAL PRIMARY KEY,
|
|
||||||
repo_id VARCHAR(512) NOT NULL,
|
|
||||||
blob_sha VARCHAR(128) NOT NULL,
|
|
||||||
path TEXT NOT NULL,
|
|
||||||
artifact_type VARCHAR(16) NULL,
|
|
||||||
section TEXT NULL,
|
|
||||||
doc_id TEXT NULL,
|
|
||||||
doc_version TEXT NULL,
|
|
||||||
owner TEXT NULL,
|
|
||||||
system_component TEXT NULL,
|
|
||||||
last_modified TIMESTAMPTZ NULL,
|
|
||||||
staleness_score DOUBLE PRECISION NULL,
|
|
||||||
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
CONSTRAINT uq_rag_blob_cache UNIQUE (repo_id, blob_sha, path)
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
)
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
CREATE TABLE IF NOT EXISTS rag_chunk_cache (
|
|
||||||
id BIGSERIAL PRIMARY KEY,
|
|
||||||
repo_id VARCHAR(512) NOT NULL,
|
|
||||||
blob_sha VARCHAR(128) NOT NULL,
|
|
||||||
chunk_index INTEGER NOT NULL,
|
|
||||||
content TEXT NOT NULL,
|
|
||||||
embedding vector NULL,
|
|
||||||
section TEXT NULL,
|
|
||||||
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
CONSTRAINT uq_rag_chunk_cache UNIQUE (repo_id, blob_sha, chunk_index)
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
)
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
CREATE TABLE IF NOT EXISTS rag_session_chunk_map (
|
|
||||||
id BIGSERIAL PRIMARY KEY,
|
|
||||||
rag_session_id VARCHAR(64) NOT NULL,
|
|
||||||
repo_id VARCHAR(512) NOT NULL,
|
|
||||||
blob_sha VARCHAR(128) NOT NULL,
|
|
||||||
chunk_index INTEGER NOT NULL,
|
|
||||||
path TEXT NOT NULL,
|
|
||||||
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
)
|
|
||||||
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks (rag_session_id)"))
|
|
||||||
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_artifact_type ON rag_chunks (artifact_type)"))
|
|
||||||
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_doc ON rag_chunks (doc_id, doc_version)"))
|
|
||||||
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_component ON rag_chunks (system_component)"))
|
|
||||||
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_path ON rag_chunks (path)"))
|
|
||||||
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_blob_cache_repo_blob ON rag_blob_cache (repo_id, blob_sha)"))
|
|
||||||
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunk_cache_repo_blob ON rag_chunk_cache (repo_id, blob_sha, chunk_index)"))
|
|
||||||
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_session_chunk_map_session ON rag_session_chunk_map (rag_session_id, created_at DESC)"))
|
|
||||||
conn.execute(text("ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_hit_files INTEGER NOT NULL DEFAULT 0"))
|
|
||||||
conn.execute(text("ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_miss_files INTEGER NOT NULL DEFAULT 0"))
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
def upsert_session(self, rag_session_id: str, project_id: str) -> None:
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
INSERT INTO rag_sessions (rag_session_id, project_id)
|
|
||||||
VALUES (:sid, :pid)
|
|
||||||
ON CONFLICT (rag_session_id) DO UPDATE SET project_id = EXCLUDED.project_id
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{"sid": rag_session_id, "pid": project_id},
|
|
||||||
)
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
def session_exists(self, rag_session_id: str) -> bool:
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
row = conn.execute(
|
|
||||||
text("SELECT 1 FROM rag_sessions WHERE rag_session_id = :sid"),
|
|
||||||
{"sid": rag_session_id},
|
|
||||||
).fetchone()
|
|
||||||
return bool(row)
|
|
||||||
|
|
||||||
def get_session(self, rag_session_id: str) -> dict | None:
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
row = conn.execute(
|
|
||||||
text("SELECT rag_session_id, project_id FROM rag_sessions WHERE rag_session_id = :sid"),
|
|
||||||
{"sid": rag_session_id},
|
|
||||||
).mappings().fetchone()
|
|
||||||
return dict(row) if row else None
|
|
||||||
|
|
||||||
def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None:
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
INSERT INTO rag_index_jobs (index_job_id, rag_session_id, status)
|
|
||||||
VALUES (:jid, :sid, :status)
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{"jid": index_job_id, "sid": rag_session_id, "status": status},
|
|
||||||
)
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
def update_job(
|
|
||||||
self,
|
|
||||||
index_job_id: str,
|
|
||||||
*,
|
|
||||||
status: str,
|
|
||||||
indexed_files: int,
|
|
||||||
failed_files: int,
|
|
||||||
cache_hit_files: int = 0,
|
|
||||||
cache_miss_files: int = 0,
|
|
||||||
error_code: str | None = None,
|
|
||||||
error_desc: str | None = None,
|
|
||||||
error_module: str | None = None,
|
|
||||||
) -> None:
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
UPDATE rag_index_jobs
|
|
||||||
SET status = :status,
|
|
||||||
indexed_files = :indexed,
|
|
||||||
failed_files = :failed,
|
|
||||||
cache_hit_files = :cache_hit_files,
|
|
||||||
cache_miss_files = :cache_miss_files,
|
|
||||||
error_code = :ecode,
|
|
||||||
error_desc = :edesc,
|
|
||||||
error_module = :emodule,
|
|
||||||
updated_at = CURRENT_TIMESTAMP
|
|
||||||
WHERE index_job_id = :jid
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{
|
|
||||||
"jid": index_job_id,
|
|
||||||
"status": status,
|
|
||||||
"indexed": indexed_files,
|
|
||||||
"failed": failed_files,
|
|
||||||
"cache_hit_files": cache_hit_files,
|
|
||||||
"cache_miss_files": cache_miss_files,
|
|
||||||
"ecode": error_code,
|
|
||||||
"edesc": error_desc,
|
|
||||||
"emodule": error_module,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
def get_job(self, index_job_id: str) -> RagJobRow | None:
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
row = conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
SELECT index_job_id, rag_session_id, status, indexed_files, failed_files,
|
|
||||||
cache_hit_files, cache_miss_files, error_code, error_desc, error_module
|
|
||||||
FROM rag_index_jobs
|
|
||||||
WHERE index_job_id = :jid
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{"jid": index_job_id},
|
|
||||||
).mappings().fetchone()
|
|
||||||
if not row:
|
|
||||||
return None
|
|
||||||
return RagJobRow(**dict(row))
|
|
||||||
|
|
||||||
def replace_chunks(self, rag_session_id: str, items: list[dict]) -> None:
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
conn.execute(text("DELETE FROM rag_chunks WHERE rag_session_id = :sid"), {"sid": rag_session_id})
|
|
||||||
conn.execute(text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid"), {"sid": rag_session_id})
|
|
||||||
self._insert_chunks(conn, rag_session_id, items)
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
def apply_changes(self, rag_session_id: str, delete_paths: list[str], upserts: list[dict]) -> None:
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
if delete_paths:
|
|
||||||
conn.execute(
|
|
||||||
text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
|
|
||||||
{"sid": rag_session_id, "paths": delete_paths},
|
|
||||||
)
|
|
||||||
conn.execute(
|
|
||||||
text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
|
|
||||||
{"sid": rag_session_id, "paths": delete_paths},
|
|
||||||
)
|
|
||||||
if upserts:
|
|
||||||
paths = sorted({str(x["path"]) for x in upserts})
|
|
||||||
conn.execute(
|
|
||||||
text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
|
|
||||||
{"sid": rag_session_id, "paths": paths},
|
|
||||||
)
|
|
||||||
conn.execute(
|
|
||||||
text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
|
|
||||||
{"sid": rag_session_id, "paths": paths},
|
|
||||||
)
|
|
||||||
self._insert_chunks(conn, rag_session_id, upserts)
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
def get_cached_chunks(self, repo_id: str, blob_sha: str) -> list[dict]:
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
rows = conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
SELECT chunk_index, content, embedding::text AS embedding_txt, section
|
|
||||||
FROM rag_chunk_cache
|
|
||||||
WHERE repo_id = :repo_id AND blob_sha = :blob_sha
|
|
||||||
ORDER BY chunk_index ASC
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{"repo_id": repo_id, "blob_sha": blob_sha},
|
|
||||||
).mappings().fetchall()
|
|
||||||
output: list[dict] = []
|
|
||||||
for row in rows:
|
|
||||||
output.append(
|
|
||||||
{
|
|
||||||
"chunk_index": int(row["chunk_index"]),
|
|
||||||
"content": str(row["content"] or ""),
|
|
||||||
"embedding": self._parse_vector(str(row["embedding_txt"] or "")),
|
|
||||||
"section": row.get("section"),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
return output
|
|
||||||
|
|
||||||
def record_repo_cache(
|
|
||||||
self,
|
|
||||||
*,
|
|
||||||
project_id: str,
|
|
||||||
commit_sha: str | None,
|
|
||||||
changed_files: list[str],
|
|
||||||
summary: str,
|
|
||||||
) -> None:
|
|
||||||
repo_session_id = f"repo:{project_id}"
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
for path in changed_files:
|
|
||||||
key = f"{commit_sha or 'no-commit'}:{path}"
|
|
||||||
blob_sha = hashlib.sha256(key.encode("utf-8")).hexdigest()
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
INSERT INTO rag_blob_cache (
|
|
||||||
repo_id,
|
|
||||||
blob_sha,
|
|
||||||
path,
|
|
||||||
artifact_type,
|
|
||||||
section
|
|
||||||
)
|
|
||||||
VALUES (
|
|
||||||
:repo_id,
|
|
||||||
:blob_sha,
|
|
||||||
:path,
|
|
||||||
:artifact_type,
|
|
||||||
:section
|
|
||||||
)
|
|
||||||
ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
|
|
||||||
updated_at = CURRENT_TIMESTAMP
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{
|
|
||||||
"repo_id": project_id,
|
|
||||||
"blob_sha": blob_sha,
|
|
||||||
"path": path,
|
|
||||||
"artifact_type": "CODE",
|
|
||||||
"section": "repo_webhook",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
INSERT INTO rag_chunk_cache (
|
|
||||||
repo_id,
|
|
||||||
blob_sha,
|
|
||||||
chunk_index,
|
|
||||||
content,
|
|
||||||
embedding,
|
|
||||||
section
|
|
||||||
)
|
|
||||||
VALUES (
|
|
||||||
:repo_id,
|
|
||||||
:blob_sha,
|
|
||||||
0,
|
|
||||||
:content,
|
|
||||||
NULL,
|
|
||||||
:section
|
|
||||||
)
|
|
||||||
ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET
|
|
||||||
content = EXCLUDED.content,
|
|
||||||
section = EXCLUDED.section,
|
|
||||||
updated_at = CURRENT_TIMESTAMP
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{
|
|
||||||
"repo_id": project_id,
|
|
||||||
"blob_sha": blob_sha,
|
|
||||||
"content": f"repo_webhook:{path}:{summary[:300]}",
|
|
||||||
"section": "repo_webhook",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
INSERT INTO rag_session_chunk_map (
|
|
||||||
rag_session_id,
|
|
||||||
repo_id,
|
|
||||||
blob_sha,
|
|
||||||
chunk_index,
|
|
||||||
path
|
|
||||||
)
|
|
||||||
VALUES (
|
|
||||||
:rag_session_id,
|
|
||||||
:repo_id,
|
|
||||||
:blob_sha,
|
|
||||||
0,
|
|
||||||
:path
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{
|
|
||||||
"rag_session_id": repo_session_id,
|
|
||||||
"repo_id": project_id,
|
|
||||||
"blob_sha": blob_sha,
|
|
||||||
"path": path,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
def cache_file_chunks(self, repo_id: str, path: str, blob_sha: str, items: list[dict]) -> None:
|
|
||||||
if not items:
|
|
||||||
return
|
|
||||||
meta = items[0]
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
INSERT INTO rag_blob_cache (
|
|
||||||
repo_id,
|
|
||||||
blob_sha,
|
|
||||||
path,
|
|
||||||
artifact_type,
|
|
||||||
section,
|
|
||||||
doc_id,
|
|
||||||
doc_version,
|
|
||||||
owner,
|
|
||||||
system_component,
|
|
||||||
last_modified,
|
|
||||||
staleness_score
|
|
||||||
)
|
|
||||||
VALUES (
|
|
||||||
:repo_id,
|
|
||||||
:blob_sha,
|
|
||||||
:path,
|
|
||||||
:artifact_type,
|
|
||||||
:section,
|
|
||||||
:doc_id,
|
|
||||||
:doc_version,
|
|
||||||
:owner,
|
|
||||||
:system_component,
|
|
||||||
:last_modified,
|
|
||||||
:staleness_score
|
|
||||||
)
|
|
||||||
ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
|
|
||||||
artifact_type = EXCLUDED.artifact_type,
|
|
||||||
section = EXCLUDED.section,
|
|
||||||
doc_id = EXCLUDED.doc_id,
|
|
||||||
doc_version = EXCLUDED.doc_version,
|
|
||||||
owner = EXCLUDED.owner,
|
|
||||||
system_component = EXCLUDED.system_component,
|
|
||||||
last_modified = EXCLUDED.last_modified,
|
|
||||||
staleness_score = EXCLUDED.staleness_score,
|
|
||||||
updated_at = CURRENT_TIMESTAMP
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{
|
|
||||||
"repo_id": repo_id,
|
|
||||||
"blob_sha": blob_sha,
|
|
||||||
"path": path,
|
|
||||||
"artifact_type": meta.get("artifact_type"),
|
|
||||||
"section": meta.get("section"),
|
|
||||||
"doc_id": meta.get("doc_id"),
|
|
||||||
"doc_version": meta.get("doc_version"),
|
|
||||||
"owner": meta.get("owner"),
|
|
||||||
"system_component": meta.get("system_component"),
|
|
||||||
"last_modified": meta.get("last_modified"),
|
|
||||||
"staleness_score": meta.get("staleness_score"),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
for item in items:
|
|
||||||
emb = item.get("embedding") or []
|
|
||||||
emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
INSERT INTO rag_chunk_cache (
|
|
||||||
repo_id,
|
|
||||||
blob_sha,
|
|
||||||
chunk_index,
|
|
||||||
content,
|
|
||||||
embedding,
|
|
||||||
section
|
|
||||||
)
|
|
||||||
VALUES (
|
|
||||||
:repo_id,
|
|
||||||
:blob_sha,
|
|
||||||
:chunk_index,
|
|
||||||
:content,
|
|
||||||
CAST(:embedding AS vector),
|
|
||||||
:section
|
|
||||||
)
|
|
||||||
ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET
|
|
||||||
content = EXCLUDED.content,
|
|
||||||
embedding = EXCLUDED.embedding,
|
|
||||||
section = EXCLUDED.section,
|
|
||||||
updated_at = CURRENT_TIMESTAMP
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{
|
|
||||||
"repo_id": repo_id,
|
|
||||||
"blob_sha": blob_sha,
|
|
||||||
"chunk_index": int(item["chunk_index"]),
|
|
||||||
"content": item["content"],
|
|
||||||
"embedding": emb_str,
|
|
||||||
"section": item.get("section"),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
def retrieve(self, rag_session_id: str, query_embedding: list[float], limit: int = 5) -> list[dict]:
|
|
||||||
emb = "[" + ",".join(str(x) for x in query_embedding) + "]"
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
rows = conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
SELECT path, content
|
|
||||||
FROM rag_chunks
|
|
||||||
WHERE rag_session_id = :sid
|
|
||||||
ORDER BY embedding <=> CAST(:emb AS vector)
|
|
||||||
LIMIT :lim
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{"sid": rag_session_id, "emb": emb, "lim": limit},
|
|
||||||
).mappings().fetchall()
|
|
||||||
return [dict(x) for x in rows]
|
|
||||||
|
|
||||||
def fallback_chunks(self, rag_session_id: str, limit: int = 5) -> list[dict]:
|
|
||||||
with get_engine().connect() as conn:
|
|
||||||
rows = conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
SELECT path, content
|
|
||||||
FROM rag_chunks
|
|
||||||
WHERE rag_session_id = :sid
|
|
||||||
ORDER BY id DESC
|
|
||||||
LIMIT :lim
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{"sid": rag_session_id, "lim": limit},
|
|
||||||
).mappings().fetchall()
|
|
||||||
return [dict(x) for x in rows]
|
|
||||||
|
|
||||||
def _insert_chunks(self, conn, rag_session_id: str, items: list[dict]) -> None:
|
|
||||||
for item in items:
|
|
||||||
emb = item.get("embedding") or []
|
|
||||||
emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
INSERT INTO rag_chunks (
|
|
||||||
rag_session_id,
|
|
||||||
path,
|
|
||||||
chunk_index,
|
|
||||||
content,
|
|
||||||
embedding,
|
|
||||||
artifact_type,
|
|
||||||
section,
|
|
||||||
doc_id,
|
|
||||||
doc_version,
|
|
||||||
owner,
|
|
||||||
system_component,
|
|
||||||
last_modified,
|
|
||||||
staleness_score,
|
|
||||||
created_at,
|
|
||||||
updated_at
|
|
||||||
)
|
|
||||||
VALUES (
|
|
||||||
:sid,
|
|
||||||
:path,
|
|
||||||
:idx,
|
|
||||||
:content,
|
|
||||||
CAST(:emb AS vector),
|
|
||||||
:artifact_type,
|
|
||||||
:section,
|
|
||||||
:doc_id,
|
|
||||||
:doc_version,
|
|
||||||
:owner,
|
|
||||||
:system_component,
|
|
||||||
:last_modified,
|
|
||||||
:staleness_score,
|
|
||||||
CURRENT_TIMESTAMP,
|
|
||||||
CURRENT_TIMESTAMP
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{
|
|
||||||
"sid": rag_session_id,
|
|
||||||
"path": item["path"],
|
|
||||||
"idx": int(item["chunk_index"]),
|
|
||||||
"content": item["content"],
|
|
||||||
"emb": emb_str,
|
|
||||||
"artifact_type": item.get("artifact_type"),
|
|
||||||
"section": item.get("section"),
|
|
||||||
"doc_id": item.get("doc_id"),
|
|
||||||
"doc_version": item.get("doc_version"),
|
|
||||||
"owner": item.get("owner"),
|
|
||||||
"system_component": item.get("system_component"),
|
|
||||||
"last_modified": item.get("last_modified"),
|
|
||||||
"staleness_score": item.get("staleness_score"),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
repo_id = str(item.get("repo_id") or "").strip()
|
|
||||||
blob_sha = str(item.get("blob_sha") or "").strip()
|
|
||||||
if repo_id and blob_sha:
|
|
||||||
conn.execute(
|
|
||||||
text(
|
|
||||||
"""
|
|
||||||
INSERT INTO rag_session_chunk_map (
|
|
||||||
rag_session_id,
|
|
||||||
repo_id,
|
|
||||||
blob_sha,
|
|
||||||
chunk_index,
|
|
||||||
path
|
|
||||||
) VALUES (
|
|
||||||
:sid,
|
|
||||||
:repo_id,
|
|
||||||
:blob_sha,
|
|
||||||
:chunk_index,
|
|
||||||
:path
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
),
|
|
||||||
{
|
|
||||||
"sid": rag_session_id,
|
|
||||||
"repo_id": repo_id,
|
|
||||||
"blob_sha": blob_sha,
|
|
||||||
"chunk_index": int(item["chunk_index"]),
|
|
||||||
"path": item["path"],
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
def _parse_vector(self, value: str) -> list[float]:
|
|
||||||
text_value = value.strip()
|
|
||||||
if not text_value:
|
|
||||||
return []
|
|
||||||
if text_value.startswith("[") and text_value.endswith("]"):
|
|
||||||
text_value = text_value[1:-1]
|
|
||||||
if not text_value:
|
|
||||||
return []
|
|
||||||
return [float(part.strip()) for part in text_value.split(",") if part.strip()]
|
|
||||||
|
|||||||
@@ -1,211 +1,3 @@
|
|||||||
import asyncio
|
from app.modules.rag.services.rag_service import RagService
|
||||||
import hashlib
|
|
||||||
import os
|
|
||||||
from collections.abc import Awaitable, Callable
|
|
||||||
from inspect import isawaitable
|
|
||||||
|
|
||||||
from app.modules.rag_session.embedding.gigachat_embedder import GigaChatEmbedder
|
__all__ = ["RagService"]
|
||||||
from app.modules.rag_session.repository import RagRepository
|
|
||||||
from app.modules.rag_session.retrieval.chunker import TextChunker
|
|
||||||
|
|
||||||
|
|
||||||
class RagService:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
embedder: GigaChatEmbedder,
|
|
||||||
repository: RagRepository,
|
|
||||||
chunker: TextChunker | None = None,
|
|
||||||
) -> None:
|
|
||||||
self._embedder = embedder
|
|
||||||
self._repo = repository
|
|
||||||
self._chunker = chunker or TextChunker()
|
|
||||||
|
|
||||||
async def index_snapshot(
|
|
||||||
self,
|
|
||||||
rag_session_id: str,
|
|
||||||
files: list[dict],
|
|
||||||
progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None,
|
|
||||||
) -> tuple[int, int, int, int]:
|
|
||||||
total_files = len(files)
|
|
||||||
indexed_files = 0
|
|
||||||
failed_files = 0
|
|
||||||
cache_hit_files = 0
|
|
||||||
cache_miss_files = 0
|
|
||||||
all_chunks: list[dict] = []
|
|
||||||
repo_id = self._resolve_repo_id(rag_session_id)
|
|
||||||
for index, file in enumerate(files, start=1):
|
|
||||||
path = str(file.get("path", ""))
|
|
||||||
try:
|
|
||||||
blob_sha = self._blob_sha(file)
|
|
||||||
cached = await asyncio.to_thread(self._repo.get_cached_chunks, repo_id, blob_sha)
|
|
||||||
if cached:
|
|
||||||
all_chunks.extend(self._build_cached_items(path, file, repo_id, blob_sha, cached))
|
|
||||||
cache_hit_files += 1
|
|
||||||
else:
|
|
||||||
chunks = self._build_chunks_for_file(file)
|
|
||||||
embedded_chunks = await asyncio.to_thread(self._embed_chunks, chunks, file, repo_id, blob_sha)
|
|
||||||
all_chunks.extend(embedded_chunks)
|
|
||||||
await asyncio.to_thread(self._repo.cache_file_chunks, repo_id, path, blob_sha, embedded_chunks)
|
|
||||||
cache_miss_files += 1
|
|
||||||
indexed_files += 1
|
|
||||||
except Exception:
|
|
||||||
failed_files += 1
|
|
||||||
await self._notify_progress(progress_cb, index, total_files, path)
|
|
||||||
await asyncio.to_thread(self._repo.replace_chunks, rag_session_id, all_chunks)
|
|
||||||
return indexed_files, failed_files, cache_hit_files, cache_miss_files
|
|
||||||
|
|
||||||
async def index_changes(
|
|
||||||
self,
|
|
||||||
rag_session_id: str,
|
|
||||||
changed_files: list[dict],
|
|
||||||
progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None,
|
|
||||||
) -> tuple[int, int, int, int]:
|
|
||||||
total_files = len(changed_files)
|
|
||||||
indexed_files = 0
|
|
||||||
failed_files = 0
|
|
||||||
cache_hit_files = 0
|
|
||||||
cache_miss_files = 0
|
|
||||||
delete_paths: list[str] = []
|
|
||||||
upsert_chunks: list[dict] = []
|
|
||||||
repo_id = self._resolve_repo_id(rag_session_id)
|
|
||||||
|
|
||||||
for index, file in enumerate(changed_files, start=1):
|
|
||||||
path = str(file.get("path", ""))
|
|
||||||
op = str(file.get("op", ""))
|
|
||||||
try:
|
|
||||||
if op == "delete":
|
|
||||||
delete_paths.append(path)
|
|
||||||
indexed_files += 1
|
|
||||||
await self._notify_progress(progress_cb, index, total_files, path)
|
|
||||||
continue
|
|
||||||
if op == "upsert" and file.get("content") is not None:
|
|
||||||
blob_sha = self._blob_sha(file)
|
|
||||||
cached = await asyncio.to_thread(self._repo.get_cached_chunks, repo_id, blob_sha)
|
|
||||||
if cached:
|
|
||||||
upsert_chunks.extend(self._build_cached_items(path, file, repo_id, blob_sha, cached))
|
|
||||||
cache_hit_files += 1
|
|
||||||
else:
|
|
||||||
chunks = self._build_chunks_for_file(file)
|
|
||||||
embedded_chunks = await asyncio.to_thread(self._embed_chunks, chunks, file, repo_id, blob_sha)
|
|
||||||
upsert_chunks.extend(embedded_chunks)
|
|
||||||
await asyncio.to_thread(self._repo.cache_file_chunks, repo_id, path, blob_sha, embedded_chunks)
|
|
||||||
cache_miss_files += 1
|
|
||||||
indexed_files += 1
|
|
||||||
await self._notify_progress(progress_cb, index, total_files, path)
|
|
||||||
continue
|
|
||||||
failed_files += 1
|
|
||||||
except Exception:
|
|
||||||
failed_files += 1
|
|
||||||
await self._notify_progress(progress_cb, index, total_files, path)
|
|
||||||
|
|
||||||
await asyncio.to_thread(
|
|
||||||
self._repo.apply_changes,
|
|
||||||
rag_session_id,
|
|
||||||
delete_paths,
|
|
||||||
upsert_chunks,
|
|
||||||
)
|
|
||||||
return indexed_files, failed_files, cache_hit_files, cache_miss_files
|
|
||||||
|
|
||||||
async def retrieve(self, rag_session_id: str, query: str) -> list[dict]:
|
|
||||||
try:
|
|
||||||
query_embedding = self._embedder.embed([query])[0]
|
|
||||||
rows = self._repo.retrieve(rag_session_id, query_embedding, limit=5)
|
|
||||||
except Exception:
|
|
||||||
rows = self._repo.fallback_chunks(rag_session_id, limit=5)
|
|
||||||
return [{"source": row["path"], "content": row["content"]} for row in rows]
|
|
||||||
|
|
||||||
def _build_chunks_for_file(self, file: dict) -> list[tuple[str, int, str]]:
|
|
||||||
path = str(file.get("path", ""))
|
|
||||||
content = str(file.get("content", ""))
|
|
||||||
output: list[tuple[str, int, str]] = []
|
|
||||||
for idx, chunk in enumerate(self._chunker.chunk(content)):
|
|
||||||
output.append((path, idx, chunk))
|
|
||||||
return output
|
|
||||||
|
|
||||||
def _embed_chunks(self, raw_chunks: list[tuple[str, int, str]], file: dict, repo_id: str, blob_sha: str) -> list[dict]:
|
|
||||||
if not raw_chunks:
|
|
||||||
return []
|
|
||||||
batch_size = max(1, int(os.getenv("RAG_EMBED_BATCH_SIZE", "16")))
|
|
||||||
metadata = self._chunk_metadata(file)
|
|
||||||
|
|
||||||
indexed: list[dict] = []
|
|
||||||
for i in range(0, len(raw_chunks), batch_size):
|
|
||||||
batch = raw_chunks[i : i + batch_size]
|
|
||||||
texts = [x[2] for x in batch]
|
|
||||||
vectors = self._embedder.embed(texts)
|
|
||||||
for (path, chunk_index, content), vector in zip(batch, vectors):
|
|
||||||
indexed.append(
|
|
||||||
{
|
|
||||||
"path": path,
|
|
||||||
"chunk_index": chunk_index,
|
|
||||||
"content": content,
|
|
||||||
"embedding": vector,
|
|
||||||
"repo_id": repo_id,
|
|
||||||
"blob_sha": blob_sha,
|
|
||||||
**metadata,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
return indexed
|
|
||||||
|
|
||||||
def _build_cached_items(
|
|
||||||
self,
|
|
||||||
path: str,
|
|
||||||
file: dict,
|
|
||||||
repo_id: str,
|
|
||||||
blob_sha: str,
|
|
||||||
cached: list[dict],
|
|
||||||
) -> list[dict]:
|
|
||||||
metadata = self._chunk_metadata(file)
|
|
||||||
output: list[dict] = []
|
|
||||||
for item in cached:
|
|
||||||
output.append(
|
|
||||||
{
|
|
||||||
"path": path,
|
|
||||||
"chunk_index": int(item["chunk_index"]),
|
|
||||||
"content": str(item["content"]),
|
|
||||||
"embedding": item.get("embedding") or [],
|
|
||||||
"repo_id": repo_id,
|
|
||||||
"blob_sha": blob_sha,
|
|
||||||
**metadata,
|
|
||||||
"section": item.get("section") or metadata.get("section"),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
return output
|
|
||||||
|
|
||||||
def _resolve_repo_id(self, rag_session_id: str) -> str:
|
|
||||||
session = self._repo.get_session(rag_session_id)
|
|
||||||
if not session:
|
|
||||||
return rag_session_id
|
|
||||||
return str(session.get("project_id") or rag_session_id)
|
|
||||||
|
|
||||||
def _blob_sha(self, file: dict) -> str:
|
|
||||||
raw = str(file.get("content_hash") or "").strip()
|
|
||||||
if raw:
|
|
||||||
return raw
|
|
||||||
content = str(file.get("content") or "")
|
|
||||||
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
|
||||||
|
|
||||||
def _chunk_metadata(self, file: dict) -> dict:
|
|
||||||
return {
|
|
||||||
"artifact_type": file.get("artifact_type"),
|
|
||||||
"section": file.get("section"),
|
|
||||||
"doc_id": file.get("doc_id"),
|
|
||||||
"doc_version": file.get("doc_version"),
|
|
||||||
"owner": file.get("owner"),
|
|
||||||
"system_component": file.get("system_component"),
|
|
||||||
"last_modified": file.get("last_modified"),
|
|
||||||
"staleness_score": file.get("staleness_score"),
|
|
||||||
}
|
|
||||||
|
|
||||||
async def _notify_progress(
|
|
||||||
self,
|
|
||||||
progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None,
|
|
||||||
current_file_index: int,
|
|
||||||
total_files: int,
|
|
||||||
current_file_name: str,
|
|
||||||
) -> None:
|
|
||||||
if not progress_cb:
|
|
||||||
return
|
|
||||||
result = progress_cb(current_file_index, total_files, current_file_name)
|
|
||||||
if isawaitable(result):
|
|
||||||
await result
|
|
||||||
|
|||||||
Binary file not shown.
131
tests/agent/orchestrator/test_explain_actions.py
Normal file
131
tests/agent/orchestrator/test_explain_actions.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
from app.modules.agent.engine.orchestrator.actions.explain_actions import ExplainActions
|
||||||
|
from app.modules.agent.engine.orchestrator.execution_context import ExecutionContext
|
||||||
|
from app.modules.agent.engine.orchestrator.models import (
|
||||||
|
ExecutionPlan,
|
||||||
|
OutputContract,
|
||||||
|
RoutingMeta,
|
||||||
|
Scenario,
|
||||||
|
TaskConstraints,
|
||||||
|
TaskSpec,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _ctx(rag_items: list[dict]) -> ExecutionContext:
|
||||||
|
task = TaskSpec(
|
||||||
|
task_id="task-1",
|
||||||
|
dialog_session_id="dialog-1",
|
||||||
|
rag_session_id="rag-1",
|
||||||
|
user_message="Объясни по коду как работает task_processor",
|
||||||
|
scenario=Scenario.EXPLAIN_PART,
|
||||||
|
routing=RoutingMeta(domain_id="project", process_id="qa", confidence=0.9, reason="test"),
|
||||||
|
constraints=TaskConstraints(),
|
||||||
|
output_contract=OutputContract(result_type="answer"),
|
||||||
|
metadata={
|
||||||
|
"rag_items": rag_items,
|
||||||
|
"rag_context": "",
|
||||||
|
"confluence_context": "",
|
||||||
|
"files_map": {},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
plan = ExecutionPlan(
|
||||||
|
plan_id="plan-1",
|
||||||
|
task_id="task-1",
|
||||||
|
scenario=Scenario.EXPLAIN_PART,
|
||||||
|
template_id="tpl",
|
||||||
|
template_version="1",
|
||||||
|
steps=[],
|
||||||
|
)
|
||||||
|
return ExecutionContext(task=task, plan=plan, graph_resolver=lambda *_: None, graph_invoker=lambda *_: {})
|
||||||
|
|
||||||
|
|
||||||
|
def test_explain_actions_switch_to_code_profile_when_code_layers_present() -> None:
|
||||||
|
ctx = _ctx(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"source": "app/task_processor.py",
|
||||||
|
"layer": "C1_SYMBOL_CATALOG",
|
||||||
|
"title": "task_processor.process_task",
|
||||||
|
"content": "function task_processor.process_task(task)",
|
||||||
|
"metadata": {"qname": "task_processor.process_task", "kind": "function"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source": "app/task_processor.py",
|
||||||
|
"layer": "C2_DEPENDENCY_GRAPH",
|
||||||
|
"title": "task_processor.process_task:calls",
|
||||||
|
"content": "task_processor.process_task calls queue.publish",
|
||||||
|
"metadata": {"edge_type": "calls"},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
actions = ExplainActions()
|
||||||
|
|
||||||
|
actions.collect_sources(ctx)
|
||||||
|
actions.extract_logic(ctx)
|
||||||
|
actions.summarize(ctx)
|
||||||
|
|
||||||
|
sources = ctx.artifacts.get_content("sources", {})
|
||||||
|
assert sources["source_profile"] == "code"
|
||||||
|
answer = str(ctx.artifacts.get_content("final_answer", ""))
|
||||||
|
assert "кодовых слоев индекса" not in answer
|
||||||
|
assert "CodeRAG" not in answer
|
||||||
|
assert "app/task_processor.py" in answer
|
||||||
|
assert "requirements/docs context" not in answer
|
||||||
|
|
||||||
|
|
||||||
|
def test_explain_actions_add_code_details_block() -> None:
|
||||||
|
ctx = _ctx(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"source": "src/config_manager/__init__.py",
|
||||||
|
"layer": "C1_SYMBOL_CATALOG",
|
||||||
|
"title": "ConfigManager",
|
||||||
|
"content": "const ConfigManager\nConfigManager = config_manager.v2.ConfigManagerV2",
|
||||||
|
"metadata": {
|
||||||
|
"qname": "ConfigManager",
|
||||||
|
"kind": "const",
|
||||||
|
"lang_payload": {"imported_from": "v2.ConfigManagerV2", "import_alias": True},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source": "src/config_manager/v2/control/base.py",
|
||||||
|
"layer": "C1_SYMBOL_CATALOG",
|
||||||
|
"title": "ControlChannel",
|
||||||
|
"content": "class ControlChannel\nControlChannel(ABC)",
|
||||||
|
"metadata": {"qname": "ControlChannel", "kind": "class"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source": "src/config_manager/v2/core/control_bridge.py",
|
||||||
|
"layer": "C1_SYMBOL_CATALOG",
|
||||||
|
"title": "ControlChannelBridge",
|
||||||
|
"content": "class ControlChannelBridge\nПредоставляет halt и status как обработчики start/stop/status",
|
||||||
|
"metadata": {"qname": "ControlChannelBridge", "kind": "class"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source": "src/config_manager/v2/core/control_bridge.py",
|
||||||
|
"layer": "C2_DEPENDENCY_GRAPH",
|
||||||
|
"title": "ControlChannelBridge.on_start:calls",
|
||||||
|
"content": "ControlChannelBridge.on_start calls self._start_runtime",
|
||||||
|
"metadata": {"src_qname": "ControlChannelBridge.on_start", "dst_ref": "self._start_runtime"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source": "src/config_manager/v2/__init__.py",
|
||||||
|
"layer": "C0_SOURCE_CHUNKS",
|
||||||
|
"title": "src/config_manager/v2/__init__.py:1-6",
|
||||||
|
"content": '"""Контракт: управление через API (config.yaml, секция management)."""',
|
||||||
|
"metadata": {},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
actions = ExplainActions()
|
||||||
|
|
||||||
|
actions.collect_sources(ctx)
|
||||||
|
actions.extract_logic(ctx)
|
||||||
|
actions.summarize(ctx)
|
||||||
|
|
||||||
|
answer = str(ctx.artifacts.get_content("final_answer", ""))
|
||||||
|
assert "### Что видно по коду" in answer
|
||||||
|
assert "ConfigManager` в проекте доступен как alias" in answer
|
||||||
|
assert "ControlChannelBridge.on_start" in answer
|
||||||
|
assert "### Где смотреть в проекте" in answer
|
||||||
|
assert "В индексе нет точного символа" not in answer
|
||||||
|
assert "отдельный интерфейс управления" in answer
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
57
tests/rag/test_code_indexing_pipeline.py
Normal file
57
tests/rag/test_code_indexing_pipeline.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
from app.modules.rag.contracts.enums import RagLayer
|
||||||
|
from app.modules.rag.indexing.code.pipeline import CodeIndexingPipeline
|
||||||
|
|
||||||
|
|
||||||
|
def test_code_pipeline_builds_source_symbols_edges_and_entrypoints() -> None:
|
||||||
|
pipeline = CodeIndexingPipeline()
|
||||||
|
content = """
|
||||||
|
from fastapi import APIRouter
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
class UserService:
|
||||||
|
def get_user(self, user_id):
|
||||||
|
return user_id
|
||||||
|
|
||||||
|
@router.get("/users/{user_id}")
|
||||||
|
async def get_user(user_id: str):
|
||||||
|
service = UserService()
|
||||||
|
return service.get_user(user_id)
|
||||||
|
"""
|
||||||
|
docs = pipeline.index_file(
|
||||||
|
repo_id="acme/proj",
|
||||||
|
commit_sha="abc123",
|
||||||
|
path="app/api/users.py",
|
||||||
|
content=content,
|
||||||
|
)
|
||||||
|
|
||||||
|
layers = {doc.layer for doc in docs}
|
||||||
|
assert RagLayer.CODE_SOURCE_CHUNKS in layers
|
||||||
|
assert RagLayer.CODE_SYMBOL_CATALOG in layers
|
||||||
|
assert RagLayer.CODE_DEPENDENCY_GRAPH in layers
|
||||||
|
assert RagLayer.CODE_ENTRYPOINTS in layers
|
||||||
|
|
||||||
|
symbol_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_SYMBOL_CATALOG and doc.metadata["kind"] == "function")
|
||||||
|
assert "get_user" in symbol_doc.metadata["qname"]
|
||||||
|
|
||||||
|
edge_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_DEPENDENCY_GRAPH)
|
||||||
|
assert edge_doc.metadata["edge_type"] in {"calls", "imports", "inherits"}
|
||||||
|
|
||||||
|
entry_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_ENTRYPOINTS)
|
||||||
|
assert entry_doc.metadata["framework"] == "fastapi"
|
||||||
|
|
||||||
|
|
||||||
|
def test_code_pipeline_indexes_import_alias_as_symbol() -> None:
|
||||||
|
pipeline = CodeIndexingPipeline()
|
||||||
|
content = "from .v2 import ConfigManagerV2 as ConfigManager\n"
|
||||||
|
|
||||||
|
docs = pipeline.index_file(
|
||||||
|
repo_id="acme/proj",
|
||||||
|
commit_sha="abc123",
|
||||||
|
path="src/config_manager/__init__.py",
|
||||||
|
content=content,
|
||||||
|
)
|
||||||
|
|
||||||
|
alias_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_SYMBOL_CATALOG and doc.metadata["qname"] == "ConfigManager")
|
||||||
|
assert alias_doc.metadata["kind"] == "const"
|
||||||
|
assert alias_doc.metadata["lang_payload"]["import_alias"] is True
|
||||||
63
tests/rag/test_docs_indexing_pipeline.py
Normal file
63
tests/rag/test_docs_indexing_pipeline.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
from app.modules.rag.contracts.enums import RagLayer
|
||||||
|
from app.modules.rag.indexing.docs.pipeline import DocsIndexingPipeline
|
||||||
|
|
||||||
|
|
||||||
|
def test_docs_pipeline_builds_catalog_facts_sections_and_policy() -> None:
|
||||||
|
pipeline = DocsIndexingPipeline()
|
||||||
|
content = """---
|
||||||
|
id: api.billing.create_invoice
|
||||||
|
type: policy
|
||||||
|
domain: billing
|
||||||
|
links:
|
||||||
|
calls_api:
|
||||||
|
- api.billing.validate_invoice
|
||||||
|
tags: [billing]
|
||||||
|
status: active
|
||||||
|
---
|
||||||
|
# Create Invoice
|
||||||
|
|
||||||
|
## Spec Summary
|
||||||
|
|
||||||
|
Creates an invoice in billing.
|
||||||
|
|
||||||
|
## Request Contract
|
||||||
|
|
||||||
|
| field | type | required | validation |
|
||||||
|
| --- | --- | --- | --- |
|
||||||
|
| amount | decimal | yes | > 0 |
|
||||||
|
|
||||||
|
## Error Matrix
|
||||||
|
|
||||||
|
| status | error | client action |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| 400 | invalid_amount | fix request |
|
||||||
|
|
||||||
|
## Rules
|
||||||
|
|
||||||
|
- metric: billing.invoice.created
|
||||||
|
- rule: amount must be positive
|
||||||
|
"""
|
||||||
|
docs = pipeline.index_file(
|
||||||
|
repo_id="acme/proj",
|
||||||
|
commit_sha="abc123",
|
||||||
|
path="docs/billing/create_invoice.md",
|
||||||
|
content=content,
|
||||||
|
)
|
||||||
|
|
||||||
|
layers = {doc.layer for doc in docs}
|
||||||
|
assert RagLayer.DOCS_MODULE_CATALOG in layers
|
||||||
|
assert RagLayer.DOCS_FACT_INDEX in layers
|
||||||
|
assert RagLayer.DOCS_SECTION_INDEX in layers
|
||||||
|
assert RagLayer.DOCS_POLICY_INDEX in layers
|
||||||
|
|
||||||
|
module_doc = next(doc for doc in docs if doc.layer == RagLayer.DOCS_MODULE_CATALOG)
|
||||||
|
assert module_doc.metadata["module_id"] == "api.billing.create_invoice"
|
||||||
|
assert module_doc.metadata["type"] == "policy"
|
||||||
|
|
||||||
|
fact_texts = [doc.text for doc in docs if doc.layer == RagLayer.DOCS_FACT_INDEX]
|
||||||
|
assert any("calls_api" in text for text in fact_texts)
|
||||||
|
assert any("has_field" in text for text in fact_texts)
|
||||||
|
assert any("returns_error" in text for text in fact_texts)
|
||||||
|
|
||||||
|
section_doc = next(doc for doc in docs if doc.layer == RagLayer.DOCS_SECTION_INDEX)
|
||||||
|
assert section_doc.metadata["section_path"]
|
||||||
12
tests/rag/test_query_router.py
Normal file
12
tests/rag/test_query_router.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from app.modules.rag.contracts.enums import RetrievalMode
|
||||||
|
from app.modules.rag.retrieval.query_router import RagQueryRouter
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_router_uses_docs_by_default() -> None:
|
||||||
|
router = RagQueryRouter()
|
||||||
|
assert router.resolve_mode("Какие есть требования по биллингу?") == RetrievalMode.DOCS
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_router_switches_to_code_on_explicit_code_requests() -> None:
|
||||||
|
router = RagQueryRouter()
|
||||||
|
assert router.resolve_mode("Объясни как работает код endpoint create invoice") == RetrievalMode.CODE
|
||||||
9
tests/rag/test_query_terms.py
Normal file
9
tests/rag/test_query_terms.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
from app.modules.rag.retrieval.query_terms import extract_query_terms
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_query_terms_from_code_question() -> None:
|
||||||
|
terms = extract_query_terms("Объясни по коду как можно управлять COnfigmanager?")
|
||||||
|
|
||||||
|
assert "configmanager" in terms
|
||||||
|
assert "config_manager" in terms
|
||||||
|
assert "control" in terms
|
||||||
Reference in New Issue
Block a user