Новый раг

This commit is contained in:
2026-03-01 14:21:33 +03:00
parent 2728c07ba9
commit 1ef0b4d68c
95 changed files with 3145 additions and 927 deletions

View File

@@ -1,5 +1,7 @@
from __future__ import annotations from __future__ import annotations
from collections import Counter
from app.modules.agent.engine.orchestrator.actions.common import ActionSupport from app.modules.agent.engine.orchestrator.actions.common import ActionSupport
from app.modules.agent.engine.orchestrator.execution_context import ExecutionContext from app.modules.agent.engine.orchestrator.execution_context import ExecutionContext
from app.modules.agent.engine.orchestrator.models import ArtifactType from app.modules.agent.engine.orchestrator.models import ArtifactType
@@ -7,22 +9,28 @@ from app.modules.agent.engine.orchestrator.models import ArtifactType
class ExplainActions(ActionSupport): class ExplainActions(ActionSupport):
def collect_sources(self, ctx: ExecutionContext) -> list[str]: def collect_sources(self, ctx: ExecutionContext) -> list[str]:
rag_items = list(ctx.task.metadata.get("rag_items", []) or [])
rag_context = str(ctx.task.metadata.get("rag_context", "")) rag_context = str(ctx.task.metadata.get("rag_context", ""))
confluence_context = str(ctx.task.metadata.get("confluence_context", "")) confluence_context = str(ctx.task.metadata.get("confluence_context", ""))
files_map = dict(ctx.task.metadata.get("files_map", {}) or {}) files_map = dict(ctx.task.metadata.get("files_map", {}) or {})
payload = { payload = {
"rag_items": rag_items,
"rag_context": rag_context, "rag_context": rag_context,
"confluence_context": confluence_context, "confluence_context": confluence_context,
"files_count": len(files_map), "files_count": len(files_map),
"source_profile": self._source_profile(rag_items),
} }
evidence_ids: list[str] = [] evidence_ids: list[str] = []
if rag_context.strip(): for item in rag_items[:5]:
snippet = str(item.get("content", "") or "").strip()
if not snippet:
continue
evidence_ids.append( evidence_ids.append(
self.add_evidence( self.add_evidence(
ctx, ctx,
source_type="rag_chunk", source_type="rag_chunk",
source_ref=ctx.task.rag_session_id, source_ref=str(item.get("source", ctx.task.rag_session_id)),
snippet=rag_context, snippet=snippet,
score=0.9, score=0.9,
) )
) )
@@ -38,50 +46,214 @@ class ExplainActions(ActionSupport):
def extract_logic(self, ctx: ExecutionContext) -> list[str]: def extract_logic(self, ctx: ExecutionContext) -> list[str]:
sources = self.get(ctx, "sources", {}) or {} sources = self.get(ctx, "sources", {}) or {}
message = ctx.task.user_message message = ctx.task.user_message
profile = str(sources.get("source_profile", "docs"))
ru = self._is_russian(message)
notes = (
"Используй код как основной источник и ссылайся на конкретные файлы и слои."
if profile == "code" and ru
else "Use code as the primary source and cite concrete files/layers."
if profile == "code"
else "Используй требования и документацию как основной источник."
if ru
else "Use requirements/docs as primary source over code."
)
logic = { logic = {
"request": message, "request": message,
"assumptions": ["requirements-first"], "assumptions": [f"{profile}-first"],
"notes": "Use requirements/docs as primary source over code.", "notes": notes,
"source_summary": sources, "source_summary": sources,
} }
return [self.put(ctx, "logic_model", ArtifactType.STRUCTURED_JSON, logic)] return [self.put(ctx, "logic_model", ArtifactType.STRUCTURED_JSON, logic)]
def build_sequence(self, ctx: ExecutionContext) -> list[str]:
message = ctx.task.user_message
mermaid = "\n".join(
[
"```mermaid",
"sequenceDiagram",
"participant User",
"participant Agent",
"participant Docs",
"User->>Agent: " + message[:80],
"Agent->>Docs: Find relevant requirements",
"Docs-->>Agent: Relevant context",
"Agent-->>User: Structured explanation",
"```",
]
)
return [self.put(ctx, "sequence_diagram", ArtifactType.TEXT, mermaid)]
def build_use_cases(self, ctx: ExecutionContext) -> list[str]:
lines = [
"### Use Cases",
"- Analyze requirement fragments relevant to user question",
"- Reconstruct behavior flow and decision points",
"- Return user-focused explanation with constraints",
]
return [self.put(ctx, "use_cases", ArtifactType.TEXT, "\n".join(lines))]
def summarize(self, ctx: ExecutionContext) -> list[str]: def summarize(self, ctx: ExecutionContext) -> list[str]:
sequence = str(self.get(ctx, "sequence_diagram", "") or "") sources = self.get(ctx, "sources", {}) or {}
use_cases = str(self.get(ctx, "use_cases", "") or "") profile = str(sources.get("source_profile", "docs"))
answer = "\n\n".join( items = list(sources.get("rag_items", []) or [])
[ message = ctx.task.user_message
"## Summary", ru = self._is_russian(message)
"The requested project part is explained from requirements/docs context.", answer = self._code_answer(items, russian=ru) if profile == "code" else self._docs_answer(items, russian=ru)
sequence,
use_cases,
]
)
return [self.put(ctx, "final_answer", ArtifactType.TEXT, answer)] return [self.put(ctx, "final_answer", ArtifactType.TEXT, answer)]
def _source_profile(self, items: list[dict]) -> str:
layers = [str(item.get("layer", "") or "") for item in items]
if any(layer.startswith("C") for layer in layers):
return "code"
return "docs"
def _is_russian(self, text: str) -> bool:
return any("а" <= ch.lower() <= "я" or ch.lower() == "ё" for ch in text)
def _code_answer(self, items: list[dict], *, russian: bool) -> str:
if not items:
return (
"Не удалось найти релевантный кодовый контекст по этому запросу."
if russian
else "No relevant code context was found for this request."
)
details = self._code_details(items, russian=russian)
refs = self._code_references(items, russian=russian)
parts = [
"## Кратко" if russian else "## Summary",
details,
]
if refs:
parts.append(refs)
return "\n\n".join(part for part in parts if part.strip())
def _docs_answer(self, items: list[dict], *, russian: bool) -> str:
return (
"Запрошенная часть проекта объяснена на основе требований и документации."
if russian
else "The requested project part is explained from requirements/docs context."
)
def _code_details(self, items: list[dict], *, russian: bool) -> str:
    """Build the "what the code shows" markdown section from RAG code items.

    Buckets *items* by RAG layer (symbol catalog / dependency edges / raw
    source chunks), then emits bullet lines for: a detected `ConfigManager`
    alias, management-channel hints, at most two symbol summaries, up to
    three dependency fan-outs, and direct config/API-control mentions in
    raw source. Test-path items and control-channel symbols are filtered
    out of the symbol bullets. Returns "" when *items* is empty.
    """
    if not items:
        return ""
    # Bucket items by their RAG layer name.
    symbol_items = [item for item in items if str(item.get("layer", "")) == "C1_SYMBOL_CATALOG"]
    edge_items = [item for item in items if str(item.get("layer", "")) == "C2_DEPENDENCY_GRAPH"]
    source_items = [item for item in items if str(item.get("layer", "")) == "C0_SOURCE_CHUNKS"]
    lines = ["### Что видно по коду" if russian else "### What the code shows"]
    # Special-case: surface a ConfigManager re-export alias first, if present.
    alias = self._find_alias_symbol(symbol_items)
    if alias:
        imported_from = str(alias.get("metadata", {}).get("lang_payload", {}).get("imported_from", "")).strip()
        if russian:
            lines.append(f"- `ConfigManager` в проекте доступен как alias в `{alias.get('source', '')}` и указывает на `{imported_from}`.")
        else:
            lines.append(f"- `ConfigManager` is exposed as an alias in `{alias.get('source', '')}` and points to `{imported_from}`.")
    management_hint = self._management_summary(symbol_items, edge_items, source_items, russian=russian)
    if management_hint:
        lines.extend(management_hint)
    # Emit at most two symbol bullets, scanning the first four candidates
    # and skipping tests and control-channel machinery.
    symbol_lines = 0
    for item in symbol_items[:4]:
        title = str(item.get("title", "") or "")
        source = str(item.get("source", "") or "")
        content = str(item.get("content", "") or "").strip()
        # NOTE(review): the LAST content line is used as the one-line summary —
        # presumably the indexer appends a digest line there; confirm upstream.
        summary = content.splitlines()[-1].strip() if content else ""
        if not title:
            continue
        if self._is_test_path(source):
            continue
        if self._is_control_symbol(title):
            continue
        if russian:
            lines.append(f"- Символ `{title}` из `{source}`: {summary}")
        else:
            lines.append(f"- Symbol `{title}` from `{source}`: {summary}")
        symbol_lines += 1
        if symbol_lines >= 2:
            break
    # Aggregate dependency edges into "src -> [targets]" fan-out bullets,
    # de-duplicating targets while preserving first-seen order.
    edge_map: dict[str, list[str]] = {}
    for item in edge_items:
        meta = item.get("metadata", {}) or {}
        src_qname = str(meta.get("src_qname", "") or "").strip()
        dst_ref = str(meta.get("dst_ref", "") or "").strip()
        if not src_qname or not dst_ref:
            continue
        if self._is_test_path(str(item.get("source", "") or "")):
            continue
        edge_map.setdefault(src_qname, [])
        if dst_ref not in edge_map[src_qname]:
            edge_map[src_qname].append(dst_ref)
    # At most three sources, at most four targets each.
    for src_qname, targets in list(edge_map.items())[:3]:
        joined = ", ".join(targets[:4])
        if russian:
            lines.append(f"- `{src_qname}` вызывает или использует: {joined}.")
        else:
            lines.append(f"- `{src_qname}` calls or uses: {joined}.")
    # Quote raw source chunks that directly mention management/control
    # (first four content lines, capped at 220 characters).
    for item in source_items[:2]:
        source = str(item.get("source", "") or "")
        content = str(item.get("content", "") or "")
        if self._is_test_path(source):
            continue
        if "management" in content.lower() or "control" in content.lower():
            snippet = " ".join(content.splitlines()[:4]).strip()
            if russian:
                lines.append(f"- В `{source}` есть прямое указание на управление через конфиг/API: `{snippet[:220]}`")
            else:
                lines.append(f"- `{source}` directly mentions config/API control: `{snippet[:220]}`")
    return "\n".join(lines)
def _code_references(self, items: list[dict], *, russian: bool) -> str:
paths = [str(item.get("source", "") or "") for item in items if item.get("source") and not self._is_test_path(str(item.get("source", "") or ""))]
if not paths:
return ""
lines = ["### Где смотреть в проекте" if russian else "### Where to look in the project"]
for path, _count in Counter(paths).most_common(3):
lines.append(f"- `{path}`")
return "\n".join(lines)
def _find_alias_symbol(self, items: list[dict]) -> dict | None:
for item in items:
meta = item.get("metadata", {}) or {}
payload = meta.get("lang_payload", {}) or {}
qname = str(meta.get("qname", "") or "")
if qname == "ConfigManager" and payload.get("import_alias"):
return item
return None
def _is_test_path(self, path: str) -> bool:
lowered = path.lower()
return lowered.startswith("tests/") or "/tests/" in lowered or lowered.startswith("test_") or "/test_" in lowered
def _is_control_symbol(self, title: str) -> bool:
lowered = title.lower()
return any(token in lowered for token in ("controlchannel", "controlchannelbridge", "on_start", "on_stop", "on_status"))
def _management_summary(
    self,
    symbol_items: list[dict],
    edge_items: list[dict],
    source_items: list[dict],
    *,
    russian: bool,
) -> list[str]:
    """Collect bullet lines describing the ConfigManager management surface.

    Looks for three signals, each restricted to non-test items: an explicit
    mention of API/config management in raw source text, presence of the
    `ControlChannel` / `ControlChannelBridge` symbols, and dependency edges
    that route bridge methods to the runtime lifecycle helpers. Returns an
    empty list when no signal is found.
    """
    # qname -> symbol item map, excluding anything that lives under tests.
    qnames = {str((item.get("metadata", {}) or {}).get("qname", "") or ""): item for item in symbol_items if not self._is_test_path(str(item.get("source", "") or ""))}
    source_texts = [str(item.get("content", "") or "") for item in source_items if not self._is_test_path(str(item.get("source", "") or ""))]
    result: list[str] = []
    # Signal 1: the source text itself mentions a management section / API control.
    if any("управление через api" in text.lower() or "section management" in text.lower() or "секция management" in text.lower() for text in source_texts):
        result.append(
            "- Для `ConfigManager` в коде предусмотрен отдельный интерфейс управления через API/конфиг: это прямо указано в публичной точке входа модуля."
            if russian
            else "- `ConfigManager` has a dedicated API/config-based management interface; this is stated in the module's public entrypoint."
        )
    # Signal 2: the control-channel contract symbols are present in the catalog.
    has_control_channel = "ControlChannel" in qnames
    has_bridge = "ControlChannelBridge" in qnames
    if has_control_channel:
        result.append(
            "- Базовый контракт управления задает `ControlChannel`: он определяет команды `start` и `stop` для внешнего канала управления."
            if russian
            else "- The base management contract is `ControlChannel`, which defines external `start` and `stop` commands."
        )
    if has_bridge:
        result.append(
            "- `ControlChannelBridge` связывает внешний канал управления с lifecycle-методами менеджера: `on_start`, `on_stop`, `on_status`."
            if russian
            else "- `ControlChannelBridge` maps the external control channel to manager lifecycle methods: `on_start`, `on_stop`, `on_status`."
        )
    # Signal 3: dependency edges showing bridge commands routed to runtime helpers.
    edge_refs = []
    for item in edge_items:
        if self._is_test_path(str(item.get("source", "") or "")):
            continue
        meta = item.get("metadata", {}) or {}
        src = str(meta.get("src_qname", "") or "")
        dst = str(meta.get("dst_ref", "") or "")
        if src.startswith("ControlChannelBridge.") and dst in {"self._start_runtime", "self._stop_runtime", "self._get_status"}:
            edge_refs.append((src, dst))
    if edge_refs:
        # Only the first three routings are shown.
        mappings = ", ".join(f"{src} -> {dst}" for src, dst in edge_refs[:3])
        result.append(
            f"- По связям в коде видно, что команды управления маршрутизируются так: {mappings}."
            if russian
            else f"- The code relationships show the management command routing: {mappings}."
        )
    return result

View File

@@ -24,8 +24,6 @@ class StepRegistry:
"finalize_graph_output": self._finalize_graph_output, "finalize_graph_output": self._finalize_graph_output,
"collect_sources": explain.collect_sources, "collect_sources": explain.collect_sources,
"extract_logic": explain.extract_logic, "extract_logic": explain.extract_logic,
"build_sequence": explain.build_sequence,
"build_use_cases": explain.build_use_cases,
"summarize": explain.summarize, "summarize": explain.summarize,
"fetch_source_doc": review.fetch_source_doc, "fetch_source_doc": review.fetch_source_doc,
"normalize_document": review.normalize_document, "normalize_document": review.normalize_document,

View File

@@ -24,6 +24,7 @@ class TaskSpecBuilder:
route: RoutingMeta, route: RoutingMeta,
attachments: list[dict], attachments: list[dict],
files: list[dict], files: list[dict],
rag_items: list[dict],
rag_context: str, rag_context: str,
confluence_context: str, confluence_context: str,
files_map: dict[str, dict], files_map: dict[str, dict],
@@ -32,6 +33,7 @@ class TaskSpecBuilder:
output_contract = self._output_contract(scenario) output_contract = self._output_contract(scenario)
constraints = self._constraints_for(scenario) constraints = self._constraints_for(scenario)
metadata = { metadata = {
"rag_items": rag_items,
"rag_context": rag_context, "rag_context": rag_context,
"confluence_context": confluence_context, "confluence_context": confluence_context,
"files_map": files_map, "files_map": files_map,
@@ -124,14 +126,7 @@ class TaskSpecBuilder:
def _output_contract(self, scenario: Scenario) -> OutputContract: def _output_contract(self, scenario: Scenario) -> OutputContract:
if scenario == Scenario.EXPLAIN_PART: if scenario == Scenario.EXPLAIN_PART:
return OutputContract( return OutputContract(result_type="answer", sections=[OutputSection(name="summary", format="markdown")])
result_type="answer",
sections=[
OutputSection(name="sequence_diagram", format="mermaid"),
OutputSection(name="use_cases", format="markdown"),
OutputSection(name="summary", format="markdown"),
],
)
if scenario == Scenario.ANALYTICS_REVIEW: if scenario == Scenario.ANALYTICS_REVIEW:
return OutputContract( return OutputContract(
result_type="review_report", result_type="review_report",

View File

@@ -43,9 +43,7 @@ class ScenarioTemplateRegistry:
steps = [ steps = [
self._step("collect_sources", "Collect sources", "collect_sources", outputs=[self._out("sources", ArtifactType.STRUCTURED_JSON)]), self._step("collect_sources", "Collect sources", "collect_sources", outputs=[self._out("sources", ArtifactType.STRUCTURED_JSON)]),
self._step("extract_logic", "Extract logic", "extract_logic", depends_on=["collect_sources"], outputs=[self._out("logic_model", ArtifactType.STRUCTURED_JSON)]), self._step("extract_logic", "Extract logic", "extract_logic", depends_on=["collect_sources"], outputs=[self._out("logic_model", ArtifactType.STRUCTURED_JSON)]),
self._step("build_sequence", "Build sequence", "build_sequence", depends_on=["extract_logic"], outputs=[self._out("sequence_diagram", ArtifactType.TEXT)]), self._step("summarize", "Summarize", "summarize", depends_on=["extract_logic"], outputs=[self._out("final_answer", ArtifactType.TEXT)]),
self._step("build_use_cases", "Build use cases", "build_use_cases", depends_on=["extract_logic"], outputs=[self._out("use_cases", ArtifactType.TEXT)]),
self._step("summarize", "Summarize", "summarize", depends_on=["build_sequence", "build_use_cases"], outputs=[self._out("final_answer", ArtifactType.TEXT)]),
] ]
return self._plan(task, "explain_part_v1", steps, [self._gate("evidence_required"), self._gate("non_empty_answer_or_changeset")]) return self._plan(task, "explain_part_v1", steps, [self._gate("evidence_required"), self._gate("non_empty_answer_or_changeset")])

View File

@@ -117,6 +117,7 @@ class GraphAgentRuntime:
route=route_meta, route=route_meta,
attachments=attachments, attachments=attachments,
files=files, files=files,
rag_items=rag_ctx,
rag_context=self._format_rag(rag_ctx), rag_context=self._format_rag(rag_ctx),
confluence_context=self._format_confluence(conf_pages), confluence_context=self._format_confluence(conf_pages),
files_map=files_map, files_map=files_map,
@@ -368,7 +369,33 @@ class GraphAgentRuntime:
return pages return pages
def _format_rag(self, items: list[dict]) -> str: def _format_rag(self, items: list[dict]) -> str:
return "\n".join(str(x.get("content", "")) for x in items) blocks: list[str] = []
for item in items:
source = str(item.get("source", "") or item.get("path", "") or "")
layer = str(item.get("layer", "") or "").strip()
title = str(item.get("title", "") or "").strip()
metadata = item.get("metadata", {}) or {}
lines = []
if source:
lines.append(f"Source: {source}")
if layer:
lines.append(f"Layer: {layer}")
if title:
lines.append(f"Title: {title}")
if metadata:
hints = []
for key in ("module_id", "qname", "predicate", "entry_type", "framework", "section_path"):
value = metadata.get(key)
if value:
hints.append(f"{key}={value}")
if hints:
lines.append("Meta: " + ", ".join(hints))
content = str(item.get("content", "")).strip()
if content:
lines.append(content)
if lines:
blocks.append("\n".join(lines))
return "\n\n".join(blocks)
def _format_confluence(self, pages: list[dict]) -> str: def _format_confluence(self, pages: list[dict]) -> str:
return "\n".join(str(x.get("content_markdown", "")) for x in pages) return "\n".join(str(x.get("content_markdown", "")) for x in pages)

View File

@@ -3,7 +3,7 @@ from app.modules.agent.repository import AgentRepository
from app.modules.agent.story_context_repository import StoryContextRepository, StoryContextSchemaRepository from app.modules.agent.story_context_repository import StoryContextRepository, StoryContextSchemaRepository
from app.modules.chat.repository import ChatRepository from app.modules.chat.repository import ChatRepository
from app.modules.chat.module import ChatModule from app.modules.chat.module import ChatModule
from app.modules.rag_session.repository import RagRepository from app.modules.rag.persistence.repository import RagRepository
from app.modules.rag_session.module import RagModule from app.modules.rag_session.module import RagModule
from app.modules.rag_repo.module import RagRepoModule from app.modules.rag_repo.module import RagRepoModule
from app.modules.shared.bootstrap import bootstrap_database from app.modules.shared.bootstrap import bootstrap_database

300
app/modules/rag/README.md Normal file
View File

@@ -0,0 +1,300 @@
# Модуль rag
## 1. Функции модуля
- Единое ядро RAG для индексации и retrieval по документации и коду проекта.
- Поддержка двух семейств индексации: `DOCS` и `CODE`, с разными слоями и разными pipeline.
- Хранение `rag_session`, index-jobs, многослойных документов, cache-слоев и retrieval-запросов.
- Поддержка индексации snapshot и changes с переиспользованием cache по `blob_sha`.
- Предоставление контекста для agent/chat, где `DOCS` используется по умолчанию, а `CODE` включается для явных вопросов по реализации.
## 2. Диаграмма классов и взаимосвязей
```mermaid
classDiagram
class RagService
class RagRepository
class RagSchemaRepository
class RagDocumentUpserter
class DocsIndexingPipeline
class CodeIndexingPipeline
class RagQueryRouter
class GigaChatEmbedder
RagService --> RagRepository
RagService --> DocsIndexingPipeline
RagService --> CodeIndexingPipeline
RagService --> RagQueryRouter
RagService --> GigaChatEmbedder
RagRepository --> RagSchemaRepository
RagService --> RagDocumentUpserter
```
## 3. Описание классов
- `RagService`: основной application-service модуля.
Методы: `index_snapshot` — индексирует полный набор файлов; `index_changes` — применяет инкрементальные изменения; `retrieve` — возвращает релевантный контекст из `DOCS` или `CODE`.
- `RagRepository`: фасад persistence-слоя RAG.
Методы: `ensure_tables` — создает/обновляет схему; `upsert_session/get_session/session_exists` — операции по `rag_session`; `create_job/update_job/get_job` — операции по index jobs; `replace_documents/apply_document_changes` — операции по документам; `get_cached_documents/cache_documents` — работа с cache; `retrieve/fallback_chunks` — retrieval.
- `RagSchemaRepository`: управление схемой БД для RAG.
Методы: `ensure_tables` — создает таблицы и индексы; `_ensure_columns` — добавляет новые поля; `_ensure_indexes` — поддерживает индексы для retrieval и фильтрации.
- `RagDocumentUpserter`: батчевый writer многослойных `RagDocument`.
Методы: `replace` — полностью заменяет документы сессии; `apply_changes` — применяет upsert/delete по измененным путям.
- `DocsIndexingPipeline`: pipeline индексации документации.
Методы: `supports` — определяет, относится ли файл к docs; `index_file` — строит документы слоев `D1-D4` для одного файла.
- `CodeIndexingPipeline`: pipeline индексации Python-кода.
Методы: `supports` — определяет, относится ли файл к code; `index_file` — строит документы слоев `C0-C3` для одного файла.
- `RagQueryRouter`: выбирает retrieval mode и активные слои.
Методы: `resolve_mode` — определяет `docs` или `code`; `layers_for_mode` — возвращает набор слоев для retrieval.
- `GigaChatEmbedder`: адаптер embeddings-модели.
Методы: `embed` — возвращает embeddings для списка текстов.
## 4. Сиквенс-диаграммы API и выполнения
### Индексация snapshot через текущий `rag_session` facade
Назначение: создать/обновить `rag_session` и построить многослойный индекс по переданным файлам проекта.
```mermaid
sequenceDiagram
participant Router as RagModule.APIRouter
participant Sessions as RagSessionStore
participant Indexing as IndexingOrchestrator
participant Rag as RagService
participant Docs as DocsIndexingPipeline
participant Code as CodeIndexingPipeline
participant Repo as RagRepository
Router->>Sessions: create(project_id)
Sessions-->>Router: rag_session_id
Router->>Indexing: enqueue_snapshot(rag_session_id, files)
Indexing->>Rag: index_snapshot(rag_session_id, files)
loop for each file
Rag->>Docs: supports/index_file
Rag->>Code: supports/index_file
Rag->>Repo: cache_documents(...)
end
Rag->>Repo: replace_documents(...)
Indexing-->>Router: index_job_id,status
```
### Retrieval для agent/chat
Назначение: вернуть релевантный контекст из нужного семейства слоев.
```mermaid
sequenceDiagram
participant Agent as GraphAgentRuntime
participant Rag as RagService
participant Router as RagQueryRouter
participant Repo as RagRepository
Agent->>Rag: retrieve(rag_session_id, query)
Rag->>Router: resolve_mode(query)
Router-->>Rag: docs|code + layers
Rag->>Repo: retrieve(query_embedding, query_text, layers)
Repo-->>Rag: ranked items
Rag-->>Agent: items
```
## 5. Слои, фиксируемые в RAG
### 5.1. Слои DOCS
#### `D1_MODULE_CATALOG`
Назначение: каталог модулей документации и граф связей между ними.
Основные атрибуты:
- `module_id`
- `type`
- `domain`
- `title`
- `status`
- `version`
- `tags`
- `owners`
- `links`
- `calls_api`
- `called_by`
- `uses_logic`
- `used_by`
- `reads_db`
- `writes_db`
- `integrates_with`
- `emits_events`
- `consumes_events`
- `source_path`
- `summary_text`
#### `D2_FACT_INDEX`
Назначение: атомарные факты `subject-predicate-object` с evidence.
Основные атрибуты:
- `fact_id`
- `subject_id`
- `predicate`
- `object`
- `object_ref`
- `source_path`
- `anchor`
- `line_start`
- `line_end`
- `confidence`
- `tags`
#### `D3_SECTION_INDEX`
Назначение: семантические секции документации, нарезанные по заголовкам.
Основные атрибуты:
- `chunk_id`
- `module_id`
- `section_path`
- `section_title`
- `content`
- `source_path`
- `order`
- `tags`
- `domain`
- `type`
- `embedding`
#### `D4_POLICY_INDEX`
Назначение: глобальные правила и конвенции проекта.
Основные атрибуты:
- `policy_id`
- `applies_to`
- `rules`
- `default_behaviors`
- `source_path`
### 5.2. Слои CODE
#### `C0_SOURCE_CHUNKS`
Назначение: сырой код как источник истины для цитирования и evidence.
Основные атрибуты:
- `lang`
- `repo_id`
- `commit_sha`
- `path`
- `span`
- `title`
- `text`
- `module_or_unit`
- `chunk_type`
- `symbol_id`
- `hash`
#### `C1_SYMBOL_CATALOG`
Назначение: каталог символов кода и их деклараций.
Основные атрибуты:
- `lang`
- `repo_id`
- `commit_sha`
- `symbol_id`
- `qname`
- `kind`
- `decl.path`
- `decl.start_line`
- `decl.end_line`
- `text`
- `visibility`
- `signature`
- `decorators_or_annotations`
- `docstring_or_javadoc`
- `parent_symbol_id`
- `package_or_module`
- `is_entry_candidate`
- `lang_payload`
#### `C2_DEPENDENCY_GRAPH`
Назначение: связи между сущностями кода.
Основные атрибуты:
- `lang`
- `repo_id`
- `commit_sha`
- `edge_id`
- `edge_type`
- `src_symbol_id`
- `dst_symbol_id`
- `dst_ref`
- `evidence.path`
- `evidence.start_line`
- `evidence.end_line`
- `text`
- `resolution`
- `callsite_kind`
- `lang_payload`
#### `C3_ENTRYPOINTS`
Назначение: точки входа приложения и их обработчики.
Основные атрибуты:
- `lang`
- `repo_id`
- `commit_sha`
- `entry_id`
- `entry_type`
- `framework`
- `route_or_command`
- `handler_symbol_id`
- `evidence.path`
- `evidence.start_line`
- `evidence.end_line`
- `text`
- `http.methods`
- `http.auth`
- `request_model`
- `response_model`
- `cli.args_schema`
- `task.queue`
- `task.cron`
- `tags`
- `lang_payload`
#### `C4_PUBLIC_API`
Назначение: публичная поверхность API/экспортируемых символов.
Основные атрибуты:
- `api_id`
- `symbol_id`
- `stability`
- `source_of_truth`
- `versioning_tags`
- `lang_payload`
#### `C5_BEHAVIOR_SUMMARIES`
Назначение: поведенческие summary с обязательными evidence links.
Основные атрибуты:
- `target_type`
- `target_id`
- `text`
- `claims`
- `evidence_links`
- `confidence`
- `generated_by`
- `generated_at`
#### `C6_RUNTIME_TRACES`
Назначение: runtime/trace слой для связи кода и реального исполнения.
Основные атрибуты:
- `env`
- `trace_id`
- `span_id`
- `symbol_id`
- `entry_id`
- `text`
- `timings`
- `service`
- `host`
- `labels`
## 6. Правила retrieval
- По умолчанию retrieval идет в `DOCS`.
- `CODE` используется только для явных вопросов по реализации, устройству кода, endpoint'ам, handler'ам и документации “из кода”.
- Для `DOCS` приоритет слоев: `D1 -> D2 -> D3 -> D4`.
- Для `CODE` приоритет слоев: `C3 -> C1 -> C2 -> C0`.
## 7. Текущий статус реализации
- В первой итерации реализованы `DOCS D1-D4`.
- В первой итерации реализованы `CODE C0-C3`.
- `C4-C6` зафиксированы в контракте и зарезервированы под следующие этапы.
- Текущие `rag_session` и `rag_repo` работают как facade/adapter поверх нового пакета `rag`.

View File

@@ -0,0 +1,25 @@
from app.modules.rag.contracts import (
DocKind,
EvidenceLink,
EvidenceType,
RagDocument,
RagLayer,
RagSource,
RagSpan,
RetrievalItem,
RetrievalMode,
RetrievalQuery,
)
__all__ = [
"DocKind",
"EvidenceLink",
"EvidenceType",
"RagDocument",
"RagLayer",
"RagSource",
"RagSpan",
"RetrievalItem",
"RetrievalMode",
"RetrievalQuery",
]

Binary file not shown.

View File

@@ -0,0 +1,17 @@
from app.modules.rag.contracts.documents import RagDocument, RagSource, RagSpan
from app.modules.rag.contracts.enums import DocKind, EvidenceType, RagLayer, RetrievalMode
from app.modules.rag.contracts.evidence import EvidenceLink
from app.modules.rag.contracts.retrieval import RetrievalItem, RetrievalQuery
__all__ = [
"DocKind",
"EvidenceLink",
"EvidenceType",
"RagDocument",
"RagLayer",
"RagSource",
"RagSpan",
"RetrievalItem",
"RetrievalMode",
"RetrievalQuery",
]

View File

@@ -0,0 +1,77 @@
from __future__ import annotations
from dataclasses import asdict, dataclass, field
from hashlib import sha256
from app.modules.rag.contracts.evidence import EvidenceLink
@dataclass(slots=True)
class RagSource:
    """Identity of the file a RAG document was extracted from."""

    # Repository identifier the file belongs to.
    repo_id: str
    # Commit the snapshot was taken at; None when not pinned to a commit.
    commit_sha: str | None
    # Repo-relative file path.
    path: str

    def to_dict(self) -> dict:
        """Return a plain-dict representation (via dataclasses.asdict)."""
        return asdict(self)
@dataclass(slots=True)
class RagSpan:
    """Line range inside a source file; either bound may be None when unknown."""

    start_line: int | None = None
    end_line: int | None = None

    def to_dict(self) -> dict:
        """Return a plain-dict representation (via dataclasses.asdict)."""
        return asdict(self)
@dataclass(slots=True)
class RagDocument:
    """One indexable RAG document: a layer-tagged text unit with provenance."""

    # Layer name, e.g. "C0_SOURCE_CHUNKS" or "D3_SECTION_INDEX".
    layer: str
    # Where the text came from (repo / commit / path).
    source: RagSource
    title: str
    text: str
    metadata: dict = field(default_factory=dict)
    # Evidence links backing this document's content.
    links: list[EvidenceLink] = field(default_factory=list)
    span: RagSpan | None = None
    # Stable content-derived id; computed lazily by ensure_doc_id().
    doc_id: str | None = None
    lang: str | None = None
    embedding: list[float] | None = None

    def ensure_doc_id(self) -> str:
        """Return the doc id, deriving a deterministic sha256 hex digest if unset.

        The digest covers layer, language, source identity, symbol/module id,
        title and span, so re-indexing identical content yields the same id.
        """
        if self.doc_id:
            return self.doc_id
        span_key = ""
        if self.span is not None:
            span_key = f":{self.span.start_line}:{self.span.end_line}"
        raw = "|".join(
            [
                self.layer,
                self.lang or "",
                self.source.repo_id,
                self.source.commit_sha or "",
                self.source.path,
                # symbol_id wins over module_id; empty string falls through.
                self.metadata.get("symbol_id", "") or self.metadata.get("module_id", ""),
                self.title,
                span_key,
            ]
        )
        self.doc_id = sha256(raw.encode("utf-8")).hexdigest()
        return self.doc_id

    def to_record(self) -> dict:
        """Flatten the document into a persistence-ready dict record.

        Nested dataclasses are unpacked into flat columns; links become a
        list of plain dicts; a missing embedding is stored as [].
        """
        return {
            "doc_id": self.ensure_doc_id(),
            "layer": self.layer,
            "lang": self.lang,
            "repo_id": self.source.repo_id,
            "commit_sha": self.source.commit_sha,
            "path": self.source.path,
            "title": self.title,
            "text": self.text,
            "metadata": dict(self.metadata),
            "links": [link.to_dict() for link in self.links],
            "span_start": self.span.start_line if self.span else None,
            "span_end": self.span.end_line if self.span else None,
            "embedding": self.embedding or [],
        }

View File

@@ -0,0 +1,35 @@
from __future__ import annotations
class RagLayer:
    """String constants naming the RAG index layers (D* = docs, C* = code)."""

    DOCS_MODULE_CATALOG = "D1_MODULE_CATALOG"
    DOCS_FACT_INDEX = "D2_FACT_INDEX"
    DOCS_SECTION_INDEX = "D3_SECTION_INDEX"
    DOCS_POLICY_INDEX = "D4_POLICY_INDEX"
    CODE_SOURCE_CHUNKS = "C0_SOURCE_CHUNKS"
    CODE_SYMBOL_CATALOG = "C1_SYMBOL_CATALOG"
    CODE_DEPENDENCY_GRAPH = "C2_DEPENDENCY_GRAPH"
    CODE_ENTRYPOINTS = "C3_ENTRYPOINTS"
    CODE_PUBLIC_API = "C4_PUBLIC_API"
    CODE_BEHAVIOR_SUMMARIES = "C5_BEHAVIOR_SUMMARIES"
    CODE_RUNTIME_TRACES = "C6_RUNTIME_TRACES"


class RetrievalMode:
    """Retrieval families: documentation layers vs. code layers."""

    DOCS = "docs"
    CODE = "code"


class DocKind:
    """Coarse classification of a documentation file."""

    SPEC = "spec"
    RUNBOOK = "runbook"
    README = "readme"
    MISC = "misc"


class EvidenceType:
    """Kinds of artifacts an EvidenceLink may point at."""

    CODE_SPAN = "code_span"
    SYMBOL = "symbol"
    EDGE = "edge"
    DOC_SECTION = "doc_section"
    DOC_FACT = "doc_fact"

View File

@@ -0,0 +1,16 @@
from __future__ import annotations
from dataclasses import asdict, dataclass
@dataclass(slots=True)
class EvidenceLink:
    """Pointer from a RAG document to the artifact that backs its content."""

    # One of the EvidenceType constants (code_span, symbol, edge, ...).
    type: str
    # Identifier of the target artifact (symbol id, edge id, chunk id, ...).
    target_id: str
    path: str | None = None
    start_line: int | None = None
    end_line: int | None = None
    # Optional free-form annotation.
    note: str | None = None

    def to_dict(self) -> dict:
        """Return a plain-dict representation (via dataclasses.asdict)."""
        return asdict(self)

View File

@@ -0,0 +1,23 @@
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass(slots=True)
class RetrievalQuery:
    """Parameters of a single retrieval request against the RAG store."""

    # Free-text query.
    text: str
    # Retrieval family, e.g. RetrievalMode.DOCS or RetrievalMode.CODE.
    mode: str
    limit: int = 5
    # Layer names to search; empty means the mode's default layer set.
    layers: list[str] = field(default_factory=list)
    # Optional path-prefix filters on document sources.
    path_prefixes: list[str] = field(default_factory=list)
    doc_kind: str | None = None
@dataclass(slots=True)
class RetrievalItem:
    """One ranked retrieval hit returned to callers."""

    # Retrieved text payload.
    content: str
    # Source file path the content came from.
    path: str
    # Layer the hit was found in.
    layer: str
    title: str
    # Relevance score; None when the backend provides no score.
    score: float | None = None
    metadata: dict | None = None

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
import ast
from dataclasses import dataclass
@dataclass(slots=True)
class CodeChunk:
title: str
text: str
start_line: int
end_line: int
chunk_type: str
class CodeTextChunker:
def chunk(self, path: str, text: str) -> list[CodeChunk]:
try:
tree = ast.parse(text)
except SyntaxError:
return self._window_chunks(path, text)
chunks: list[CodeChunk] = []
lines = text.splitlines()
for node in tree.body:
if not isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
continue
start = int(getattr(node, "lineno", 1))
end = int(getattr(node, "end_lineno", start))
body = "\n".join(lines[start - 1 : end]).strip()
if not body:
continue
chunks.append(
CodeChunk(
title=f"{path}:{getattr(node, 'name', 'block')}",
text=body,
start_line=start,
end_line=end,
chunk_type="symbol_block",
)
)
return chunks or self._window_chunks(path, text)
def _window_chunks(self, path: str, text: str) -> list[CodeChunk]:
lines = text.splitlines()
chunks: list[CodeChunk] = []
size = 80
overlap = 15
start = 0
while start < len(lines):
end = min(len(lines), start + size)
body = "\n".join(lines[start:end]).strip()
if body:
chunks.append(CodeChunk(f"{path}:{start + 1}-{end}", body, start + 1, end, "window"))
if end >= len(lines):
break
start = max(0, end - overlap)
return chunks

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.code_text.chunker import CodeChunk
class CodeTextDocumentBuilder:
    """Maps a CodeChunk onto a RagDocument in the CODE_SOURCE_CHUNKS layer."""

    def build(self, source: RagSource, chunk: CodeChunk, *, chunk_index: int) -> RagDocument:
        """Build the document; *chunk_index* is the chunk's position in its file."""
        # Dotted module name derived from the path, e.g. "a/b.py" -> "a.b".
        module_name = source.path.replace("/", ".").removesuffix(".py")
        meta = {
            "chunk_index": chunk_index,
            "chunk_type": chunk.chunk_type,
            "module_or_unit": module_name,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_SOURCE_CHUNKS,
            lang="python",
            source=source,
            title=chunk.title,
            text=chunk.text,
            span=RagSpan(chunk.start_line, chunk.end_line),
            metadata=meta,
        )

View File

@@ -0,0 +1,29 @@
from __future__ import annotations
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.edges.extractor import PyEdge
class EdgeDocumentBuilder:
    """Maps a PyEdge onto a RagDocument in the CODE_DEPENDENCY_GRAPH layer."""

    def build(self, source: RagSource, edge: PyEdge) -> RagDocument:
        """Build the graph document; its text is a compact "src type dst" triple."""
        destination = edge.dst_ref or edge.dst_symbol_id or "unknown"
        evidence = EvidenceLink(
            type=EvidenceType.EDGE,
            target_id=edge.edge_id,
            path=source.path,
            start_line=edge.start_line,
            end_line=edge.end_line,
        )
        return RagDocument(
            layer=RagLayer.CODE_DEPENDENCY_GRAPH,
            lang="python",
            source=source,
            title=f"{edge.src_qname}:{edge.edge_type}",
            text=f"{edge.src_qname} {edge.edge_type} {destination}",
            span=RagSpan(edge.start_line, edge.end_line),
            metadata={
                "edge_id": edge.edge_id,
                "edge_type": edge.edge_type,
                "src_symbol_id": edge.src_symbol_id,
                "src_qname": edge.src_qname,
                "dst_symbol_id": edge.dst_symbol_id,
                "dst_ref": edge.dst_ref,
                "resolution": edge.resolution,
                "lang_payload": edge.metadata,
                "artifact_type": "CODE",
            },
            links=[evidence],
        )

View File

@@ -0,0 +1,114 @@
from __future__ import annotations
import ast
from dataclasses import dataclass, field
from hashlib import sha256
@dataclass(slots=True)
class PyEdge:
    """One directed relation (inherits/imports/calls) between code entities."""

    # sha256 of "path|src qname|type|dst ref|line" — stable per occurrence.
    edge_id: str
    # "inherits" | "imports" | "calls".
    edge_type: str
    src_symbol_id: str
    src_qname: str
    # Resolved destination symbol when known within the same file.
    dst_symbol_id: str | None
    # Raw textual reference to the destination (dotted name).
    dst_ref: str | None
    path: str
    # 1-based inclusive source range of the referencing node.
    start_line: int
    end_line: int
    # "resolved" when dst_symbol_id was found, otherwise "partial".
    resolution: str = "partial"
    metadata: dict = field(default_factory=dict)
class EdgeExtractor:
    """Extracts inherits/imports/calls edges from a parsed Python module."""

    def extract(self, path: str, ast_tree: ast.AST | None, symbols: list) -> list[PyEdge]:
        """Return edges for *path*; empty when the module failed to parse."""
        if ast_tree is None:
            return []
        # Map qualified names to symbol ids so edge endpoints can be resolved.
        known = {entry.qname: entry.symbol_id for entry in symbols}
        walker = _EdgeVisitor(path, known)
        walker.visit(ast_tree)
        return walker.edges
class _EdgeVisitor(ast.NodeVisitor):
    """AST visitor that collects PyEdge records for one module.

    A stack of enclosing definition names attributes every edge to the
    qualified name of the class/function it occurs in.  Module-level imports
    are skipped (there is no enclosing symbol to attach them to).
    """

    def __init__(self, path: str, qname_map: dict[str, str]) -> None:
        self._path = path
        # qname -> symbol_id for symbols in the same file; drives resolution.
        self._qname_map = qname_map
        # Names of the enclosing definitions, outermost first.
        self._scope: list[str] = []
        self.edges: list[PyEdge] = []

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        """Record one "inherits" edge per base class, then descend."""
        current = self._enter(node.name)
        for base in node.bases:
            self._add_edge("inherits", current, self._name(base), base)
        self.generic_visit(node)
        self._scope.pop()

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        self._visit_function(node)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        self._visit_function(node)

    def visit_Import(self, node: ast.Import) -> None:
        """Record "imports" edges for ``import a, b`` inside a definition."""
        current = self._current_qname()
        if not current:
            # Module-level import: intentionally skipped.
            return
        for item in node.names:
            self._add_edge("imports", current, item.name, node)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        """Record "imports" edges for ``from m import x`` inside a definition."""
        current = self._current_qname()
        if not current:
            return
        module = node.module or ""
        for item in node.names:
            self._add_edge("imports", current, f"{module}.{item.name}".strip("."), node)

    def _visit_function(self, node) -> None:
        """Record a "calls" edge for every Call under the function, then descend.

        NOTE(review): ast.walk also yields calls inside nested functions, so a
        call in a nested def is attributed to both the outer and (via
        generic_visit) the inner function — confirm this is intended.
        """
        current = self._enter(node.name)
        for inner in ast.walk(node):
            if isinstance(inner, ast.Call):
                self._add_edge("calls", current, self._name(inner.func), inner, {"callsite_kind": "function_call"})
        self.generic_visit(node)
        self._scope.pop()

    def _enter(self, name: str) -> str:
        """Push *name* onto the scope stack and return the new qualified name."""
        self._scope.append(name)
        return self._current_qname() or name

    def _current_qname(self) -> str | None:
        """Dotted name of the current scope, or None at module level."""
        if not self._scope:
            return None
        return ".".join(self._scope)

    def _add_edge(self, edge_type: str, src_qname: str, dst_ref: str, node, extra: dict | None = None) -> None:
        """Append one edge; endpoint resolution is best-effort via qname_map."""
        if not dst_ref:
            return
        # Fall back to a hash of the qname when the source symbol is unknown.
        src_symbol_id = self._qname_map.get(src_qname, sha256(src_qname.encode("utf-8")).hexdigest())
        dst_symbol_id = self._qname_map.get(dst_ref)
        # The line number participates in the id so repeated edges stay distinct.
        edge_id = sha256(f"{self._path}|{src_qname}|{edge_type}|{dst_ref}|{getattr(node, 'lineno', 1)}".encode("utf-8")).hexdigest()
        self.edges.append(
            PyEdge(
                edge_id=edge_id,
                edge_type=edge_type,
                src_symbol_id=src_symbol_id,
                src_qname=src_qname,
                dst_symbol_id=dst_symbol_id,
                dst_ref=dst_ref,
                path=self._path,
                start_line=int(getattr(node, "lineno", 1)),
                end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
                resolution="resolved" if dst_symbol_id else "partial",
                metadata=extra or {},
            )
        )

    def _name(self, node) -> str:
        """Dotted-name rendering of an expression; empty string when unsupported."""
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            return f"{self._name(node.value)}.{node.attr}"
        if isinstance(node, ast.Call):
            return self._name(node.func)
        return ""

View File

@@ -0,0 +1,26 @@
from __future__ import annotations
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class EntrypointDocumentBuilder:
    """Maps an Entrypoint onto a RagDocument in the CODE_ENTRYPOINTS layer."""

    def build(self, source: RagSource, entrypoint: Entrypoint) -> RagDocument:
        """Build the document; its text summarizes framework, kind and route."""
        summary = f"{entrypoint.framework} {entrypoint.entry_type} {entrypoint.route_or_command}"
        evidence = EvidenceLink(
            type=EvidenceType.CODE_SPAN,
            target_id=entrypoint.entry_id,
            path=source.path,
            start_line=entrypoint.start_line,
            end_line=entrypoint.end_line,
        )
        return RagDocument(
            layer=RagLayer.CODE_ENTRYPOINTS,
            lang="python",
            source=source,
            title=entrypoint.route_or_command,
            text=summary,
            span=RagSpan(entrypoint.start_line, entrypoint.end_line),
            metadata={
                "entry_id": entrypoint.entry_id,
                "entry_type": entrypoint.entry_type,
                "framework": entrypoint.framework,
                "route_or_command": entrypoint.route_or_command,
                "handler_symbol_id": entrypoint.handler_symbol_id,
                "lang_payload": entrypoint.metadata,
                "artifact_type": "CODE",
            },
            links=[evidence],
        )

View File

@@ -0,0 +1,34 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class FastApiEntrypointDetector:
    """Finds FastAPI HTTP handlers by inspecting decorator names on symbols."""

    _METHODS = {"get", "post", "put", "patch", "delete"}

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one Entrypoint per HTTP-verb or ``.route`` decorator."""
        found: list[Entrypoint] = []
        for sym in symbols:
            for deco in sym.decorators or []:
                lowered = deco.lower()
                last_part = lowered.split(".")[-1]
                is_verb = last_part in self._METHODS
                if not is_verb and ".route" not in lowered:
                    continue
                # Keep only the call argument when the decorator string has one.
                route = deco.split("(")[-1].rstrip(")") if "(" in deco else deco
                found.append(
                    Entrypoint(
                        entry_id=sha256(f"{path}|fastapi|{sym.symbol_id}|{deco}".encode("utf-8")).hexdigest(),
                        entry_type="http",
                        framework="fastapi",
                        route_or_command=route,
                        handler_symbol_id=sym.symbol_id,
                        path=path,
                        start_line=sym.start_line,
                        end_line=sym.end_line,
                        metadata={"methods": [last_part.upper()] if is_verb else []},
                    )
                )
        return found

View File

@@ -0,0 +1,28 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class FlaskEntrypointDetector:
    """Finds Flask HTTP handlers via ``*.route`` decorators."""

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one Entrypoint per ``.route`` decorator found on a symbol."""
        found: list[Entrypoint] = []
        for sym in symbols:
            routed = [d for d in (sym.decorators or []) if ".route" in d.lower()]
            for deco in routed:
                key = f"{path}|flask|{sym.symbol_id}|{deco}"
                found.append(
                    Entrypoint(
                        entry_id=sha256(key.encode("utf-8")).hexdigest(),
                        entry_type="http",
                        framework="flask",
                        route_or_command=deco,
                        handler_symbol_id=sym.symbol_id,
                        path=path,
                        start_line=sym.start_line,
                        end_line=sym.end_line,
                    )
                )
        return found

View File

@@ -0,0 +1,27 @@
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass(slots=True)
class Entrypoint:
    """A detected application entrypoint (HTTP route or CLI command)."""

    # Stable hash identifying this entrypoint occurrence.
    entry_id: str
    # "http" or "cli" (as emitted by the detectors).
    entry_type: str
    # Detecting framework: "fastapi", "flask", "typer", "click".
    framework: str
    # The route path or command expression as rendered from the decorator.
    route_or_command: str
    # symbol_id of the handler function.
    handler_symbol_id: str
    path: str
    # 1-based inclusive source range of the handler.
    start_line: int
    end_line: int
    metadata: dict = field(default_factory=dict)
class EntrypointDetectorRegistry:
    """Fans one detection request out to every registered detector."""

    def __init__(self, detectors: list) -> None:
        self._detectors = detectors

    def detect_all(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Concatenate all detector results, preserving registration order."""
        results: list[Entrypoint] = []
        for one in self._detectors:
            results += one.detect(path=path, symbols=symbols)
        return results

View File

@@ -0,0 +1,29 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class TyperClickEntrypointDetector:
    """Finds Typer/Click CLI commands via ``.command``/``.callback`` decorators."""

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one CLI Entrypoint per matching decorator on a symbol."""
        found: list[Entrypoint] = []
        for sym in symbols:
            for deco in sym.decorators or []:
                lowered = deco.lower()
                if ".command" not in lowered and ".callback" not in lowered:
                    continue
                # Attribute the entrypoint to typer when the decorator mentions it.
                tool = "typer" if "typer" in lowered else "click"
                found.append(
                    Entrypoint(
                        entry_id=sha256(f"{path}|{tool}|{sym.symbol_id}|{deco}".encode("utf-8")).hexdigest(),
                        entry_type="cli",
                        framework=tool,
                        route_or_command=deco,
                        handler_symbol_id=sym.symbol_id,
                        path=path,
                        start_line=sym.start_line,
                        end_line=sym.end_line,
                    )
                )
        return found

View File

@@ -0,0 +1,13 @@
from __future__ import annotations
from pathlib import PurePosixPath
class PythonFileFilter:
    """Decides which repository paths are indexable Python sources."""

    _EXCLUDE_PARTS = {"venv", ".venv", "__pycache__", "node_modules", ".git", "dist", "build"}

    def should_index(self, path: str) -> bool:
        """True for ``*.py`` files not located under an excluded directory."""
        candidate = PurePosixPath(path)
        if candidate.suffix.lower() != ".py":
            return False
        for segment in candidate.parts:
            if segment in self._EXCLUDE_PARTS:
                return False
        return True

View File

@@ -0,0 +1,52 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagSource
from app.modules.rag.indexing.code.code_text.chunker import CodeTextChunker
from app.modules.rag.indexing.code.code_text.document_builder import CodeTextDocumentBuilder
from app.modules.rag.indexing.code.edges.document_builder import EdgeDocumentBuilder
from app.modules.rag.indexing.code.edges.extractor import EdgeExtractor
from app.modules.rag.indexing.code.entrypoints.document_builder import EntrypointDocumentBuilder
from app.modules.rag.indexing.code.entrypoints.fastapi_detector import FastApiEntrypointDetector
from app.modules.rag.indexing.code.entrypoints.flask_detector import FlaskEntrypointDetector
from app.modules.rag.indexing.code.entrypoints.registry import EntrypointDetectorRegistry
from app.modules.rag.indexing.code.entrypoints.typer_click_detector import TyperClickEntrypointDetector
from app.modules.rag.indexing.code.file_filter import PythonFileFilter
from app.modules.rag.indexing.code.symbols.ast_parser import PythonAstParser
from app.modules.rag.indexing.code.symbols.document_builder import SymbolDocumentBuilder
from app.modules.rag.indexing.code.symbols.extractor import SymbolExtractor
class CodeIndexingPipeline:
    """End-to-end indexing of one Python file into RAG documents.

    Combines four extraction passes: raw code chunks, the symbol catalog,
    dependency edges, and framework entrypoints.
    """

    def __init__(self) -> None:
        self._filter = PythonFileFilter()
        self._chunker = CodeTextChunker()
        self._code_builder = CodeTextDocumentBuilder()
        self._parser = PythonAstParser()
        self._symbols = SymbolExtractor()
        self._symbol_builder = SymbolDocumentBuilder()
        self._edges = EdgeExtractor()
        self._edge_builder = EdgeDocumentBuilder()
        detectors = [
            FastApiEntrypointDetector(),
            FlaskEntrypointDetector(),
            TyperClickEntrypointDetector(),
        ]
        self._entrypoints = EntrypointDetectorRegistry(detectors)
        self._entrypoint_builder = EntrypointDocumentBuilder()

    def supports(self, path: str) -> bool:
        """True when *path* passes the Python file filter."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Run every extraction pass over one file and collect the documents."""
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        documents: list[RagDocument] = []
        for position, piece in enumerate(self._chunker.chunk(path, content)):
            documents.append(self._code_builder.build(source, piece, chunk_index=position))
        # Symbols feed both the edge extractor and the entrypoint detectors.
        module_ast = self._parser.parse_module(content)
        symbol_list = self._symbols.extract(path, content, module_ast)
        documents.extend(self._symbol_builder.build(source, sym) for sym in symbol_list)
        documents.extend(
            self._edge_builder.build(source, edge)
            for edge in self._edges.extract(path, module_ast, symbol_list)
        )
        documents.extend(
            self._entrypoint_builder.build(source, entry)
            for entry in self._entrypoints.detect_all(path=path, symbols=symbol_list)
        )
        return documents

View File

@@ -0,0 +1,11 @@
from __future__ import annotations
import ast
class PythonAstParser:
    """Thin wrapper around ast.parse that swallows syntax errors."""

    def parse_module(self, text: str) -> ast.AST | None:
        """Parse *text* as a module; None when it is not valid Python."""
        try:
            parsed = ast.parse(text)
        except SyntaxError:
            return None
        return parsed

View File

@@ -0,0 +1,32 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.symbols.extractor import PySymbol
class SymbolDocumentBuilder:
    """Maps a PySymbol onto a RagDocument in the CODE_SYMBOL_CATALOG layer."""

    def build(self, source: RagSource, symbol: PySymbol) -> RagDocument:
        """Build the catalog document; text is "kind qname", signature, docstring."""
        parts = [f"{symbol.kind} {symbol.qname}", symbol.signature]
        if symbol.docstring:
            parts.append(symbol.docstring.strip())
        document_text = "\n".join(p for p in parts if p)
        module_name = source.path.replace("/", ".").removesuffix(".py")
        return RagDocument(
            layer=RagLayer.CODE_SYMBOL_CATALOG,
            lang="python",
            source=source,
            title=symbol.qname,
            text=document_text,
            span=RagSpan(symbol.start_line, symbol.end_line),
            metadata={
                "symbol_id": symbol.symbol_id,
                "qname": symbol.qname,
                "kind": symbol.kind,
                "signature": symbol.signature,
                "decorators_or_annotations": symbol.decorators,
                "docstring_or_javadoc": symbol.docstring,
                "parent_symbol_id": symbol.parent_symbol_id,
                "package_or_module": module_name,
                # Decorated symbols are candidate HTTP/CLI entrypoints.
                "is_entry_candidate": bool(symbol.decorators),
                "lang_payload": symbol.lang_payload,
                "artifact_type": "CODE",
            },
        )

View File

@@ -0,0 +1,130 @@
from __future__ import annotations
import ast
from dataclasses import dataclass, field
from hashlib import sha256
@dataclass(slots=True)
class PySymbol:
symbol_id: str
qname: str
kind: str
path: str
start_line: int
end_line: int
signature: str
decorators: list[str] = field(default_factory=list)
docstring: str | None = None
parent_symbol_id: str | None = None
lang_payload: dict = field(default_factory=dict)
class SymbolExtractor:
def extract(self, path: str, text: str, ast_tree: ast.AST | None) -> list[PySymbol]:
if ast_tree is None:
return []
collector = _SymbolVisitor(path)
collector.visit(ast_tree)
return collector.symbols
class _SymbolVisitor(ast.NodeVisitor):
def __init__(self, path: str) -> None:
self._path = path
self._stack: list[tuple[str, str]] = []
self.symbols: list[PySymbol] = []
def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
if self._stack:
return
module = node.module or ""
for item in node.names:
local_name = item.asname or item.name
imported_name = f"{module}.{item.name}".strip(".")
self.symbols.append(
PySymbol(
symbol_id=sha256(f"{self._path}|{local_name}|import_alias".encode("utf-8")).hexdigest(),
qname=local_name,
kind="const",
path=self._path,
start_line=int(getattr(node, "lineno", 1)),
end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
signature=f"{local_name} = {imported_name}",
lang_payload={"imported_from": imported_name, "import_alias": True},
)
)
self.generic_visit(node)
def visit_Import(self, node: ast.Import) -> None:
if self._stack:
return
for item in node.names:
local_name = item.asname or item.name
self.symbols.append(
PySymbol(
symbol_id=sha256(f"{self._path}|{local_name}|import".encode("utf-8")).hexdigest(),
qname=local_name,
kind="const",
path=self._path,
start_line=int(getattr(node, "lineno", 1)),
end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
signature=f"import {item.name}",
lang_payload={"imported_from": item.name, "import_alias": bool(item.asname)},
)
)
self.generic_visit(node)
def visit_ClassDef(self, node: ast.ClassDef) -> None:
self._add_symbol(node, "class", {"bases": [self._expr_name(base) for base in node.bases]})
self.generic_visit(node)
self._stack.pop()
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
self._add_function(node, is_async=False)
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
self._add_function(node, is_async=True)
def _add_function(self, node, *, is_async: bool) -> None:
kind = "method" if self._stack and self._stack[-1][0] == "class" else "function"
self._add_symbol(node, kind, {"async": is_async})
self.generic_visit(node)
self._stack.pop()
def _add_symbol(self, node, kind: str, lang_payload: dict) -> None:
names = [name for _, name in self._stack] + [node.name]
qname = ".".join(names)
symbol_id = sha256(f"{self._path}|{qname}|{kind}".encode("utf-8")).hexdigest()
signature = self._signature(node)
symbol = PySymbol(
symbol_id=symbol_id,
qname=qname,
kind=kind,
path=self._path,
start_line=int(getattr(node, "lineno", 1)),
end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
signature=signature,
decorators=[self._expr_name(item) for item in getattr(node, "decorator_list", [])],
docstring=ast.get_docstring(node),
parent_symbol_id=self._stack[-1][1] if self._stack else None,
lang_payload=lang_payload,
)
self.symbols.append(symbol)
self._stack.append((kind, qname))
def _signature(self, node) -> str:
if isinstance(node, ast.ClassDef):
bases = ", ".join(self._expr_name(base) for base in node.bases)
return f"{node.name}({bases})" if bases else node.name
args = [arg.arg for arg in getattr(node.args, "args", [])]
return f"{node.name}({', '.join(args)})"
def _expr_name(self, node) -> str:
if isinstance(node, ast.Name):
return node.id
if isinstance(node, ast.Attribute):
return f"{self._expr_name(node.value)}.{node.attr}"
if isinstance(node, ast.Call):
return self._expr_name(node.func)
return ast.dump(node, include_attributes=False)

View File

@@ -0,0 +1,15 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument
from app.modules.rag.persistence.repository import RagRepository
class RagDocumentUpserter:
    """Thin write-facade that forwards document persistence to RagRepository."""

    def __init__(self, repository: RagRepository) -> None:
        self._repository = repository

    def replace(self, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Replace the session's stored documents with *docs* (full rebuild)."""
        self._repository.replace_documents(rag_session_id, docs)

    def apply_changes(self, rag_session_id: str, delete_paths: list[str], docs: list[RagDocument]) -> None:
        """Incremental update: *delete_paths* to remove, *docs* to write.

        Exact semantics (ordering, conflict handling) are delegated to the
        repository implementation.
        """
        self._repository.apply_document_changes(rag_session_id, delete_paths, docs)

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass(slots=True)
class IndexReport:
    """Aggregated counters for one indexing run."""

    indexed_files: int = 0
    failed_files: int = 0
    cache_hit_files: int = 0
    cache_miss_files: int = 0
    # Total number of RAG documents produced.
    documents: int = 0
    # Non-fatal issues collected along the way.
    warnings: list[str] = field(default_factory=list)

    def as_tuple(self) -> tuple[int, int, int, int]:
        """Return (indexed, failed, cache hits, cache misses).

        Note: ``documents`` and ``warnings`` are intentionally not included.
        """
        return (
            self.indexed_files,
            self.failed_files,
            self.cache_hit_files,
            self.cache_miss_files,
        )

View File

@@ -0,0 +1,77 @@
from __future__ import annotations
from dataclasses import dataclass
from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker
@dataclass(slots=True)
class SectionChunk:
    """One markdown section produced by MarkdownDocChunker."""

    # Heading hierarchy joined with " > ", e.g. "API > Errors".
    section_path: str
    # Title of the innermost heading (or "Document" for preamble/fallback).
    section_title: str
    # Section body text, stripped.
    content: str
    # Ordinal of the section within the document.
    order: int
class MarkdownDocChunker:
    """Splits markdown text into per-heading sections.

    Falls back to fixed-size text chunks (DocTextChunker) when no non-empty
    section is produced at all.
    """

    def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
        self._fallback = text_chunker or DocTextChunker()

    def chunk(self, text: str) -> list[SectionChunk]:
        """Return one SectionChunk per heading (plus any preamble content)."""
        lines = text.splitlines()
        sections: list[SectionChunk] = []
        # Stack of (heading level, title) for the currently open headings.
        stack: list[tuple[int, str]] = []
        current_title = "Document"  # title used for preamble before any heading
        current_lines: list[str] = []
        order = 0
        for line in lines:
            heading = self._heading(line)
            if heading is None:
                current_lines.append(line)
                continue
            # A new heading closes the section accumulated so far (flushed
            # before the stack is updated, so it keeps its own path).
            self._flush_section(sections, stack, current_title, current_lines, order)
            order += 1
            level, title = heading
            # Drop headings at the same or deeper level to keep a proper path.
            stack = [item for item in stack if item[0] < level]
            stack.append((level, title))
            current_title = title
            current_lines = []
        self._flush_section(sections, stack, current_title, current_lines, order)
        if sections:
            return sections
        # No usable sections: degrade to plain character chunks.
        chunks = self._fallback.split(text)
        return [
            SectionChunk(section_path="Document", section_title="Document", content=chunk, order=index)
            for index, chunk in enumerate(chunks)
        ]

    def _flush_section(
        self,
        sections: list[SectionChunk],
        stack: list[tuple[int, str]],
        current_title: str,
        current_lines: list[str],
        order: int,
    ) -> None:
        """Append the accumulated lines as a section; empty content is dropped."""
        content = "\n".join(current_lines).strip()
        if not content:
            return
        titles = [title for _, title in stack] or [current_title]
        sections.append(
            SectionChunk(
                section_path=" > ".join(titles),
                section_title=titles[-1],
                content=content,
                order=order,
            )
        )

    def _heading(self, line: str) -> tuple[int, str] | None:
        """Parse an ATX ``#`` heading; return (level, title) or None."""
        stripped = line.strip()
        if not stripped.startswith("#"):
            return None
        level = len(stripped) - len(stripped.lstrip("#"))
        title = stripped[level:].strip()
        if not title:
            # "#" with no title text is not treated as a heading.
            return None
        return level, title

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
class DocTextChunker:
    """Splits plain text into fixed-size character chunks with overlap."""

    def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None:
        # Overlap should be smaller than the window; split() additionally
        # enforces forward progress for pathological configurations.
        self._max_chars = max_chars
        self._overlap_chars = overlap_chars

    def split(self, text: str) -> list[str]:
        """Return non-empty, stripped chunks covering *text*.

        Fix: with ``overlap_chars >= max_chars`` the previous rewind
        ``start = max(0, end - overlap)`` never advanced and looped forever;
        the window start now always moves forward by at least one character.
        Behavior for sane configurations (overlap < max_chars) is unchanged.
        """
        cleaned = text.strip()
        if not cleaned:
            return []
        chunks: list[str] = []
        start = 0
        total = len(cleaned)
        while start < total:
            end = min(total, start + self._max_chars)
            piece = cleaned[start:end].strip()
            if piece:
                chunks.append(piece)
            if end >= total:
                break
            # Guarantee forward progress even for pathological overlap values.
            start = max(start + 1, end - self._overlap_chars)
        return chunks

View File

@@ -0,0 +1,18 @@
from __future__ import annotations
from pathlib import PurePosixPath
from app.modules.rag.contracts import DocKind
class DocsClassifier:
    """Heuristic doc-kind classification based only on the file path."""

    def classify(self, path: str) -> str:
        """Return a DocKind value for *path*; check order defines priority."""
        file_name = PurePosixPath(path).name.upper()
        full_lower = path.lower()
        if "runbook" in full_lower or file_name.startswith("RUNBOOK"):
            return DocKind.RUNBOOK
        if file_name.startswith("README"):
            return DocKind.README
        if "spec" in full_lower or "architecture" in full_lower:
            return DocKind.SPEC
        return DocKind.MISC

View File

@@ -0,0 +1,115 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import SectionChunk
class DocsDocumentBuilder:
    """Builds RagDocuments for the docs layers: catalog, sections, policies, facts."""

    def build_module_catalog(self, source: RagSource, frontmatter: dict, summary_text: str, doc_kind: str) -> RagDocument | None:
        """Build a DOCS_MODULE_CATALOG document.

        Returns None unless the frontmatter declares all of id/type/domain —
        only fully described modules enter the catalog.
        """
        module_id = str(frontmatter.get("id") or "").strip()
        module_type = str(frontmatter.get("type") or "").strip()
        domain = str(frontmatter.get("domain") or "").strip()
        if not module_id or not module_type or not domain:
            return None
        links = frontmatter.get("links") or {}
        metadata = {
            "module_id": module_id,
            "type": module_type,
            "domain": domain,
            "status": frontmatter.get("status"),
            "version": frontmatter.get("version"),
            "tags": frontmatter.get("tags") or [],
            "owners": frontmatter.get("owners") or [],
            "links": links,
            "source_path": source.path,
            "summary_text": summary_text[:4000],  # capped to keep rows bounded
            "doc_kind": doc_kind,
        }
        # Promote the well-known relation lists to top-level metadata keys
        # (missing relations default to empty lists).
        metadata.update({name: links.get(name, []) for name in (
            "calls_api", "called_by", "uses_logic", "used_by", "reads_db", "writes_db",
            "integrates_with", "emits_events", "consumes_events",
        )})
        return RagDocument(
            layer=RagLayer.DOCS_MODULE_CATALOG,
            source=source,
            title=module_id,
            text=summary_text[:4000] or module_id,
            metadata=metadata,
        )

    def build_section(self, source: RagSource, chunk: SectionChunk, frontmatter: dict, doc_kind: str) -> RagDocument:
        """Build a DOCS_SECTION_INDEX document for one markdown section."""
        # Fall back to the file path when the frontmatter has no id.
        module_id = str(frontmatter.get("id") or source.path)
        metadata = {
            "module_id": module_id,
            "type": frontmatter.get("type"),
            "domain": frontmatter.get("domain"),
            "tags": frontmatter.get("tags") or [],
            "section_path": chunk.section_path,
            "section_title": chunk.section_title,
            "order": chunk.order,
            "doc_kind": doc_kind,
            "source_path": source.path,
            "artifact_type": "DOCS",
        }
        return RagDocument(
            layer=RagLayer.DOCS_SECTION_INDEX,
            source=source,
            title=f"{module_id}:{chunk.section_title}",
            text=chunk.content,
            metadata=metadata,
        )

    def build_policy(self, source: RagSource, frontmatter: dict, chunk: SectionChunk, doc_kind: str) -> RagDocument | None:
        """Build a DOCS_POLICY_INDEX document; None when the frontmatter has no id."""
        policy_id = str(frontmatter.get("id") or "").strip()
        applies_to = frontmatter.get("applies_to") or frontmatter.get("type")
        if not policy_id:
            return None
        metadata = {
            "policy_id": policy_id,
            "applies_to": applies_to,
            "rules": chunk.content[:4000],
            "default_behaviors": frontmatter.get("default_behaviors") or [],
            "doc_kind": doc_kind,
            "section_path": chunk.section_path,
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_POLICY_INDEX,
            source=source,
            title=policy_id,
            text=chunk.content[:4000],
            metadata=metadata,
        )

    def build_fact(
        self,
        source: RagSource,
        *,
        subject_id: str,
        predicate: str,
        obj: str,
        object_ref: str | None,
        anchor: str,
        tags: list[str] | None = None,
    ) -> RagDocument:
        """Build a DOCS_FACT_INDEX triple document (subject, predicate, object).

        The fact id hashes the triple plus its location, so identical triples
        from different files/anchors stay distinct.
        """
        fact_id = sha256(f"{subject_id}|{predicate}|{obj}|{source.path}|{anchor}".encode("utf-8")).hexdigest()
        metadata = {
            "fact_id": fact_id,
            "subject_id": subject_id,
            "predicate": predicate,
            "object": obj,
            "object_ref": object_ref,
            "anchor": anchor,
            "tags": tags or [],
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_FACT_INDEX,
            source=source,
            title=f"{subject_id}:{predicate}",
            text=f"{subject_id} {predicate} {obj}".strip(),
            metadata=metadata,
            links=[EvidenceLink(type=EvidenceType.DOC_FACT, target_id=fact_id, path=source.path, note=anchor)],
        )

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
from pathlib import PurePosixPath
class DocsFileFilter:
    """Selects documentation-like files for indexing."""

    _EXTENSIONS = {".md", ".rst", ".txt", ".adoc"}
    _NAMES = ("README", "CHANGELOG", "CONTRIBUTING", "ARCHITECTURE", "SPEC", "RUNBOOK")
    _EXCLUDE_PARTS = {"node_modules", ".git", "vendor", "dist", "build", "target", "__pycache__"}

    def should_index(self, path: str) -> bool:
        """True for doc extensions, well-known doc names, or docs/ directories."""
        candidate = PurePosixPath(path)
        for part in candidate.parts:
            if part in self._EXCLUDE_PARTS:
                return False
        if candidate.suffix.lower() in self._EXTENSIONS:
            return True
        name = candidate.name.upper()
        for prefix in self._NAMES:
            if name.startswith(prefix):
                return True
        lowered_path = "/".join(candidate.parts).lower()
        return any(marker in lowered_path for marker in ("docs/", "doc/", "documentation/"))

View File

@@ -0,0 +1,135 @@
from __future__ import annotations
import re
import yaml
from app.modules.rag.contracts import RagDocument, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker
from app.modules.rag.indexing.docs.classifier import DocsClassifier
from app.modules.rag.indexing.docs.document_builder import DocsDocumentBuilder
from app.modules.rag.indexing.docs.file_filter import DocsFileFilter
class DocsIndexingPipeline:
    """Indexes one documentation file into RAG documents.

    Per file it produces: an optional module-catalog document (when YAML
    frontmatter declares id/type/domain), one document per markdown section,
    an optional policy document, and fact documents mined from frontmatter
    links, markdown tables and bullet lists.
    """

    def __init__(self) -> None:
        self._filter = DocsFileFilter()
        self._classifier = DocsClassifier()
        self._chunker = MarkdownDocChunker()
        self._builder = DocsDocumentBuilder()

    def supports(self, path: str) -> bool:
        """True when *path* looks like a documentation file worth indexing."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Build every RAG document for one documentation file."""
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        frontmatter, body = self._split_frontmatter(content)
        doc_kind = self._classifier.classify(path)
        sections = self._chunker.chunk(body)
        summary_text = self._summary_from_sections(sections)
        docs: list[RagDocument] = []
        module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind)
        if module_doc is not None:
            docs.append(module_doc)
        for section in sections:
            docs.append(self._builder.build_section(source, section, frontmatter, doc_kind))
        if str(frontmatter.get("type") or "").strip() == "policy":
            # Only the first section is promoted to the policy index.
            for section in sections[:1]:
                policy = self._builder.build_policy(source, frontmatter, section, doc_kind)
                if policy is not None:
                    docs.append(policy)
        docs.extend(self._extract_facts(source, frontmatter, sections))
        return docs

    def _split_frontmatter(self, content: str) -> tuple[dict, str]:
        """Split YAML frontmatter from the document body.

        Fix: the previous implementation unpacked ``content.split("---", 2)``
        into exactly three names, which raised ValueError on an opening
        ``---`` without a closing one, and it let malformed YAML propagate.
        Both cases now gracefully fall back to "no frontmatter".
        """
        if not content.startswith("---\n"):
            return {}, content
        parts = content.split("---", 2)
        if len(parts) < 3:
            # Opening marker without a closing one: treat as plain body.
            return {}, content
        try:
            payload = yaml.safe_load(parts[1]) or {}
        except yaml.YAMLError:
            return {}, content
        return (payload if isinstance(payload, dict) else {}), parts[2].strip()

    def _summary_from_sections(self, sections) -> str:
        """Concatenate the first two sections, capped at 4000 characters."""
        text = "\n\n".join(section.content for section in sections[:2]).strip()
        return text[:4000]

    def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]:
        """Mine fact documents from frontmatter links, tables and lists."""
        subject_id = str(frontmatter.get("id") or source.path)
        docs: list[RagDocument] = []
        links = frontmatter.get("links") or {}
        for predicate, values in links.items():
            for value in values or []:
                docs.append(
                    self._builder.build_fact(
                        source,
                        subject_id=subject_id,
                        predicate=predicate,
                        obj=str(value),
                        object_ref=str(value),
                        anchor="frontmatter.links",
                    )
                )
        for section in sections:
            docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content))
            docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content))
        return docs

    def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Parse a markdown table (header, separator, rows) into facts."""
        lines = [line.strip() for line in content.splitlines() if line.strip()]
        # Need at least header + separator + one row, and a pipe in the header.
        if len(lines) < 3 or "|" not in lines[0]:
            return []
        headers = [part.strip().lower() for part in lines[0].strip("|").split("|")]
        if not all(headers):
            return []
        docs: list[RagDocument] = []
        for row in lines[2:]:
            if "|" not in row:
                continue
            values = [part.strip() for part in row.strip("|").split("|")]
            if len(values) != len(headers):
                # Ragged row: skip rather than mis-assign columns.
                continue
            payload = dict(zip(headers, values))
            docs.extend(self._facts_from_row(source, subject_id, title, payload))
        return docs

    def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]:
        """Turn one table row into facts, keyed on the section title."""
        docs: list[RagDocument] = []
        name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code")
        if "request" in title.lower() or "response" in title.lower():
            if name:
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_field", obj=name, object_ref=None, anchor=title))
                if payload.get("required"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_required", obj=f"{name}:{payload['required']}", object_ref=None, anchor=title))
                if payload.get("type"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_type", obj=f"{name}:{payload['type']}", object_ref=None, anchor=title))
                if payload.get("validation"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_validation", obj=f"{name}:{payload['validation']}", object_ref=None, anchor=title))
        if "error" in title.lower():
            if payload.get("status"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_status", obj=payload["status"], object_ref=None, anchor=title))
            if payload.get("error") or payload.get("code"):
                error_value = payload.get("error") or payload.get("code")
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_error", obj=error_value, object_ref=None, anchor=title))
            if payload.get("client action"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="client_action", obj=payload["client action"], object_ref=None, anchor=title))
        if "constraint" in title.lower() and name:
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_constraint", obj=name, object_ref=None, anchor=title))
        return docs

    def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Turn bullet/numbered list items into facts via prefix keywords."""
        docs: list[RagDocument] = []
        for line in content.splitlines():
            item = line.strip()
            if not item.startswith(("-", "*", "1.", "2.", "3.")):
                continue
            # Strip the list marker, then dispatch on a "keyword:" prefix.
            normalized = re.sub(r"^[-*0-9. ]+", "", item).strip()
            lowered = normalized.lower()
            if lowered.startswith("metric:"):
                predicate = "emits_metric"
            elif lowered.startswith("event:"):
                predicate = "emits_analytics_event"
            elif lowered.startswith("log:"):
                predicate = "logs_event"
            else:
                predicate = "validates_rule" if "rule" in title.lower() else "client_action"
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title))
        return docs

View File

@@ -0,0 +1,189 @@
from __future__ import annotations
import json
from sqlalchemy import text
from app.modules.rag.contracts import EvidenceLink, RagDocument, RagSource, RagSpan
from app.modules.rag.contracts.enums import RagLayer
class RagCacheRepository:
    """Blob-level cache of embedded RAG chunks, keyed by (repo_id, blob_sha)."""

    def get_cached_documents(self, repo_id: str, blob_sha: str) -> list[RagDocument]:
        """Load the cached chunk documents for one blob, in chunk order.

        Returns an empty list on a cache miss.
        """
        with self._engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT layer, lang, path, title, content, metadata_json, links_json, span_start, span_end,
                        repo_id, commit_sha, embedding::text AS embedding_txt
                    FROM rag_chunk_cache
                    WHERE repo_id = :repo_id AND blob_sha = :blob_sha
                    ORDER BY chunk_index ASC
                    """
                ),
                {"repo_id": repo_id, "blob_sha": blob_sha},
            ).mappings().fetchall()
            docs: list[RagDocument] = []
            for row in rows:
                metadata = self._loads(row.get("metadata_json"))
                docs.append(
                    RagDocument(
                        layer=str(row["layer"]),
                        lang=row.get("lang"),
                        source=RagSource(
                            repo_id=str(row["repo_id"]),
                            commit_sha=row.get("commit_sha"),
                            path=str(row["path"]),
                        ),
                        # Fall back to the path when no title was cached.
                        title=str(row["title"] or row["path"]),
                        text=str(row["content"] or ""),
                        metadata=metadata,
                        links=[EvidenceLink(**item) for item in self._loads(row.get("links_json"), default=[])],
                        span=RagSpan(row.get("span_start"), row.get("span_end")),
                        # embedding::text renders as "[x,y,...]"; parse back to floats.
                        embedding=self._parse_vector(str(row["embedding_txt"] or "")),
                    )
                )
            return docs

    def cache_documents(self, repo_id: str, path: str, blob_sha: str, docs: list[RagDocument]) -> None:
        """Persist *docs* into the blob cache.

        Upserts one rag_blob_cache row (metadata taken from the first chunk),
        then replaces all rag_chunk_cache rows for the blob. No-op for an
        empty *docs* list. Committed as one transaction.
        """
        if not docs:
            return
        with self._engine().connect() as conn:
            # Blob-level metadata is taken from the first chunk's record.
            first = docs[0].to_record()
            first_meta = first["metadata"]
            conn.execute(
                text(
                    """
                    INSERT INTO rag_blob_cache (
                        repo_id, blob_sha, path, artifact_type, section, doc_id, doc_version, owner,
                        system_component, last_modified, staleness_score, layer, lang, metadata_json
                    )
                    VALUES (
                        :repo_id, :blob_sha, :path, :artifact_type, :section, :doc_id, :doc_version, :owner,
                        :system_component, :last_modified, :staleness_score, :layer, :lang, :metadata_json
                    )
                    ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
                        artifact_type = EXCLUDED.artifact_type,
                        section = EXCLUDED.section,
                        doc_id = EXCLUDED.doc_id,
                        doc_version = EXCLUDED.doc_version,
                        owner = EXCLUDED.owner,
                        system_component = EXCLUDED.system_component,
                        last_modified = EXCLUDED.last_modified,
                        staleness_score = EXCLUDED.staleness_score,
                        layer = EXCLUDED.layer,
                        lang = EXCLUDED.lang,
                        metadata_json = EXCLUDED.metadata_json,
                        updated_at = CURRENT_TIMESTAMP
                    """
                ),
                {
                    "repo_id": repo_id,
                    "blob_sha": blob_sha,
                    "path": path,
                    "artifact_type": first_meta.get("artifact_type"),
                    "section": first_meta.get("section") or first_meta.get("section_title"),
                    "doc_id": first_meta.get("doc_id"),
                    "doc_version": first_meta.get("doc_version"),
                    "owner": first_meta.get("owner"),
                    "system_component": first_meta.get("system_component"),
                    "last_modified": first_meta.get("last_modified"),
                    "staleness_score": first_meta.get("staleness_score"),
                    "layer": first["layer"],
                    "lang": first["lang"],
                    "metadata_json": json.dumps(first_meta, ensure_ascii=True),
                },
            )
            # Full replace of the blob's chunks: delete then re-insert in order.
            conn.execute(
                text("DELETE FROM rag_chunk_cache WHERE repo_id = :repo_id AND blob_sha = :blob_sha"),
                {"repo_id": repo_id, "blob_sha": blob_sha},
            )
            for idx, doc in enumerate(docs):
                row = doc.to_record()
                metadata = row["metadata"]
                emb = row["embedding"] or []
                # pgvector literal "[x,y,...]"; NULL when no embedding exists.
                emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_chunk_cache (
                            repo_id, blob_sha, chunk_index, content, embedding, section, layer, lang, path, title,
                            metadata_json, links_json, span_start, span_end, commit_sha
                        )
                        VALUES (
                            :repo_id, :blob_sha, :chunk_index, :content, CAST(:embedding AS vector), :section, :layer,
                            :lang, :path, :title, :metadata_json, :links_json, :span_start, :span_end, :commit_sha
                        )
                        """
                    ),
                    {
                        "repo_id": repo_id,
                        "blob_sha": blob_sha,
                        "chunk_index": idx,
                        "content": row["text"],
                        "embedding": emb_str,
                        "section": metadata.get("section") or metadata.get("section_title"),
                        "layer": row["layer"],
                        "lang": row["lang"],
                        "path": row["path"],
                        "title": row["title"],
                        "metadata_json": json.dumps(metadata, ensure_ascii=True),
                        "links_json": json.dumps(row["links"], ensure_ascii=True),
                        "span_start": row["span_start"],
                        "span_end": row["span_end"],
                        "commit_sha": row["commit_sha"],
                    },
                )
            conn.commit()

    def record_repo_cache(
        self,
        *,
        project_id: str,
        commit_sha: str | None,
        changed_files: list[str],
        summary: str,
    ) -> None:
        """Record a webhook-driven change set as one synthetic chunk per file.

        Each changed path becomes a CODE_SOURCE_CHUNKS document whose text
        embeds the (truncated to 300 chars) change summary, cached under a
        synthetic blob sha derived from commit + path.
        """
        docs: list[RagDocument] = []
        for idx, path in enumerate(changed_files):
            docs.append(
                RagDocument(
                    layer=RagLayer.CODE_SOURCE_CHUNKS,
                    # Only .py files get a language tag here.
                    lang="python" if path.endswith(".py") else None,
                    source=RagSource(project_id, commit_sha, path),
                    title=path,
                    text=f"repo_webhook:{path}:{summary[:300]}",
                    metadata={"chunk_index": idx, "artifact_type": "CODE", "section": "repo_webhook"},
                )
            )
        for doc in docs:
            blob_sha = self._blob_sha(commit_sha, doc.source.path)
            doc.metadata["blob_sha"] = blob_sha
            self.cache_documents(project_id, doc.source.path, blob_sha, [doc])

    def _blob_sha(self, commit_sha: str | None, path: str) -> str:
        """Deterministic synthetic blob id for (commit, path)."""
        from hashlib import sha256

        return sha256(f"{commit_sha or 'no-commit'}:{path}".encode("utf-8")).hexdigest()

    def _engine(self):
        """Lazily resolve the shared SQLAlchemy engine (avoids import cycles)."""
        from app.modules.shared.db import get_engine

        return get_engine()

    def _loads(self, value, default=None):
        """json.loads with an empty-value fallback (default {} unless given)."""
        if default is None:
            default = {}
        if not value:
            return default
        return json.loads(str(value))

    def _parse_vector(self, value: str) -> list[float]:
        """Parse a pgvector text literal "[x,y,...]" into floats ([] if empty)."""
        text_value = value.strip()
        if not text_value:
            return []
        if text_value.startswith("[") and text_value.endswith("]"):
            text_value = text_value[1:-1]
        if not text_value:
            return []
        return [float(part.strip()) for part in text_value.split(",") if part.strip()]

View File

@@ -0,0 +1,122 @@
from __future__ import annotations
import json
from sqlalchemy import text
from app.modules.rag.contracts import RagDocument
class RagDocumentRepository:
    """Writes RagDocument chunks into rag_chunks and rag_session_chunk_map.

    All methods operate on a caller-supplied connection; the caller owns the
    transaction (no commits happen here).
    """

    def replace_documents(self, conn, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Drop every chunk of the session and insert *docs* from scratch."""
        conn.execute(text("DELETE FROM rag_chunks WHERE rag_session_id = :sid"), {"sid": rag_session_id})
        conn.execute(text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid"), {"sid": rag_session_id})
        self.insert_documents(conn, rag_session_id, docs)

    def apply_document_changes(
        self,
        conn,
        rag_session_id: str,
        delete_paths: list[str],
        docs: list[RagDocument],
    ) -> None:
        """Incremental update: remove *delete_paths*, then re-insert *docs*.

        Paths present in *docs* are also deleted first so their chunks are
        fully replaced rather than appended to.
        """
        if delete_paths:
            conn.execute(
                text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                {"sid": rag_session_id, "paths": delete_paths},
            )
            conn.execute(
                text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                {"sid": rag_session_id, "paths": delete_paths},
            )
        if not docs:
            return
        # Replace semantics for upserted paths: clear their old chunks first.
        paths = sorted({doc.source.path for doc in docs})
        conn.execute(
            text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
            {"sid": rag_session_id, "paths": paths},
        )
        conn.execute(
            text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
            {"sid": rag_session_id, "paths": paths},
        )
        self.insert_documents(conn, rag_session_id, docs)

    def insert_documents(self, conn, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Insert each document as one rag_chunks row (plus cache-map row).

        A rag_session_chunk_map row is written only when the document carries
        both a repo_id and a metadata blob_sha (i.e. it is cache-backed).
        """
        for doc in docs:
            row = doc.to_record()
            metadata = row["metadata"]
            links = row["links"]
            emb = row["embedding"] or []
            # pgvector literal "[x,y,...]"; NULL when the doc has no embedding.
            emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
            conn.execute(
                text(
                    """
                    INSERT INTO rag_chunks (
                        rag_session_id, path, chunk_index, content, embedding, artifact_type, section, doc_id,
                        doc_version, owner, system_component, last_modified, staleness_score, created_at, updated_at,
                        rag_doc_id, layer, lang, repo_id, commit_sha, title, metadata_json, links_json, span_start,
                        span_end, symbol_id, qname, kind, framework, entrypoint_type, module_id, section_path, doc_kind
                    )
                    VALUES (
                        :sid, :path, :chunk_index, :content, CAST(:emb AS vector), :artifact_type, :section, :doc_id,
                        :doc_version, :owner, :system_component, :last_modified, :staleness_score, CURRENT_TIMESTAMP,
                        CURRENT_TIMESTAMP, :rag_doc_id, :layer, :lang, :repo_id, :commit_sha, :title, :metadata_json,
                        :links_json, :span_start, :span_end, :symbol_id, :qname, :kind, :framework, :entrypoint_type,
                        :module_id, :section_path, :doc_kind
                    )
                    """
                ),
                {
                    "sid": rag_session_id,
                    "path": row["path"],
                    "chunk_index": int(metadata.get("chunk_index", 0)),
                    "content": row["text"],
                    "emb": emb_str,
                    "artifact_type": metadata.get("artifact_type"),
                    "section": metadata.get("section") or metadata.get("section_title"),
                    "doc_id": metadata.get("doc_id"),
                    "doc_version": metadata.get("doc_version"),
                    "owner": metadata.get("owner"),
                    "system_component": metadata.get("system_component"),
                    "last_modified": metadata.get("last_modified"),
                    "staleness_score": metadata.get("staleness_score"),
                    "rag_doc_id": row["doc_id"],
                    "layer": row["layer"],
                    "lang": row["lang"],
                    "repo_id": row["repo_id"],
                    "commit_sha": row["commit_sha"],
                    "title": row["title"],
                    "metadata_json": json.dumps(metadata, ensure_ascii=True),
                    "links_json": json.dumps(links, ensure_ascii=True),
                    "span_start": row["span_start"],
                    "span_end": row["span_end"],
                    "symbol_id": metadata.get("symbol_id"),
                    "qname": metadata.get("qname"),
                    # Older documents may carry the kind under "type".
                    "kind": metadata.get("kind") or metadata.get("type"),
                    "framework": metadata.get("framework"),
                    "entrypoint_type": metadata.get("entry_type") or metadata.get("entrypoint_type"),
                    "module_id": metadata.get("module_id") or metadata.get("policy_id"),
                    "section_path": metadata.get("section_path"),
                    "doc_kind": metadata.get("doc_kind"),
                },
            )
            repo_id = str(row["repo_id"] or "").strip()
            blob_sha = str(metadata.get("blob_sha") or "").strip()
            if repo_id and blob_sha:
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_session_chunk_map (
                            rag_session_id, repo_id, blob_sha, chunk_index, path
                        ) VALUES (:sid, :repo_id, :blob_sha, :chunk_index, :path)
                        """
                    ),
                    {
                        "sid": rag_session_id,
                        "repo_id": repo_id,
                        "blob_sha": blob_sha,
                        "chunk_index": int(metadata.get("chunk_index", 0)),
                        "path": row["path"],
                    },
                )

View File

@@ -0,0 +1,95 @@
from __future__ import annotations
from dataclasses import dataclass
from sqlalchemy import text
from app.modules.shared.db import get_engine
@dataclass
class RagJobRow:
    """One row of rag_index_jobs, as returned by RagJobRepository.get_job."""

    index_job_id: str
    rag_session_id: str
    status: str
    indexed_files: int
    failed_files: int
    cache_hit_files: int
    cache_miss_files: int
    # Error triple is NULL/None while the job has not failed.
    error_code: str | None
    error_desc: str | None
    error_module: str | None
class RagJobRepository:
    """CRUD helpers for rag_index_jobs; each method commits its own work."""

    def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None:
        """Insert a fresh job row (counters start at the table defaults)."""
        with get_engine().connect() as conn:
            conn.execute(
                text(
                    """
                    INSERT INTO rag_index_jobs (index_job_id, rag_session_id, status)
                    VALUES (:jid, :sid, :status)
                    """
                ),
                {"jid": index_job_id, "sid": rag_session_id, "status": status},
            )
            conn.commit()

    def update_job(
        self,
        index_job_id: str,
        *,
        status: str,
        indexed_files: int,
        failed_files: int,
        cache_hit_files: int = 0,
        cache_miss_files: int = 0,
        error_code: str | None = None,
        error_desc: str | None = None,
        error_module: str | None = None,
    ) -> None:
        """Overwrite the job's status, counters and error triple.

        All keyword values are written unconditionally, so omitted error
        fields reset to NULL.
        """
        with get_engine().connect() as conn:
            conn.execute(
                text(
                    """
                    UPDATE rag_index_jobs
                    SET status = :status,
                        indexed_files = :indexed,
                        failed_files = :failed,
                        cache_hit_files = :cache_hit_files,
                        cache_miss_files = :cache_miss_files,
                        error_code = :ecode,
                        error_desc = :edesc,
                        error_module = :emodule,
                        updated_at = CURRENT_TIMESTAMP
                    WHERE index_job_id = :jid
                    """
                ),
                {
                    "jid": index_job_id,
                    "status": status,
                    "indexed": indexed_files,
                    "failed": failed_files,
                    "cache_hit_files": cache_hit_files,
                    "cache_miss_files": cache_miss_files,
                    "ecode": error_code,
                    "edesc": error_desc,
                    "emodule": error_module,
                },
            )
            conn.commit()

    def get_job(self, index_job_id: str) -> RagJobRow | None:
        """Fetch the job as a RagJobRow, or None when the id is unknown."""
        with get_engine().connect() as conn:
            row = conn.execute(
                text(
                    """
                    SELECT index_job_id, rag_session_id, status, indexed_files, failed_files,
                        cache_hit_files, cache_miss_files, error_code, error_desc, error_module
                    FROM rag_index_jobs
                    WHERE index_job_id = :jid
                    """
                ),
                {"jid": index_job_id},
            ).mappings().fetchone()
            return RagJobRow(**dict(row)) if row else None

View File

@@ -0,0 +1,111 @@
from __future__ import annotations
import json
from sqlalchemy import text
from app.modules.rag.retrieval.query_terms import extract_query_terms
from app.modules.shared.db import get_engine
class RagQueryRepository:
    """Combined lexical + vector retrieval over the rag_chunks table."""

    def retrieve(
        self,
        rag_session_id: str,
        query_embedding: list[float],
        *,
        query_text: str = "",
        limit: int = 5,
        layers: list[str] | None = None,
        path_prefixes: list[str] | None = None,
        prefer_non_tests: bool = False,
    ) -> list[dict]:
        """Rank session chunks for the query and return the top *limit* rows.

        Ordering is lexical rank (best CASE match over extracted query terms),
        then a test-path penalty (when *prefer_non_tests*), then a fixed layer
        priority, then pgvector cosine distance. The SQL is assembled with
        f-strings, but every interpolated fragment is built from constants and
        bound-parameter names — user input only ever enters via parameters.
        """
        # pgvector literal "[x,y,...]" for the query embedding.
        emb = "[" + ",".join(str(x) for x in query_embedding) + "]"
        filters = ["rag_session_id = :sid"]
        params: dict = {"sid": rag_session_id, "emb": emb, "lim": limit}
        if layers:
            filters.append("layer = ANY(:layers)")
            params["layers"] = layers
        if path_prefixes:
            or_filters = []
            for idx, prefix in enumerate(path_prefixes):
                key = f"path_{idx}"
                params[key] = f"{prefix}%"
                or_filters.append(f"path LIKE :{key}")
            filters.append("(" + " OR ".join(or_filters) + ")")
        # One CASE expression per query term; lower numbers = stronger match
        # (exact qname beats exact symbol beats title, then prefix/contains).
        term_filters = []
        terms = extract_query_terms(query_text)
        for idx, term in enumerate(terms):
            exact_key = f"term_exact_{idx}"
            prefix_key = f"term_prefix_{idx}"
            contains_key = f"term_contains_{idx}"
            params[exact_key] = term
            params[prefix_key] = f"{term}%"
            params[contains_key] = f"%{term}%"
            term_filters.append(
                "CASE "
                f"WHEN lower(COALESCE(qname, '')) = :{exact_key} THEN 0 "
                f"WHEN lower(COALESCE(symbol_id, '')) = :{exact_key} THEN 1 "
                f"WHEN lower(COALESCE(title, '')) = :{exact_key} THEN 2 "
                f"WHEN lower(COALESCE(qname, '')) LIKE :{prefix_key} THEN 3 "
                f"WHEN lower(COALESCE(title, '')) LIKE :{prefix_key} THEN 4 "
                f"WHEN lower(COALESCE(path, '')) LIKE :{contains_key} THEN 5 "
                f"WHEN lower(COALESCE(content, '')) LIKE :{contains_key} THEN 6 "
                "ELSE 100 END"
            )
        # Best (lowest) rank across all terms; 100 means "no lexical signal".
        lexical_sql = "LEAST(" + ", ".join(term_filters) + ")" if term_filters else "100"
        test_penalty_sql = (
            "CASE "
            "WHEN lower(path) LIKE 'tests/%' OR lower(path) LIKE '%/tests/%' OR lower(path) LIKE 'test_%' OR lower(path) LIKE '%/test_%' "
            "THEN 1 ELSE 0 END"
            if prefer_non_tests
            else "0"
        )
        # Static layer priority: entrypoints/catalogs first, raw chunks last.
        layer_rank_sql = (
            "CASE "
            "WHEN layer = 'C3_ENTRYPOINTS' THEN 0 "
            "WHEN layer = 'C1_SYMBOL_CATALOG' THEN 1 "
            "WHEN layer = 'C2_DEPENDENCY_GRAPH' THEN 2 "
            "WHEN layer = 'C0_SOURCE_CHUNKS' THEN 3 "
            "WHEN layer = 'D1_MODULE_CATALOG' THEN 0 "
            "WHEN layer = 'D2_FACT_INDEX' THEN 1 "
            "WHEN layer = 'D3_SECTION_INDEX' THEN 2 "
            "WHEN layer = 'D4_POLICY_INDEX' THEN 3 "
            "ELSE 10 END"
        )
        sql = f"""
            SELECT path, content, layer, title, metadata_json, span_start, span_end,
                {lexical_sql} AS lexical_rank,
                {test_penalty_sql} AS test_penalty,
                {layer_rank_sql} AS layer_rank,
                (embedding <=> CAST(:emb AS vector)) AS distance
            FROM rag_chunks
            WHERE {' AND '.join(filters)}
            ORDER BY lexical_rank ASC, test_penalty ASC, layer_rank ASC, embedding <=> CAST(:emb AS vector)
            LIMIT :lim
        """
        with get_engine().connect() as conn:
            rows = conn.execute(text(sql), params).mappings().fetchall()
        return [self._row_to_dict(row) for row in rows]

    def fallback_chunks(self, rag_session_id: str, *, limit: int = 5, layers: list[str] | None = None) -> list[dict]:
        """Return the most recently inserted chunks (no ranking/embedding).

        Used as a degraded path when vector retrieval is unavailable.
        """
        filters = ["rag_session_id = :sid"]
        params: dict = {"sid": rag_session_id, "lim": limit}
        if layers:
            filters.append("layer = ANY(:layers)")
            params["layers"] = layers
        sql = f"""
            SELECT path, content, layer, title, metadata_json, span_start, span_end
            FROM rag_chunks
            WHERE {' AND '.join(filters)}
            ORDER BY id DESC
            LIMIT :lim
        """
        with get_engine().connect() as conn:
            rows = conn.execute(text(sql), params).mappings().fetchall()
        return [self._row_to_dict(row) for row in rows]

    def _row_to_dict(self, row) -> dict:
        """Convert a row mapping to a dict, decoding metadata_json into "metadata"."""
        data = dict(row)
        data["metadata"] = json.loads(str(data.pop("metadata_json") or "{}"))
        return data

View File

@@ -0,0 +1,82 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument
from app.modules.rag.persistence.cache_repository import RagCacheRepository
from app.modules.rag.persistence.document_repository import RagDocumentRepository
from app.modules.rag.persistence.job_repository import RagJobRepository, RagJobRow
from app.modules.rag.persistence.query_repository import RagQueryRepository
from app.modules.rag.persistence.schema_repository import RagSchemaRepository
from app.modules.rag.persistence.session_repository import RagSessionRepository
from app.modules.shared.db import get_engine
class RagRepository:
    """Facade aggregating the RAG persistence sub-repositories behind one API.

    Schema, sessions, jobs, document writes, the blob cache and retrieval are
    each delegated to a dedicated repository; document writes are wrapped in a
    single transaction here.
    """

    def __init__(self) -> None:
        self._schema = RagSchemaRepository()
        self._sessions = RagSessionRepository()
        self._jobs = RagJobRepository()
        self._documents = RagDocumentRepository()
        self._cache = RagCacheRepository()
        self._query = RagQueryRepository()

    def ensure_tables(self) -> None:
        """Create or migrate all RAG tables, columns and indexes."""
        self._schema.ensure_tables()

    def upsert_session(self, rag_session_id: str, project_id: str) -> None:
        """Register the session or rebind it to *project_id*."""
        self._sessions.upsert_session(rag_session_id, project_id)

    def session_exists(self, rag_session_id: str) -> bool:
        """Return True when the session id is known."""
        return self._sessions.session_exists(rag_session_id)

    def get_session(self, rag_session_id: str) -> dict | None:
        """Return the session row as a dict, or None when absent."""
        return self._sessions.get_session(rag_session_id)

    def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None:
        """Insert a new indexing job row."""
        self._jobs.create_job(index_job_id, rag_session_id, status)

    def update_job(self, index_job_id: str, **kwargs) -> None:
        """Update job status/counters; see RagJobRepository.update_job."""
        self._jobs.update_job(index_job_id, **kwargs)

    def get_job(self, index_job_id: str) -> RagJobRow | None:
        """Fetch an indexing job row, or None when unknown."""
        return self._jobs.get_job(index_job_id)

    def replace_documents(self, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Replace all session documents in one committed transaction."""
        with get_engine().connect() as conn:
            self._documents.replace_documents(conn, rag_session_id, docs)
            conn.commit()

    def apply_document_changes(self, rag_session_id: str, delete_paths: list[str], docs: list[RagDocument]) -> None:
        """Apply incremental deletes and upserts in one committed transaction."""
        with get_engine().connect() as conn:
            self._documents.apply_document_changes(conn, rag_session_id, delete_paths, docs)
            conn.commit()

    def get_cached_documents(self, repo_id: str, blob_sha: str) -> list[RagDocument]:
        """Return cached chunk documents for a blob (empty list on miss)."""
        return self._cache.get_cached_documents(repo_id, blob_sha)

    def cache_documents(self, repo_id: str, path: str, blob_sha: str, docs: list[RagDocument]) -> None:
        """Persist embedded chunks for a blob into the cache."""
        self._cache.cache_documents(repo_id, path, blob_sha, docs)

    def record_repo_cache(self, **kwargs) -> None:
        """Proxy to RagCacheRepository.record_repo_cache."""
        self._cache.record_repo_cache(**kwargs)

    def retrieve(
        self,
        rag_session_id: str,
        query_embedding: list[float],
        *,
        query_text: str = "",
        limit: int = 5,
        layers: list[str] | None = None,
        path_prefixes: list[str] | None = None,
        prefer_non_tests: bool = False,
    ) -> list[dict]:
        """Rank and return session chunks for the query embedding.

        ``path_prefixes`` (keyword-only, default ``None``) restricts results
        to paths starting with any given prefix. The underlying
        RagQueryRepository.retrieve already supported this filter; the facade
        previously dropped it — exposing it here is backward-compatible since
        existing callers pass nothing and get the old behavior.
        """
        return self._query.retrieve(
            rag_session_id,
            query_embedding,
            query_text=query_text,
            limit=limit,
            layers=layers,
            path_prefixes=path_prefixes,
            prefer_non_tests=prefer_non_tests,
        )

    def fallback_chunks(self, rag_session_id: str, limit: int = 5, layers: list[str] | None = None) -> list[dict]:
        """Return the most recently inserted chunks (degraded retrieval path)."""
        return self._query.fallback_chunks(rag_session_id, limit=limit, layers=layers)

View File

@@ -0,0 +1,179 @@
from __future__ import annotations
from sqlalchemy import text
from app.modules.shared.db import get_engine
class RagSchemaRepository:
    """Idempotent DDL for the RAG schema (pgvector extension, tables, indexes)."""

    def ensure_tables(self) -> None:
        """Create every RAG table if missing, then apply additive migrations.

        Safe to call repeatedly: all statements use IF NOT EXISTS / ADD COLUMN
        IF NOT EXISTS, and everything commits as one transaction.
        """
        engine = get_engine()
        with engine.connect() as conn:
            # Required for the "vector" column type and <=> operator.
            conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_sessions (
                        rag_session_id VARCHAR(64) PRIMARY KEY,
                        project_id VARCHAR(512) NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_index_jobs (
                        index_job_id VARCHAR(64) PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        status VARCHAR(16) NOT NULL,
                        indexed_files INTEGER NOT NULL DEFAULT 0,
                        failed_files INTEGER NOT NULL DEFAULT 0,
                        cache_hit_files INTEGER NOT NULL DEFAULT 0,
                        cache_miss_files INTEGER NOT NULL DEFAULT 0,
                        error_code VARCHAR(128) NULL,
                        error_desc TEXT NULL,
                        error_module VARCHAR(64) NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_chunks (
                        id BIGSERIAL PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        path TEXT NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        content TEXT NOT NULL,
                        embedding vector NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_blob_cache (
                        id BIGSERIAL PRIMARY KEY,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        path TEXT NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        CONSTRAINT uq_rag_blob_cache UNIQUE (repo_id, blob_sha, path)
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_chunk_cache (
                        id BIGSERIAL PRIMARY KEY,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        content TEXT NOT NULL,
                        embedding vector NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        CONSTRAINT uq_rag_chunk_cache UNIQUE (repo_id, blob_sha, chunk_index)
                    )
                    """
                )
            )
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_session_chunk_map (
                        id BIGSERIAL PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        path TEXT NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            self._ensure_columns(conn)
            self._ensure_indexes(conn)
            conn.commit()

    def _ensure_columns(self, conn) -> None:
        """Additive column migrations (ADD COLUMN IF NOT EXISTS only)."""
        for statement in (
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_id TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_version TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS owner TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS rag_doc_id VARCHAR(128) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS repo_id VARCHAR(512) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS commit_sha VARCHAR(128) NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS title TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS links_json TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS span_start INTEGER NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS span_end INTEGER NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS symbol_id TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS qname TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS kind TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS framework TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS entrypoint_type TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS module_id TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section_path TEXT NULL",
            "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_kind TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS section TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS doc_id TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS doc_version TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS owner TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS system_component TEXT NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL",
            "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS section TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS path TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS title TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS links_json TEXT NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS span_start INTEGER NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS span_end INTEGER NULL",
            "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS commit_sha VARCHAR(128) NULL",
            "ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_hit_files INTEGER NOT NULL DEFAULT 0",
            "ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_miss_files INTEGER NOT NULL DEFAULT 0",
        ):
            conn.execute(text(statement))

    def _ensure_indexes(self, conn) -> None:
        """Create retrieval/cache-lookup indexes (all IF NOT EXISTS)."""
        for statement in (
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks (rag_session_id)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_layer ON rag_chunks (rag_session_id, layer)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_layer_path ON rag_chunks (rag_session_id, layer, path)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_qname ON rag_chunks (qname)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_symbol_id ON rag_chunks (symbol_id)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_module_id ON rag_chunks (module_id)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_doc_kind ON rag_chunks (doc_kind)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunks_entrypoint ON rag_chunks (entrypoint_type, framework)",
            "CREATE INDEX IF NOT EXISTS idx_rag_blob_cache_repo_blob ON rag_blob_cache (repo_id, blob_sha)",
            "CREATE INDEX IF NOT EXISTS idx_rag_chunk_cache_repo_blob ON rag_chunk_cache (repo_id, blob_sha, chunk_index)",
            "CREATE INDEX IF NOT EXISTS idx_rag_session_chunk_map_session ON rag_session_chunk_map (rag_session_id, created_at DESC)",
        ):
            conn.execute(text(statement))

View File

@@ -0,0 +1,37 @@
from __future__ import annotations
from sqlalchemy import text
from app.modules.shared.db import get_engine
class RagSessionRepository:
    """CRUD helpers for the rag_sessions table."""

    def upsert_session(self, rag_session_id: str, project_id: str) -> None:
        """Insert the session, or rebind an existing one to *project_id*."""
        statement = text(
            """
            INSERT INTO rag_sessions (rag_session_id, project_id)
            VALUES (:sid, :pid)
            ON CONFLICT (rag_session_id) DO UPDATE SET project_id = EXCLUDED.project_id
            """
        )
        with get_engine().connect() as conn:
            conn.execute(statement, {"sid": rag_session_id, "pid": project_id})
            conn.commit()

    def session_exists(self, rag_session_id: str) -> bool:
        """Return True when the session id is registered."""
        query = text("SELECT 1 FROM rag_sessions WHERE rag_session_id = :sid")
        with get_engine().connect() as conn:
            found = conn.execute(query, {"sid": rag_session_id}).fetchone()
        return found is not None

    def get_session(self, rag_session_id: str) -> dict | None:
        """Return the session row as a plain dict, or None when absent."""
        query = text("SELECT rag_session_id, project_id FROM rag_sessions WHERE rag_session_id = :sid")
        with get_engine().connect() as conn:
            record = conn.execute(query, {"sid": rag_session_id}).mappings().fetchone()
        return dict(record) if record else None

View File

@@ -0,0 +1,43 @@
from __future__ import annotations
from app.modules.rag.contracts import RagLayer, RetrievalMode
class RagQueryRouter:
    """Heuristic router: pick code-oriented or docs-oriented retrieval layers."""

    # Substrings (Russian and English) that signal a code-centric question.
    _CODE_HINTS = (
        "как работает код",
        "explain code",
        "explain the code",
        "по коду",
        "из кода",
        "построй документацию по коду",
        "документацию по коду",
        "where is implemented",
        "где реализовано",
        "endpoint",
        "handler",
        "symbol",
        "function",
        "class",
        "method",
    )
    _DOCS_LAYERS = [
        RagLayer.DOCS_MODULE_CATALOG,
        RagLayer.DOCS_FACT_INDEX,
        RagLayer.DOCS_SECTION_INDEX,
        RagLayer.DOCS_POLICY_INDEX,
    ]
    _CODE_LAYERS = [
        RagLayer.CODE_ENTRYPOINTS,
        RagLayer.CODE_SYMBOL_CATALOG,
        RagLayer.CODE_DEPENDENCY_GRAPH,
        RagLayer.CODE_SOURCE_CHUNKS,
    ]

    def resolve_mode(self, query: str) -> str:
        """Return RetrievalMode.CODE when any code hint occurs in the query."""
        normalized = query.lower()
        for hint in self._CODE_HINTS:
            if hint in normalized:
                return RetrievalMode.CODE
        return RetrievalMode.DOCS

    def layers_for_mode(self, mode: str) -> list[str]:
        """Return a fresh, ordered list of retrieval layers for *mode*."""
        if mode == RetrievalMode.CODE:
            return list(self._CODE_LAYERS)
        return list(self._DOCS_LAYERS)

View File

@@ -0,0 +1,45 @@
from __future__ import annotations
import re
def extract_query_terms(query_text: str) -> list[str]:
    """Extract up to six normalized search terms from a free-text query.

    Latin identifiers (3+ chars) are expanded into casing variants, then
    intent-based keywords are appended; first occurrence wins, order kept.
    """
    ordered: dict[str, None] = {}
    for token in re.findall(r"[A-Za-z_][A-Za-z0-9_]{2,}", query_text or ""):
        for variant in _identifier_variants(token):
            ordered.setdefault(variant, None)
    for variant in _intent_variants(query_text):
        ordered.setdefault(variant, None)
    return list(ordered)[:6]
def _identifier_variants(term: str) -> list[str]:
    """Return ordered, de-duplicated casing variants of one identifier.

    Yields: lowercase form, snake_case form, a ``*_manager`` split for
    ``...manager`` suffixed terms, and the underscore-free compact form.
    """
    lowered = term.lower()
    variants = [lowered]

    def _add(candidate: str) -> None:
        # Append only non-empty, unseen candidates, preserving order.
        if candidate and candidate not in variants:
            variants.append(candidate)

    snake = _camel_to_snake(term)
    _add(snake)
    suffix = "manager"
    if lowered.endswith(suffix) and len(lowered) > len(suffix):
        _add(lowered[: -len(suffix)] + "_manager")
    _add(snake.replace("_", "") if snake else "")
    return variants
def _camel_to_snake(term: str) -> str:
first = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", term)
return re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", first).lower()
def _intent_variants(query_text: str) -> list[str]:
lowered = (query_text or "").lower()
variants: list[str] = []
if any(token in lowered for token in ("управ", "control", "manage", "management")):
variants.extend(["control", "management", "start", "stop", "status"])
return variants

View File

@@ -0,0 +1,197 @@
from __future__ import annotations
import asyncio
import hashlib
import os
from collections.abc import Awaitable, Callable
from inspect import isawaitable
from app.modules.rag.contracts import RagDocument
from app.modules.rag.indexing.code.pipeline import CodeIndexingPipeline
from app.modules.rag.indexing.common.report import IndexReport
from app.modules.rag.indexing.docs.pipeline import DocsIndexingPipeline
from app.modules.rag.persistence.repository import RagRepository
from app.modules.rag.retrieval.query_router import RagQueryRouter
from app.modules.rag_session.embedding.gigachat_embedder import GigaChatEmbedder
class RagService:
    def __init__(
        self,
        embedder: GigaChatEmbedder,
        repository: RagRepository,
        chunker=None,
    ) -> None:
        """Wire the service with its embedder, repository and pipelines.

        ``chunker`` is accepted but never stored or used here — presumably
        kept for backward compatibility with older call sites; TODO confirm.
        """
        self._embedder = embedder
        self._repo = repository
        self._docs = DocsIndexingPipeline()
        self._code = CodeIndexingPipeline()
        self._queries = RagQueryRouter()
    async def index_snapshot(
        self,
        rag_session_id: str,
        files: list[dict],
        progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None,
    ) -> tuple[int, int, int, int]:
        """Index a full snapshot, replacing all documents of the session.

        Returns the report as a 4-int tuple — exact field order is defined
        by the report's ``as_tuple`` implementation; confirm there.
        """
        report = await self._index_files(rag_session_id, files, progress_cb=progress_cb)
        self._repo.replace_documents(rag_session_id, report.documents_list)
        return report.as_tuple()
async def index_changes(
self,
rag_session_id: str,
changed_files: list[dict],
progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None,
) -> tuple[int, int, int, int]:
delete_paths: list[str] = []
upserts: list[dict] = []
for item in changed_files:
if str(item.get("op")) == "delete":
delete_paths.append(str(item.get("path", "")))
else:
upserts.append(item)
report = await self._index_files(rag_session_id, upserts, progress_cb=progress_cb)
self._repo.apply_document_changes(rag_session_id, delete_paths, report.documents_list)
return report.as_tuple()
    async def retrieve(self, rag_session_id: str, query: str) -> list[dict]:
        """Retrieve context rows for *query* with layered fallbacks.

        Flow: route the query to code or docs layers, embed it and run
        vector+lexical retrieval; on any embedding/retrieval error degrade to
        the newest chunks of the same layers; if a code-mode query still found
        nothing, retry the fallback with the docs layers.
        """
        mode = self._queries.resolve_mode(query)
        layers = self._queries.layers_for_mode(mode)
        # NOTE(review): compares against the literals "code"/"docs" while
        # resolve_mode returns RetrievalMode members — equivalent only if
        # RetrievalMode is a str-valued enum with these exact values; confirm.
        prefer_non_tests = mode == "code" and "test" not in query.lower() and "тест" not in query.lower()
        try:
            query_embedding = self._embedder.embed([query])[0]
            rows = self._repo.retrieve(
                rag_session_id,
                query_embedding,
                query_text=query,
                limit=8,
                layers=layers,
                prefer_non_tests=prefer_non_tests,
            )
        except Exception:
            # Deliberate best-effort: any embedder/DB failure degrades to the
            # recency-ordered fallback instead of failing the caller.
            rows = self._repo.fallback_chunks(rag_session_id, limit=8, layers=layers)
        if not rows and mode != "docs":
            rows = self._repo.fallback_chunks(rag_session_id, limit=8, layers=self._queries.layers_for_mode("docs"))
        return [
            {
                "source": row["path"],
                "content": row["content"],
                "layer": row.get("layer"),
                "title": row.get("title"),
                "metadata": row.get("metadata", {}),
                # Vector distance (lower = closer); absent on fallback rows.
                "score": row.get("distance"),
            }
            for row in rows
        ]
    async def _index_files(
        self,
        rag_session_id: str,
        files: list[dict],
        progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None,
    ) -> "_PipelineReport":
        """Build (or load from cache) documents for every file in *files*.

        Per-file failures are recorded as warnings and counted instead of
        aborting the run; progress is reported after every file, including
        failed ones. Blocking repo/embedding calls run off the event loop.
        """
        total_files = len(files)
        report = _PipelineReport()
        repo_id = self._resolve_repo_id(rag_session_id)
        for index, file in enumerate(files, start=1):
            path = str(file.get("path", ""))
            try:
                blob_sha = self._blob_sha(file)
                # Cache is keyed by (repo, blob): unchanged content skips embedding.
                cached = await asyncio.to_thread(self._repo.get_cached_documents, repo_id, blob_sha)
                if cached:
                    report.documents_list.extend(self._with_file_metadata(cached, file, repo_id, blob_sha))
                    report.cache_hit_files += 1
                else:
                    built = self._build_documents(repo_id, path, file)
                    # Embedding is heavy (network/CPU) — keep it off the loop thread.
                    embedded = await asyncio.to_thread(self._embed_documents, built, file, repo_id, blob_sha)
                    report.documents_list.extend(embedded)
                    await asyncio.to_thread(self._repo.cache_documents, repo_id, path, blob_sha, embedded)
                    report.cache_miss_files += 1
                report.indexed_files += 1
            except Exception as exc:
                report.failed_files += 1
                report.warnings.append(f"{path}: {exc}")
            await self._notify_progress(progress_cb, index, total_files, path)
        report.documents = len(report.documents_list)
        return report
def _build_documents(self, repo_id: str, path: str, file: dict) -> list[RagDocument]:
content = str(file.get("content") or "")
commit_sha = file.get("commit_sha")
docs: list[RagDocument] = []
if self._docs.supports(path):
docs.extend(self._docs.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content))
if self._code.supports(path):
docs.extend(self._code.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content))
if not docs:
docs.extend(self._docs.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content))
return docs
def _embed_documents(self, docs: list[RagDocument], file: dict, repo_id: str, blob_sha: str) -> list[RagDocument]:
if not docs:
return []
batch_size = max(1, int(os.getenv("RAG_EMBED_BATCH_SIZE", "16")))
metadata = self._document_metadata(file, repo_id, blob_sha)
for doc in docs:
doc.metadata.update(metadata)
for start in range(0, len(docs), batch_size):
batch = docs[start : start + batch_size]
vectors = self._embedder.embed([doc.text for doc in batch])
for doc, vector in zip(batch, vectors):
doc.embedding = vector
return docs
def _with_file_metadata(self, docs: list[RagDocument], file: dict, repo_id: str, blob_sha: str) -> list[RagDocument]:
metadata = self._document_metadata(file, repo_id, blob_sha)
for doc in docs:
doc.metadata.update(metadata)
doc.source.repo_id = repo_id
doc.source.path = str(file.get("path", doc.source.path))
return docs
def _document_metadata(self, file: dict, repo_id: str, blob_sha: str) -> dict:
return {
"blob_sha": blob_sha,
"repo_id": repo_id,
"artifact_type": file.get("artifact_type"),
"section": file.get("section"),
"doc_id": file.get("doc_id"),
"doc_version": file.get("doc_version"),
"owner": file.get("owner"),
"system_component": file.get("system_component"),
"last_modified": file.get("last_modified"),
"staleness_score": file.get("staleness_score"),
}
def _resolve_repo_id(self, rag_session_id: str) -> str:
session = self._repo.get_session(rag_session_id)
if not session:
return rag_session_id
return str(session.get("project_id") or rag_session_id)
def _blob_sha(self, file: dict) -> str:
raw = str(file.get("content_hash") or "").strip()
if raw:
return raw
content = str(file.get("content") or "")
return hashlib.sha256(content.encode("utf-8")).hexdigest()
async def _notify_progress(
self,
progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None,
current_file_index: int,
total_files: int,
current_file_name: str,
) -> None:
if not progress_cb:
return
result = progress_cb(current_file_index, total_files, current_file_name)
if isawaitable(result):
await result
class _PipelineReport(IndexReport):
    """IndexReport extended with the concrete documents gathered while indexing."""

    def __init__(self) -> None:
        super().__init__()
        # Collected documents; the numeric `documents` counter on the base
        # report is derived from len(documents_list) at the end of a run.
        self.documents_list: list[RagDocument] = []

View File

@@ -3,8 +3,8 @@ from __future__ import annotations
from fastapi import APIRouter, Request
from app.modules.agent.story_context_repository import StoryContextRepository
from app.modules.rag.persistence.repository import RagRepository
from app.modules.rag_repo.webhook_service import RepoWebhookService
from app.modules.rag_session.repository import RagRepository
class RagRepoModule:

View File

@@ -1,660 +1,4 @@
from __future__ import annotations from app.modules.rag.persistence.job_repository import RagJobRow
from app.modules.rag.persistence.repository import RagRepository
from dataclasses import dataclass __all__ = ["RagJobRow", "RagRepository"]
import hashlib
from sqlalchemy import text
from app.modules.shared.db import get_engine
@dataclass
class RagJobRow:
    """Row projection of a record in the rag_index_jobs table."""

    index_job_id: str
    rag_session_id: str
    status: str
    # Counters accumulated over the indexing run.
    indexed_files: int
    failed_files: int
    cache_hit_files: int
    cache_miss_files: int
    # Error details; populated only when the job failed.
    error_code: str | None
    error_desc: str | None
    error_module: str | None
class RagRepository:
def ensure_tables(self) -> None:
engine = get_engine()
with engine.connect() as conn:
conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
conn.execute(
text(
"""
CREATE TABLE IF NOT EXISTS rag_sessions (
rag_session_id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(512) NOT NULL,
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
)
"""
)
)
conn.execute(
text(
"""
CREATE TABLE IF NOT EXISTS rag_index_jobs (
index_job_id VARCHAR(64) PRIMARY KEY,
rag_session_id VARCHAR(64) NOT NULL,
status VARCHAR(16) NOT NULL,
indexed_files INTEGER NOT NULL DEFAULT 0,
failed_files INTEGER NOT NULL DEFAULT 0,
cache_hit_files INTEGER NOT NULL DEFAULT 0,
cache_miss_files INTEGER NOT NULL DEFAULT 0,
error_code VARCHAR(128) NULL,
error_desc TEXT NULL,
error_module VARCHAR(64) NULL,
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
)
"""
)
)
conn.execute(
text(
"""
CREATE TABLE IF NOT EXISTS rag_chunks (
id BIGSERIAL PRIMARY KEY,
rag_session_id VARCHAR(64) NOT NULL,
path TEXT NOT NULL,
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
embedding vector NULL,
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
)
"""
)
)
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL"))
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section TEXT NULL"))
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_id TEXT NULL"))
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_version TEXT NULL"))
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS owner TEXT NULL"))
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL"))
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL"))
conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL"))
conn.execute(
text(
"""
ALTER TABLE rag_chunks
ADD COLUMN IF NOT EXISTS created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
"""
)
)
conn.execute(
text(
"""
ALTER TABLE rag_chunks
ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
"""
)
)
conn.execute(
text(
"""
CREATE TABLE IF NOT EXISTS rag_blob_cache (
id BIGSERIAL PRIMARY KEY,
repo_id VARCHAR(512) NOT NULL,
blob_sha VARCHAR(128) NOT NULL,
path TEXT NOT NULL,
artifact_type VARCHAR(16) NULL,
section TEXT NULL,
doc_id TEXT NULL,
doc_version TEXT NULL,
owner TEXT NULL,
system_component TEXT NULL,
last_modified TIMESTAMPTZ NULL,
staleness_score DOUBLE PRECISION NULL,
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT uq_rag_blob_cache UNIQUE (repo_id, blob_sha, path)
)
"""
)
)
conn.execute(
text(
"""
CREATE TABLE IF NOT EXISTS rag_chunk_cache (
id BIGSERIAL PRIMARY KEY,
repo_id VARCHAR(512) NOT NULL,
blob_sha VARCHAR(128) NOT NULL,
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
embedding vector NULL,
section TEXT NULL,
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT uq_rag_chunk_cache UNIQUE (repo_id, blob_sha, chunk_index)
)
"""
)
)
conn.execute(
text(
"""
CREATE TABLE IF NOT EXISTS rag_session_chunk_map (
id BIGSERIAL PRIMARY KEY,
rag_session_id VARCHAR(64) NOT NULL,
repo_id VARCHAR(512) NOT NULL,
blob_sha VARCHAR(128) NOT NULL,
chunk_index INTEGER NOT NULL,
path TEXT NOT NULL,
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
)
"""
)
)
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks (rag_session_id)"))
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_artifact_type ON rag_chunks (artifact_type)"))
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_doc ON rag_chunks (doc_id, doc_version)"))
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_component ON rag_chunks (system_component)"))
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_path ON rag_chunks (path)"))
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_blob_cache_repo_blob ON rag_blob_cache (repo_id, blob_sha)"))
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunk_cache_repo_blob ON rag_chunk_cache (repo_id, blob_sha, chunk_index)"))
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_session_chunk_map_session ON rag_session_chunk_map (rag_session_id, created_at DESC)"))
conn.execute(text("ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_hit_files INTEGER NOT NULL DEFAULT 0"))
conn.execute(text("ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_miss_files INTEGER NOT NULL DEFAULT 0"))
conn.commit()
    def upsert_session(self, rag_session_id: str, project_id: str) -> None:
        """Create the session row, or re-point an existing one at *project_id*."""
        with get_engine().connect() as conn:
            conn.execute(
                text(
                    """
                    INSERT INTO rag_sessions (rag_session_id, project_id)
                    VALUES (:sid, :pid)
                    ON CONFLICT (rag_session_id) DO UPDATE SET project_id = EXCLUDED.project_id
                    """
                ),
                {"sid": rag_session_id, "pid": project_id},
            )
            conn.commit()
    def session_exists(self, rag_session_id: str) -> bool:
        """Return True when a rag_sessions row exists for *rag_session_id*."""
        with get_engine().connect() as conn:
            row = conn.execute(
                text("SELECT 1 FROM rag_sessions WHERE rag_session_id = :sid"),
                {"sid": rag_session_id},
            ).fetchone()
            return bool(row)
    def get_session(self, rag_session_id: str) -> dict | None:
        """Fetch the session row as a plain dict, or None when absent."""
        with get_engine().connect() as conn:
            row = conn.execute(
                text("SELECT rag_session_id, project_id FROM rag_sessions WHERE rag_session_id = :sid"),
                {"sid": rag_session_id},
            ).mappings().fetchone()
            return dict(row) if row else None
    def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None:
        """Insert a new rag_index_jobs row; counters start at their column defaults."""
        with get_engine().connect() as conn:
            conn.execute(
                text(
                    """
                    INSERT INTO rag_index_jobs (index_job_id, rag_session_id, status)
                    VALUES (:jid, :sid, :status)
                    """
                ),
                {"jid": index_job_id, "sid": rag_session_id, "status": status},
            )
            conn.commit()
    def update_job(
        self,
        index_job_id: str,
        *,
        status: str,
        indexed_files: int,
        failed_files: int,
        cache_hit_files: int = 0,
        cache_miss_files: int = 0,
        error_code: str | None = None,
        error_desc: str | None = None,
        error_module: str | None = None,
    ) -> None:
        """Overwrite all mutable columns of a job row (keyword-only arguments).

        Note: every column is written on each call — omitted error fields
        reset to NULL, and updated_at is bumped to the current timestamp.
        """
        with get_engine().connect() as conn:
            conn.execute(
                text(
                    """
                    UPDATE rag_index_jobs
                    SET status = :status,
                        indexed_files = :indexed,
                        failed_files = :failed,
                        cache_hit_files = :cache_hit_files,
                        cache_miss_files = :cache_miss_files,
                        error_code = :ecode,
                        error_desc = :edesc,
                        error_module = :emodule,
                        updated_at = CURRENT_TIMESTAMP
                    WHERE index_job_id = :jid
                    """
                ),
                {
                    "jid": index_job_id,
                    "status": status,
                    "indexed": indexed_files,
                    "failed": failed_files,
                    "cache_hit_files": cache_hit_files,
                    "cache_miss_files": cache_miss_files,
                    "ecode": error_code,
                    "edesc": error_desc,
                    "emodule": error_module,
                },
            )
            conn.commit()
    def get_job(self, index_job_id: str) -> RagJobRow | None:
        """Load a job row into a RagJobRow, or None when the id is unknown."""
        with get_engine().connect() as conn:
            row = conn.execute(
                text(
                    """
                    SELECT index_job_id, rag_session_id, status, indexed_files, failed_files,
                           cache_hit_files, cache_miss_files, error_code, error_desc, error_module
                    FROM rag_index_jobs
                    WHERE index_job_id = :jid
                    """
                ),
                {"jid": index_job_id},
            ).mappings().fetchone()
            if not row:
                return None
            # Selected column names match the dataclass fields one-to-one.
            return RagJobRow(**dict(row))
    def replace_chunks(self, rag_session_id: str, items: list[dict]) -> None:
        """Replace all chunks (and chunk-map rows) for a session in one transaction."""
        with get_engine().connect() as conn:
            # Wipe previous state before inserting the new snapshot.
            conn.execute(text("DELETE FROM rag_chunks WHERE rag_session_id = :sid"), {"sid": rag_session_id})
            conn.execute(text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid"), {"sid": rag_session_id})
            self._insert_chunks(conn, rag_session_id, items)
            conn.commit()
    def apply_changes(self, rag_session_id: str, delete_paths: list[str], upserts: list[dict]) -> None:
        """Incrementally update a session: delete by path, then re-insert upserted paths.

        Upserts are implemented as delete-then-insert over the distinct set of
        affected paths, so stale chunk rows for a re-indexed file cannot survive.
        """
        with get_engine().connect() as conn:
            if delete_paths:
                conn.execute(
                    text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                    {"sid": rag_session_id, "paths": delete_paths},
                )
                conn.execute(
                    text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                    {"sid": rag_session_id, "paths": delete_paths},
                )
            if upserts:
                paths = sorted({str(x["path"]) for x in upserts})
                conn.execute(
                    text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                    {"sid": rag_session_id, "paths": paths},
                )
                conn.execute(
                    text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                    {"sid": rag_session_id, "paths": paths},
                )
                self._insert_chunks(conn, rag_session_id, upserts)
            conn.commit()
    def get_cached_chunks(self, repo_id: str, blob_sha: str) -> list[dict]:
        """Load cached chunks for a blob, decoding the pgvector text form back to floats."""
        with get_engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT chunk_index, content, embedding::text AS embedding_txt, section
                    FROM rag_chunk_cache
                    WHERE repo_id = :repo_id AND blob_sha = :blob_sha
                    ORDER BY chunk_index ASC
                    """
                ),
                {"repo_id": repo_id, "blob_sha": blob_sha},
            ).mappings().fetchall()
            output: list[dict] = []
            for row in rows:
                output.append(
                    {
                        "chunk_index": int(row["chunk_index"]),
                        "content": str(row["content"] or ""),
                        # embedding was cast to text in SQL; parse it back here.
                        "embedding": self._parse_vector(str(row["embedding_txt"] or "")),
                        "section": row.get("section"),
                    }
                )
            return output
def record_repo_cache(
self,
*,
project_id: str,
commit_sha: str | None,
changed_files: list[str],
summary: str,
) -> None:
repo_session_id = f"repo:{project_id}"
with get_engine().connect() as conn:
for path in changed_files:
key = f"{commit_sha or 'no-commit'}:{path}"
blob_sha = hashlib.sha256(key.encode("utf-8")).hexdigest()
conn.execute(
text(
"""
INSERT INTO rag_blob_cache (
repo_id,
blob_sha,
path,
artifact_type,
section
)
VALUES (
:repo_id,
:blob_sha,
:path,
:artifact_type,
:section
)
ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
updated_at = CURRENT_TIMESTAMP
"""
),
{
"repo_id": project_id,
"blob_sha": blob_sha,
"path": path,
"artifact_type": "CODE",
"section": "repo_webhook",
},
)
conn.execute(
text(
"""
INSERT INTO rag_chunk_cache (
repo_id,
blob_sha,
chunk_index,
content,
embedding,
section
)
VALUES (
:repo_id,
:blob_sha,
0,
:content,
NULL,
:section
)
ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET
content = EXCLUDED.content,
section = EXCLUDED.section,
updated_at = CURRENT_TIMESTAMP
"""
),
{
"repo_id": project_id,
"blob_sha": blob_sha,
"content": f"repo_webhook:{path}:{summary[:300]}",
"section": "repo_webhook",
},
)
conn.execute(
text(
"""
INSERT INTO rag_session_chunk_map (
rag_session_id,
repo_id,
blob_sha,
chunk_index,
path
)
VALUES (
:rag_session_id,
:repo_id,
:blob_sha,
0,
:path
)
"""
),
{
"rag_session_id": repo_session_id,
"repo_id": project_id,
"blob_sha": blob_sha,
"path": path,
},
)
conn.commit()
def cache_file_chunks(self, repo_id: str, path: str, blob_sha: str, items: list[dict]) -> None:
if not items:
return
meta = items[0]
with get_engine().connect() as conn:
conn.execute(
text(
"""
INSERT INTO rag_blob_cache (
repo_id,
blob_sha,
path,
artifact_type,
section,
doc_id,
doc_version,
owner,
system_component,
last_modified,
staleness_score
)
VALUES (
:repo_id,
:blob_sha,
:path,
:artifact_type,
:section,
:doc_id,
:doc_version,
:owner,
:system_component,
:last_modified,
:staleness_score
)
ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
artifact_type = EXCLUDED.artifact_type,
section = EXCLUDED.section,
doc_id = EXCLUDED.doc_id,
doc_version = EXCLUDED.doc_version,
owner = EXCLUDED.owner,
system_component = EXCLUDED.system_component,
last_modified = EXCLUDED.last_modified,
staleness_score = EXCLUDED.staleness_score,
updated_at = CURRENT_TIMESTAMP
"""
),
{
"repo_id": repo_id,
"blob_sha": blob_sha,
"path": path,
"artifact_type": meta.get("artifact_type"),
"section": meta.get("section"),
"doc_id": meta.get("doc_id"),
"doc_version": meta.get("doc_version"),
"owner": meta.get("owner"),
"system_component": meta.get("system_component"),
"last_modified": meta.get("last_modified"),
"staleness_score": meta.get("staleness_score"),
},
)
for item in items:
emb = item.get("embedding") or []
emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
conn.execute(
text(
"""
INSERT INTO rag_chunk_cache (
repo_id,
blob_sha,
chunk_index,
content,
embedding,
section
)
VALUES (
:repo_id,
:blob_sha,
:chunk_index,
:content,
CAST(:embedding AS vector),
:section
)
ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET
content = EXCLUDED.content,
embedding = EXCLUDED.embedding,
section = EXCLUDED.section,
updated_at = CURRENT_TIMESTAMP
"""
),
{
"repo_id": repo_id,
"blob_sha": blob_sha,
"chunk_index": int(item["chunk_index"]),
"content": item["content"],
"embedding": emb_str,
"section": item.get("section"),
},
)
conn.commit()
    def retrieve(self, rag_session_id: str, query_embedding: list[float], limit: int = 5) -> list[dict]:
        """Nearest-neighbour search over the session's chunks using pgvector.

        Rows are ordered by the `<=>` distance operator between the stored
        embedding and *query_embedding*; smallest distances come first.
        """
        # Serialize the embedding into pgvector's textual input format "[x,y,...]".
        emb = "[" + ",".join(str(x) for x in query_embedding) + "]"
        with get_engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT path, content
                    FROM rag_chunks
                    WHERE rag_session_id = :sid
                    ORDER BY embedding <=> CAST(:emb AS vector)
                    LIMIT :lim
                    """
                ),
                {"sid": rag_session_id, "emb": emb, "lim": limit},
            ).mappings().fetchall()
            return [dict(x) for x in rows]
    def fallback_chunks(self, rag_session_id: str, limit: int = 5) -> list[dict]:
        """Most recently inserted chunks — used when vector search is unavailable."""
        with get_engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT path, content
                    FROM rag_chunks
                    WHERE rag_session_id = :sid
                    ORDER BY id DESC
                    LIMIT :lim
                    """
                ),
                {"sid": rag_session_id, "lim": limit},
            ).mappings().fetchall()
            return [dict(x) for x in rows]
def _insert_chunks(self, conn, rag_session_id: str, items: list[dict]) -> None:
for item in items:
emb = item.get("embedding") or []
emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
conn.execute(
text(
"""
INSERT INTO rag_chunks (
rag_session_id,
path,
chunk_index,
content,
embedding,
artifact_type,
section,
doc_id,
doc_version,
owner,
system_component,
last_modified,
staleness_score,
created_at,
updated_at
)
VALUES (
:sid,
:path,
:idx,
:content,
CAST(:emb AS vector),
:artifact_type,
:section,
:doc_id,
:doc_version,
:owner,
:system_component,
:last_modified,
:staleness_score,
CURRENT_TIMESTAMP,
CURRENT_TIMESTAMP
)
"""
),
{
"sid": rag_session_id,
"path": item["path"],
"idx": int(item["chunk_index"]),
"content": item["content"],
"emb": emb_str,
"artifact_type": item.get("artifact_type"),
"section": item.get("section"),
"doc_id": item.get("doc_id"),
"doc_version": item.get("doc_version"),
"owner": item.get("owner"),
"system_component": item.get("system_component"),
"last_modified": item.get("last_modified"),
"staleness_score": item.get("staleness_score"),
},
)
repo_id = str(item.get("repo_id") or "").strip()
blob_sha = str(item.get("blob_sha") or "").strip()
if repo_id and blob_sha:
conn.execute(
text(
"""
INSERT INTO rag_session_chunk_map (
rag_session_id,
repo_id,
blob_sha,
chunk_index,
path
) VALUES (
:sid,
:repo_id,
:blob_sha,
:chunk_index,
:path
)
"""
),
{
"sid": rag_session_id,
"repo_id": repo_id,
"blob_sha": blob_sha,
"chunk_index": int(item["chunk_index"]),
"path": item["path"],
},
)
def _parse_vector(self, value: str) -> list[float]:
text_value = value.strip()
if not text_value:
return []
if text_value.startswith("[") and text_value.endswith("]"):
text_value = text_value[1:-1]
if not text_value:
return []
return [float(part.strip()) for part in text_value.split(",") if part.strip()]

View File

@@ -1,211 +1,3 @@
import asyncio from app.modules.rag.services.rag_service import RagService
import hashlib
import os
from collections.abc import Awaitable, Callable
from inspect import isawaitable
from app.modules.rag_session.embedding.gigachat_embedder import GigaChatEmbedder __all__ = ["RagService"]
from app.modules.rag_session.repository import RagRepository
from app.modules.rag_session.retrieval.chunker import TextChunker
class RagService:
def __init__(
self,
embedder: GigaChatEmbedder,
repository: RagRepository,
chunker: TextChunker | None = None,
) -> None:
self._embedder = embedder
self._repo = repository
self._chunker = chunker or TextChunker()
async def index_snapshot(
self,
rag_session_id: str,
files: list[dict],
progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None,
) -> tuple[int, int, int, int]:
total_files = len(files)
indexed_files = 0
failed_files = 0
cache_hit_files = 0
cache_miss_files = 0
all_chunks: list[dict] = []
repo_id = self._resolve_repo_id(rag_session_id)
for index, file in enumerate(files, start=1):
path = str(file.get("path", ""))
try:
blob_sha = self._blob_sha(file)
cached = await asyncio.to_thread(self._repo.get_cached_chunks, repo_id, blob_sha)
if cached:
all_chunks.extend(self._build_cached_items(path, file, repo_id, blob_sha, cached))
cache_hit_files += 1
else:
chunks = self._build_chunks_for_file(file)
embedded_chunks = await asyncio.to_thread(self._embed_chunks, chunks, file, repo_id, blob_sha)
all_chunks.extend(embedded_chunks)
await asyncio.to_thread(self._repo.cache_file_chunks, repo_id, path, blob_sha, embedded_chunks)
cache_miss_files += 1
indexed_files += 1
except Exception:
failed_files += 1
await self._notify_progress(progress_cb, index, total_files, path)
await asyncio.to_thread(self._repo.replace_chunks, rag_session_id, all_chunks)
return indexed_files, failed_files, cache_hit_files, cache_miss_files
async def index_changes(
self,
rag_session_id: str,
changed_files: list[dict],
progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None,
) -> tuple[int, int, int, int]:
total_files = len(changed_files)
indexed_files = 0
failed_files = 0
cache_hit_files = 0
cache_miss_files = 0
delete_paths: list[str] = []
upsert_chunks: list[dict] = []
repo_id = self._resolve_repo_id(rag_session_id)
for index, file in enumerate(changed_files, start=1):
path = str(file.get("path", ""))
op = str(file.get("op", ""))
try:
if op == "delete":
delete_paths.append(path)
indexed_files += 1
await self._notify_progress(progress_cb, index, total_files, path)
continue
if op == "upsert" and file.get("content") is not None:
blob_sha = self._blob_sha(file)
cached = await asyncio.to_thread(self._repo.get_cached_chunks, repo_id, blob_sha)
if cached:
upsert_chunks.extend(self._build_cached_items(path, file, repo_id, blob_sha, cached))
cache_hit_files += 1
else:
chunks = self._build_chunks_for_file(file)
embedded_chunks = await asyncio.to_thread(self._embed_chunks, chunks, file, repo_id, blob_sha)
upsert_chunks.extend(embedded_chunks)
await asyncio.to_thread(self._repo.cache_file_chunks, repo_id, path, blob_sha, embedded_chunks)
cache_miss_files += 1
indexed_files += 1
await self._notify_progress(progress_cb, index, total_files, path)
continue
failed_files += 1
except Exception:
failed_files += 1
await self._notify_progress(progress_cb, index, total_files, path)
await asyncio.to_thread(
self._repo.apply_changes,
rag_session_id,
delete_paths,
upsert_chunks,
)
return indexed_files, failed_files, cache_hit_files, cache_miss_files
async def retrieve(self, rag_session_id: str, query: str) -> list[dict]:
try:
query_embedding = self._embedder.embed([query])[0]
rows = self._repo.retrieve(rag_session_id, query_embedding, limit=5)
except Exception:
rows = self._repo.fallback_chunks(rag_session_id, limit=5)
return [{"source": row["path"], "content": row["content"]} for row in rows]
def _build_chunks_for_file(self, file: dict) -> list[tuple[str, int, str]]:
path = str(file.get("path", ""))
content = str(file.get("content", ""))
output: list[tuple[str, int, str]] = []
for idx, chunk in enumerate(self._chunker.chunk(content)):
output.append((path, idx, chunk))
return output
def _embed_chunks(self, raw_chunks: list[tuple[str, int, str]], file: dict, repo_id: str, blob_sha: str) -> list[dict]:
if not raw_chunks:
return []
batch_size = max(1, int(os.getenv("RAG_EMBED_BATCH_SIZE", "16")))
metadata = self._chunk_metadata(file)
indexed: list[dict] = []
for i in range(0, len(raw_chunks), batch_size):
batch = raw_chunks[i : i + batch_size]
texts = [x[2] for x in batch]
vectors = self._embedder.embed(texts)
for (path, chunk_index, content), vector in zip(batch, vectors):
indexed.append(
{
"path": path,
"chunk_index": chunk_index,
"content": content,
"embedding": vector,
"repo_id": repo_id,
"blob_sha": blob_sha,
**metadata,
}
)
return indexed
def _build_cached_items(
self,
path: str,
file: dict,
repo_id: str,
blob_sha: str,
cached: list[dict],
) -> list[dict]:
metadata = self._chunk_metadata(file)
output: list[dict] = []
for item in cached:
output.append(
{
"path": path,
"chunk_index": int(item["chunk_index"]),
"content": str(item["content"]),
"embedding": item.get("embedding") or [],
"repo_id": repo_id,
"blob_sha": blob_sha,
**metadata,
"section": item.get("section") or metadata.get("section"),
}
)
return output
def _resolve_repo_id(self, rag_session_id: str) -> str:
session = self._repo.get_session(rag_session_id)
if not session:
return rag_session_id
return str(session.get("project_id") or rag_session_id)
def _blob_sha(self, file: dict) -> str:
raw = str(file.get("content_hash") or "").strip()
if raw:
return raw
content = str(file.get("content") or "")
return hashlib.sha256(content.encode("utf-8")).hexdigest()
def _chunk_metadata(self, file: dict) -> dict:
return {
"artifact_type": file.get("artifact_type"),
"section": file.get("section"),
"doc_id": file.get("doc_id"),
"doc_version": file.get("doc_version"),
"owner": file.get("owner"),
"system_component": file.get("system_component"),
"last_modified": file.get("last_modified"),
"staleness_score": file.get("staleness_score"),
}
async def _notify_progress(
self,
progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None,
current_file_index: int,
total_files: int,
current_file_name: str,
) -> None:
if not progress_cb:
return
result = progress_cb(current_file_index, total_files, current_file_name)
if isawaitable(result):
await result

View File

@@ -0,0 +1,131 @@
from app.modules.agent.engine.orchestrator.actions.explain_actions import ExplainActions
from app.modules.agent.engine.orchestrator.execution_context import ExecutionContext
from app.modules.agent.engine.orchestrator.models import (
ExecutionPlan,
OutputContract,
RoutingMeta,
Scenario,
TaskConstraints,
TaskSpec,
)
def _ctx(rag_items: list[dict]) -> ExecutionContext:
task = TaskSpec(
task_id="task-1",
dialog_session_id="dialog-1",
rag_session_id="rag-1",
user_message="Объясни по коду как работает task_processor",
scenario=Scenario.EXPLAIN_PART,
routing=RoutingMeta(domain_id="project", process_id="qa", confidence=0.9, reason="test"),
constraints=TaskConstraints(),
output_contract=OutputContract(result_type="answer"),
metadata={
"rag_items": rag_items,
"rag_context": "",
"confluence_context": "",
"files_map": {},
},
)
plan = ExecutionPlan(
plan_id="plan-1",
task_id="task-1",
scenario=Scenario.EXPLAIN_PART,
template_id="tpl",
template_version="1",
steps=[],
)
return ExecutionContext(task=task, plan=plan, graph_resolver=lambda *_: None, graph_invoker=lambda *_: {})
def test_explain_actions_switch_to_code_profile_when_code_layers_present() -> None:
ctx = _ctx(
[
{
"source": "app/task_processor.py",
"layer": "C1_SYMBOL_CATALOG",
"title": "task_processor.process_task",
"content": "function task_processor.process_task(task)",
"metadata": {"qname": "task_processor.process_task", "kind": "function"},
},
{
"source": "app/task_processor.py",
"layer": "C2_DEPENDENCY_GRAPH",
"title": "task_processor.process_task:calls",
"content": "task_processor.process_task calls queue.publish",
"metadata": {"edge_type": "calls"},
},
]
)
actions = ExplainActions()
actions.collect_sources(ctx)
actions.extract_logic(ctx)
actions.summarize(ctx)
sources = ctx.artifacts.get_content("sources", {})
assert sources["source_profile"] == "code"
answer = str(ctx.artifacts.get_content("final_answer", ""))
assert "кодовых слоев индекса" not in answer
assert "CodeRAG" not in answer
assert "app/task_processor.py" in answer
assert "requirements/docs context" not in answer
def test_explain_actions_add_code_details_block() -> None:
ctx = _ctx(
[
{
"source": "src/config_manager/__init__.py",
"layer": "C1_SYMBOL_CATALOG",
"title": "ConfigManager",
"content": "const ConfigManager\nConfigManager = config_manager.v2.ConfigManagerV2",
"metadata": {
"qname": "ConfigManager",
"kind": "const",
"lang_payload": {"imported_from": "v2.ConfigManagerV2", "import_alias": True},
},
},
{
"source": "src/config_manager/v2/control/base.py",
"layer": "C1_SYMBOL_CATALOG",
"title": "ControlChannel",
"content": "class ControlChannel\nControlChannel(ABC)",
"metadata": {"qname": "ControlChannel", "kind": "class"},
},
{
"source": "src/config_manager/v2/core/control_bridge.py",
"layer": "C1_SYMBOL_CATALOG",
"title": "ControlChannelBridge",
"content": "class ControlChannelBridge\nПредоставляет halt и status как обработчики start/stop/status",
"metadata": {"qname": "ControlChannelBridge", "kind": "class"},
},
{
"source": "src/config_manager/v2/core/control_bridge.py",
"layer": "C2_DEPENDENCY_GRAPH",
"title": "ControlChannelBridge.on_start:calls",
"content": "ControlChannelBridge.on_start calls self._start_runtime",
"metadata": {"src_qname": "ControlChannelBridge.on_start", "dst_ref": "self._start_runtime"},
},
{
"source": "src/config_manager/v2/__init__.py",
"layer": "C0_SOURCE_CHUNKS",
"title": "src/config_manager/v2/__init__.py:1-6",
"content": '"""Контракт: управление через API (config.yaml, секция management)."""',
"metadata": {},
},
]
)
actions = ExplainActions()
actions.collect_sources(ctx)
actions.extract_logic(ctx)
actions.summarize(ctx)
answer = str(ctx.artifacts.get_content("final_answer", ""))
assert "### Что видно по коду" in answer
assert "ConfigManager` в проекте доступен как alias" in answer
assert "ControlChannelBridge.on_start" in answer
assert "### Где смотреть в проекте" in answer
assert "В индексе нет точного символа" not in answer
assert "отдельный интерфейс управления" in answer

View File

@@ -0,0 +1,57 @@
from app.modules.rag.contracts.enums import RagLayer
from app.modules.rag.indexing.code.pipeline import CodeIndexingPipeline
def test_code_pipeline_builds_source_symbols_edges_and_entrypoints() -> None:
pipeline = CodeIndexingPipeline()
content = """
from fastapi import APIRouter
router = APIRouter()
class UserService:
def get_user(self, user_id):
return user_id
@router.get("/users/{user_id}")
async def get_user(user_id: str):
service = UserService()
return service.get_user(user_id)
"""
docs = pipeline.index_file(
repo_id="acme/proj",
commit_sha="abc123",
path="app/api/users.py",
content=content,
)
layers = {doc.layer for doc in docs}
assert RagLayer.CODE_SOURCE_CHUNKS in layers
assert RagLayer.CODE_SYMBOL_CATALOG in layers
assert RagLayer.CODE_DEPENDENCY_GRAPH in layers
assert RagLayer.CODE_ENTRYPOINTS in layers
symbol_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_SYMBOL_CATALOG and doc.metadata["kind"] == "function")
assert "get_user" in symbol_doc.metadata["qname"]
edge_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_DEPENDENCY_GRAPH)
assert edge_doc.metadata["edge_type"] in {"calls", "imports", "inherits"}
entry_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_ENTRYPOINTS)
assert entry_doc.metadata["framework"] == "fastapi"
def test_code_pipeline_indexes_import_alias_as_symbol() -> None:
    """A re-exported import alias must land in the symbol catalog.

    ``ConfigManager`` is only an alias for ``ConfigManagerV2``; the pipeline
    should record it as a ``const`` symbol flagged with ``import_alias``.
    """
    docs = CodeIndexingPipeline().index_file(
        repo_id="acme/proj",
        commit_sha="abc123",
        path="src/config_manager/__init__.py",
        content="from .v2 import ConfigManagerV2 as ConfigManager\n",
    )
    alias_doc = next(
        doc
        for doc in docs
        if doc.layer == RagLayer.CODE_SYMBOL_CATALOG
        and doc.metadata["qname"] == "ConfigManager"
    )
    assert alias_doc.metadata["kind"] == "const"
    assert alias_doc.metadata["lang_payload"]["import_alias"] is True

View File

@@ -0,0 +1,63 @@
from app.modules.rag.contracts.enums import RagLayer
from app.modules.rag.indexing.docs.pipeline import DocsIndexingPipeline
def test_docs_pipeline_builds_catalog_facts_sections_and_policy() -> None:
    """Indexing a policy markdown doc must fill all four docs RAG layers.

    The fixture carries YAML frontmatter (module id, type, links) plus
    contract / error / rules sections, so catalog, fact, section and policy
    layers should all produce documents with the expected metadata.
    """
    content = """---
id: api.billing.create_invoice
type: policy
domain: billing
links:
  calls_api:
    - api.billing.validate_invoice
tags: [billing]
status: active
---
# Create Invoice
## Spec Summary
Creates an invoice in billing.
## Request Contract
| field | type | required | validation |
| --- | --- | --- | --- |
| amount | decimal | yes | > 0 |
## Error Matrix
| status | error | client action |
| --- | --- | --- |
| 400 | invalid_amount | fix request |
## Rules
- metric: billing.invoice.created
- rule: amount must be positive
"""
    docs = DocsIndexingPipeline().index_file(
        repo_id="acme/proj",
        commit_sha="abc123",
        path="docs/billing/create_invoice.md",
        content=content,
    )

    produced_layers = {doc.layer for doc in docs}
    assert {
        RagLayer.DOCS_MODULE_CATALOG,
        RagLayer.DOCS_FACT_INDEX,
        RagLayer.DOCS_SECTION_INDEX,
        RagLayer.DOCS_POLICY_INDEX,
    } <= produced_layers

    # Frontmatter fields should flow into the module catalog entry.
    module_doc = next(doc for doc in docs if doc.layer == RagLayer.DOCS_MODULE_CATALOG)
    assert module_doc.metadata["module_id"] == "api.billing.create_invoice"
    assert module_doc.metadata["type"] == "policy"

    # Facts extracted from links, the request table and the error matrix.
    fact_texts = [doc.text for doc in docs if doc.layer == RagLayer.DOCS_FACT_INDEX]
    for marker in ("calls_api", "has_field", "returns_error"):
        assert any(marker in text for text in fact_texts)

    # Section documents must keep their heading path.
    section_doc = next(doc for doc in docs if doc.layer == RagLayer.DOCS_SECTION_INDEX)
    assert section_doc.metadata["section_path"]

View File

@@ -0,0 +1,12 @@
from app.modules.rag.contracts.enums import RetrievalMode
from app.modules.rag.retrieval.query_router import RagQueryRouter
def test_query_router_uses_docs_by_default() -> None:
    """A plain requirements question should be routed to the DOCS corpus."""
    mode = RagQueryRouter().resolve_mode("Какие есть требования по биллингу?")
    assert mode == RetrievalMode.DOCS
def test_query_router_switches_to_code_on_explicit_code_requests() -> None:
    """An explicit mention of code/endpoints should flip routing to CODE."""
    mode = RagQueryRouter().resolve_mode(
        "Объясни как работает код endpoint create invoice"
    )
    assert mode == RetrievalMode.CODE

View File

@@ -0,0 +1,9 @@
from app.modules.rag.retrieval.query_terms import extract_query_terms
def test_extract_query_terms_from_code_question() -> None:
    """Term extraction should normalise identifiers and add variants.

    From a mixed-case mention of ``COnfigmanager`` in a Russian question we
    expect the lowercased form, the snake_case form, and a semantic variant
    ("control") to all appear in the extracted terms.
    """
    terms = extract_query_terms("Объясни по коду как можно управлять COnfigmanager?")
    for expected in ("configmanager", "config_manager", "control"):
        assert expected in terms