diff --git a/app/modules/__pycache__/application.cpython-312.pyc b/app/modules/__pycache__/application.cpython-312.pyc index ad16c08..8a36584 100644 Binary files a/app/modules/__pycache__/application.cpython-312.pyc and b/app/modules/__pycache__/application.cpython-312.pyc differ diff --git a/app/modules/agent/engine/orchestrator/__pycache__/step_registry.cpython-312.pyc b/app/modules/agent/engine/orchestrator/__pycache__/step_registry.cpython-312.pyc index d945215..83b7e29 100644 Binary files a/app/modules/agent/engine/orchestrator/__pycache__/step_registry.cpython-312.pyc and b/app/modules/agent/engine/orchestrator/__pycache__/step_registry.cpython-312.pyc differ diff --git a/app/modules/agent/engine/orchestrator/__pycache__/task_spec_builder.cpython-312.pyc b/app/modules/agent/engine/orchestrator/__pycache__/task_spec_builder.cpython-312.pyc index d8182c7..82a2c03 100644 Binary files a/app/modules/agent/engine/orchestrator/__pycache__/task_spec_builder.cpython-312.pyc and b/app/modules/agent/engine/orchestrator/__pycache__/task_spec_builder.cpython-312.pyc differ diff --git a/app/modules/agent/engine/orchestrator/__pycache__/template_registry.cpython-312.pyc b/app/modules/agent/engine/orchestrator/__pycache__/template_registry.cpython-312.pyc index d944184..0a99665 100644 Binary files a/app/modules/agent/engine/orchestrator/__pycache__/template_registry.cpython-312.pyc and b/app/modules/agent/engine/orchestrator/__pycache__/template_registry.cpython-312.pyc differ diff --git a/app/modules/agent/engine/orchestrator/actions/__pycache__/explain_actions.cpython-312.pyc b/app/modules/agent/engine/orchestrator/actions/__pycache__/explain_actions.cpython-312.pyc index ad3fd1e..1c0860b 100644 Binary files a/app/modules/agent/engine/orchestrator/actions/__pycache__/explain_actions.cpython-312.pyc and b/app/modules/agent/engine/orchestrator/actions/__pycache__/explain_actions.cpython-312.pyc differ diff --git 
a/app/modules/agent/engine/orchestrator/actions/explain_actions.py b/app/modules/agent/engine/orchestrator/actions/explain_actions.py index eb97284..42ea04e 100644 --- a/app/modules/agent/engine/orchestrator/actions/explain_actions.py +++ b/app/modules/agent/engine/orchestrator/actions/explain_actions.py @@ -1,5 +1,7 @@ from __future__ import annotations +from collections import Counter + from app.modules.agent.engine.orchestrator.actions.common import ActionSupport from app.modules.agent.engine.orchestrator.execution_context import ExecutionContext from app.modules.agent.engine.orchestrator.models import ArtifactType @@ -7,22 +9,28 @@ from app.modules.agent.engine.orchestrator.models import ArtifactType class ExplainActions(ActionSupport): def collect_sources(self, ctx: ExecutionContext) -> list[str]: + rag_items = list(ctx.task.metadata.get("rag_items", []) or []) rag_context = str(ctx.task.metadata.get("rag_context", "")) confluence_context = str(ctx.task.metadata.get("confluence_context", "")) files_map = dict(ctx.task.metadata.get("files_map", {}) or {}) payload = { + "rag_items": rag_items, "rag_context": rag_context, "confluence_context": confluence_context, "files_count": len(files_map), + "source_profile": self._source_profile(rag_items), } evidence_ids: list[str] = [] - if rag_context.strip(): + for item in rag_items[:5]: + snippet = str(item.get("content", "") or "").strip() + if not snippet: + continue evidence_ids.append( self.add_evidence( ctx, source_type="rag_chunk", - source_ref=ctx.task.rag_session_id, - snippet=rag_context, + source_ref=str(item.get("source", ctx.task.rag_session_id)), + snippet=snippet, score=0.9, ) ) @@ -38,50 +46,214 @@ class ExplainActions(ActionSupport): def extract_logic(self, ctx: ExecutionContext) -> list[str]: sources = self.get(ctx, "sources", {}) or {} message = ctx.task.user_message + profile = str(sources.get("source_profile", "docs")) + ru = self._is_russian(message) + notes = ( + "Используй код как основной 
источник и ссылайся на конкретные файлы и слои." + if profile == "code" and ru + else "Use code as the primary source and cite concrete files/layers." + if profile == "code" + else "Используй требования и документацию как основной источник." + if ru + else "Use requirements/docs as primary source over code." + ) logic = { "request": message, - "assumptions": ["requirements-first"], - "notes": "Use requirements/docs as primary source over code.", + "assumptions": [f"{profile}-first"], + "notes": notes, "source_summary": sources, } return [self.put(ctx, "logic_model", ArtifactType.STRUCTURED_JSON, logic)] - def build_sequence(self, ctx: ExecutionContext) -> list[str]: - message = ctx.task.user_message - mermaid = "\n".join( - [ - "```mermaid", - "sequenceDiagram", - "participant User", - "participant Agent", - "participant Docs", - "User->>Agent: " + message[:80], - "Agent->>Docs: Find relevant requirements", - "Docs-->>Agent: Relevant context", - "Agent-->>User: Structured explanation", - "```", - ] - ) - return [self.put(ctx, "sequence_diagram", ArtifactType.TEXT, mermaid)] - - def build_use_cases(self, ctx: ExecutionContext) -> list[str]: - lines = [ - "### Use Cases", - "- Analyze requirement fragments relevant to user question", - "- Reconstruct behavior flow and decision points", - "- Return user-focused explanation with constraints", - ] - return [self.put(ctx, "use_cases", ArtifactType.TEXT, "\n".join(lines))] - def summarize(self, ctx: ExecutionContext) -> list[str]: - sequence = str(self.get(ctx, "sequence_diagram", "") or "") - use_cases = str(self.get(ctx, "use_cases", "") or "") - answer = "\n\n".join( - [ - "## Summary", - "The requested project part is explained from requirements/docs context.", - sequence, - use_cases, - ] - ) + sources = self.get(ctx, "sources", {}) or {} + profile = str(sources.get("source_profile", "docs")) + items = list(sources.get("rag_items", []) or []) + message = ctx.task.user_message + ru = self._is_russian(message) + answer 
= self._code_answer(items, russian=ru) if profile == "code" else self._docs_answer(items, russian=ru) return [self.put(ctx, "final_answer", ArtifactType.TEXT, answer)] + + def _source_profile(self, items: list[dict]) -> str: + layers = [str(item.get("layer", "") or "") for item in items] + if any(layer.startswith("C") for layer in layers): + return "code" + return "docs" + + def _is_russian(self, text: str) -> bool: + return any("а" <= ch.lower() <= "я" or ch.lower() == "ё" for ch in text) + + def _code_answer(self, items: list[dict], *, russian: bool) -> str: + if not items: + return ( + "Не удалось найти релевантный кодовый контекст по этому запросу." + if russian + else "No relevant code context was found for this request." + ) + details = self._code_details(items, russian=russian) + refs = self._code_references(items, russian=russian) + parts = [ + "## Кратко" if russian else "## Summary", + details, + ] + if refs: + parts.append(refs) + return "\n\n".join(part for part in parts if part.strip()) + + def _docs_answer(self, items: list[dict], *, russian: bool) -> str: + return ( + "Запрошенная часть проекта объяснена на основе требований и документации." + if russian + else "The requested project part is explained from requirements/docs context." 
+ ) + + def _code_details(self, items: list[dict], *, russian: bool) -> str: + if not items: + return "" + symbol_items = [item for item in items if str(item.get("layer", "")) == "C1_SYMBOL_CATALOG"] + edge_items = [item for item in items if str(item.get("layer", "")) == "C2_DEPENDENCY_GRAPH"] + source_items = [item for item in items if str(item.get("layer", "")) == "C0_SOURCE_CHUNKS"] + + lines = ["### Что видно по коду" if russian else "### What the code shows"] + alias = self._find_alias_symbol(symbol_items) + if alias: + imported_from = str(alias.get("metadata", {}).get("lang_payload", {}).get("imported_from", "")).strip() + if russian: + lines.append(f"- `ConfigManager` в проекте доступен как alias в `{alias.get('source', '')}` и указывает на `{imported_from}`.") + else: + lines.append(f"- `ConfigManager` is exposed as an alias in `{alias.get('source', '')}` and points to `{imported_from}`.") + + management_hint = self._management_summary(symbol_items, edge_items, source_items, russian=russian) + if management_hint: + lines.extend(management_hint) + + symbol_lines = 0 + for item in symbol_items[:4]: + title = str(item.get("title", "") or "") + source = str(item.get("source", "") or "") + content = str(item.get("content", "") or "").strip() + summary = content.splitlines()[-1].strip() if content else "" + if not title: + continue + if self._is_test_path(source): + continue + if self._is_control_symbol(title): + continue + if russian: + lines.append(f"- Символ `{title}` из `{source}`: {summary}") + else: + lines.append(f"- Symbol `{title}` from `{source}`: {summary}") + symbol_lines += 1 + if symbol_lines >= 2: + break + + edge_map: dict[str, list[str]] = {} + for item in edge_items: + meta = item.get("metadata", {}) or {} + src_qname = str(meta.get("src_qname", "") or "").strip() + dst_ref = str(meta.get("dst_ref", "") or "").strip() + if not src_qname or not dst_ref: + continue + if self._is_test_path(str(item.get("source", "") or "")): + continue + 
edge_map.setdefault(src_qname, []) + if dst_ref not in edge_map[src_qname]: + edge_map[src_qname].append(dst_ref) + for src_qname, targets in list(edge_map.items())[:3]: + joined = ", ".join(targets[:4]) + if russian: + lines.append(f"- `{src_qname}` вызывает или использует: {joined}.") + else: + lines.append(f"- `{src_qname}` calls or uses: {joined}.") + + for item in source_items[:2]: + source = str(item.get("source", "") or "") + content = str(item.get("content", "") or "") + if self._is_test_path(source): + continue + if "management" in content.lower() or "control" in content.lower(): + snippet = " ".join(content.splitlines()[:4]).strip() + if russian: + lines.append(f"- В `{source}` есть прямое указание на управление через конфиг/API: `{snippet[:220]}`") + else: + lines.append(f"- `{source}` directly mentions config/API control: `{snippet[:220]}`") + + return "\n".join(lines) + + def _code_references(self, items: list[dict], *, russian: bool) -> str: + paths = [str(item.get("source", "") or "") for item in items if item.get("source") and not self._is_test_path(str(item.get("source", "") or ""))] + if not paths: + return "" + lines = ["### Где смотреть в проекте" if russian else "### Where to look in the project"] + for path, _count in Counter(paths).most_common(3): + lines.append(f"- `{path}`") + return "\n".join(lines) + + def _find_alias_symbol(self, items: list[dict]) -> dict | None: + for item in items: + meta = item.get("metadata", {}) or {} + payload = meta.get("lang_payload", {}) or {} + qname = str(meta.get("qname", "") or "") + if qname == "ConfigManager" and payload.get("import_alias"): + return item + return None + + def _is_test_path(self, path: str) -> bool: + lowered = path.lower() + return lowered.startswith("tests/") or "/tests/" in lowered or lowered.startswith("test_") or "/test_" in lowered + + def _is_control_symbol(self, title: str) -> bool: + lowered = title.lower() + return any(token in lowered for token in ("controlchannel", 
"controlchannelbridge", "on_start", "on_stop", "on_status")) + + def _management_summary( + self, + symbol_items: list[dict], + edge_items: list[dict], + source_items: list[dict], + *, + russian: bool, + ) -> list[str]: + qnames = {str((item.get("metadata", {}) or {}).get("qname", "") or ""): item for item in symbol_items if not self._is_test_path(str(item.get("source", "") or ""))} + source_texts = [str(item.get("content", "") or "") for item in source_items if not self._is_test_path(str(item.get("source", "") or ""))] + result: list[str] = [] + + if any("управление через api" in text.lower() or "section management" in text.lower() or "секция management" in text.lower() for text in source_texts): + result.append( + "- Для `ConfigManager` в коде предусмотрен отдельный интерфейс управления через API/конфиг: это прямо указано в публичной точке входа модуля." + if russian + else "- `ConfigManager` has a dedicated API/config-based management interface; this is stated in the module's public entrypoint." + ) + + has_control_channel = "ControlChannel" in qnames + has_bridge = "ControlChannelBridge" in qnames + if has_control_channel: + result.append( + "- Базовый контракт управления задает `ControlChannel`: он определяет команды `start` и `stop` для внешнего канала управления." + if russian + else "- The base management contract is `ControlChannel`, which defines external `start` and `stop` commands." + ) + if has_bridge: + result.append( + "- `ControlChannelBridge` связывает внешний канал управления с lifecycle-методами менеджера: `on_start`, `on_stop`, `on_status`." + if russian + else "- `ControlChannelBridge` maps the external control channel to manager lifecycle methods: `on_start`, `on_stop`, `on_status`." 
+ ) + + edge_refs = [] + for item in edge_items: + if self._is_test_path(str(item.get("source", "") or "")): + continue + meta = item.get("metadata", {}) or {} + src = str(meta.get("src_qname", "") or "") + dst = str(meta.get("dst_ref", "") or "") + if src.startswith("ControlChannelBridge.") and dst in {"self._start_runtime", "self._stop_runtime", "self._get_status"}: + edge_refs.append((src, dst)) + if edge_refs: + mappings = ", ".join(f"{src} -> {dst}" for src, dst in edge_refs[:3]) + result.append( + f"- По связям в коде видно, что команды управления маршрутизируются так: {mappings}." + if russian + else f"- The code relationships show the management command routing: {mappings}." + ) + return result diff --git a/app/modules/agent/engine/orchestrator/step_registry.py b/app/modules/agent/engine/orchestrator/step_registry.py index 918809e..736c473 100644 --- a/app/modules/agent/engine/orchestrator/step_registry.py +++ b/app/modules/agent/engine/orchestrator/step_registry.py @@ -24,8 +24,6 @@ class StepRegistry: "finalize_graph_output": self._finalize_graph_output, "collect_sources": explain.collect_sources, "extract_logic": explain.extract_logic, - "build_sequence": explain.build_sequence, - "build_use_cases": explain.build_use_cases, "summarize": explain.summarize, "fetch_source_doc": review.fetch_source_doc, "normalize_document": review.normalize_document, diff --git a/app/modules/agent/engine/orchestrator/task_spec_builder.py b/app/modules/agent/engine/orchestrator/task_spec_builder.py index 1c85dda..858bea7 100644 --- a/app/modules/agent/engine/orchestrator/task_spec_builder.py +++ b/app/modules/agent/engine/orchestrator/task_spec_builder.py @@ -24,6 +24,7 @@ class TaskSpecBuilder: route: RoutingMeta, attachments: list[dict], files: list[dict], + rag_items: list[dict], rag_context: str, confluence_context: str, files_map: dict[str, dict], @@ -32,6 +33,7 @@ class TaskSpecBuilder: output_contract = self._output_contract(scenario) constraints = 
self._constraints_for(scenario) metadata = { + "rag_items": rag_items, "rag_context": rag_context, "confluence_context": confluence_context, "files_map": files_map, @@ -124,14 +126,7 @@ class TaskSpecBuilder: def _output_contract(self, scenario: Scenario) -> OutputContract: if scenario == Scenario.EXPLAIN_PART: - return OutputContract( - result_type="answer", - sections=[ - OutputSection(name="sequence_diagram", format="mermaid"), - OutputSection(name="use_cases", format="markdown"), - OutputSection(name="summary", format="markdown"), - ], - ) + return OutputContract(result_type="answer", sections=[OutputSection(name="summary", format="markdown")]) if scenario == Scenario.ANALYTICS_REVIEW: return OutputContract( result_type="review_report", diff --git a/app/modules/agent/engine/orchestrator/template_registry.py b/app/modules/agent/engine/orchestrator/template_registry.py index d1b6ff1..b6554a1 100644 --- a/app/modules/agent/engine/orchestrator/template_registry.py +++ b/app/modules/agent/engine/orchestrator/template_registry.py @@ -43,9 +43,7 @@ class ScenarioTemplateRegistry: steps = [ self._step("collect_sources", "Collect sources", "collect_sources", outputs=[self._out("sources", ArtifactType.STRUCTURED_JSON)]), self._step("extract_logic", "Extract logic", "extract_logic", depends_on=["collect_sources"], outputs=[self._out("logic_model", ArtifactType.STRUCTURED_JSON)]), - self._step("build_sequence", "Build sequence", "build_sequence", depends_on=["extract_logic"], outputs=[self._out("sequence_diagram", ArtifactType.TEXT)]), - self._step("build_use_cases", "Build use cases", "build_use_cases", depends_on=["extract_logic"], outputs=[self._out("use_cases", ArtifactType.TEXT)]), - self._step("summarize", "Summarize", "summarize", depends_on=["build_sequence", "build_use_cases"], outputs=[self._out("final_answer", ArtifactType.TEXT)]), + self._step("summarize", "Summarize", "summarize", depends_on=["extract_logic"], outputs=[self._out("final_answer", 
ArtifactType.TEXT)]), ] return self._plan(task, "explain_part_v1", steps, [self._gate("evidence_required"), self._gate("non_empty_answer_or_changeset")]) diff --git a/app/modules/agent/service.py b/app/modules/agent/service.py index 9467b56..0b3a114 100644 --- a/app/modules/agent/service.py +++ b/app/modules/agent/service.py @@ -117,6 +117,7 @@ class GraphAgentRuntime: route=route_meta, attachments=attachments, files=files, + rag_items=rag_ctx, rag_context=self._format_rag(rag_ctx), confluence_context=self._format_confluence(conf_pages), files_map=files_map, @@ -368,7 +369,33 @@ class GraphAgentRuntime: return pages def _format_rag(self, items: list[dict]) -> str: - return "\n".join(str(x.get("content", "")) for x in items) + blocks: list[str] = [] + for item in items: + source = str(item.get("source", "") or item.get("path", "") or "") + layer = str(item.get("layer", "") or "").strip() + title = str(item.get("title", "") or "").strip() + metadata = item.get("metadata", {}) or {} + lines = [] + if source: + lines.append(f"Source: {source}") + if layer: + lines.append(f"Layer: {layer}") + if title: + lines.append(f"Title: {title}") + if metadata: + hints = [] + for key in ("module_id", "qname", "predicate", "entry_type", "framework", "section_path"): + value = metadata.get(key) + if value: + hints.append(f"{key}={value}") + if hints: + lines.append("Meta: " + ", ".join(hints)) + content = str(item.get("content", "")).strip() + if content: + lines.append(content) + if lines: + blocks.append("\n".join(lines)) + return "\n\n".join(blocks) def _format_confluence(self, pages: list[dict]) -> str: return "\n".join(str(x.get("content_markdown", "")) for x in pages) diff --git a/app/modules/application.py b/app/modules/application.py index 8f4cbba..caab8f3 100644 --- a/app/modules/application.py +++ b/app/modules/application.py @@ -3,7 +3,7 @@ from app.modules.agent.repository import AgentRepository from app.modules.agent.story_context_repository import 
StoryContextRepository, StoryContextSchemaRepository from app.modules.chat.repository import ChatRepository from app.modules.chat.module import ChatModule -from app.modules.rag_session.repository import RagRepository +from app.modules.rag.persistence.repository import RagRepository from app.modules.rag_session.module import RagModule from app.modules.rag_repo.module import RagRepoModule from app.modules.shared.bootstrap import bootstrap_database diff --git a/app/modules/rag/README.md b/app/modules/rag/README.md new file mode 100644 index 0000000..a30afbc --- /dev/null +++ b/app/modules/rag/README.md @@ -0,0 +1,300 @@ +# Модуль rag + +## 1. Функции модуля +- Единое ядро RAG для индексации и retrieval по документации и коду проекта. +- Поддержка двух семейств индексации: `DOCS` и `CODE`, с разными слоями и разными pipeline. +- Хранение `rag_session`, index-jobs, многослойных документов, cache-слоев и retrieval-запросов. +- Поддержка индексации snapshot и changes с переиспользованием cache по `blob_sha`. +- Предоставление контекста для agent/chat, где `DOCS` используется по умолчанию, а `CODE` включается для явных вопросов по реализации. + +## 2. Диаграмма классов и взаимосвязей +```mermaid +classDiagram + class RagService + class RagRepository + class RagSchemaRepository + class RagDocumentUpserter + class DocsIndexingPipeline + class CodeIndexingPipeline + class RagQueryRouter + class GigaChatEmbedder + + RagService --> RagRepository + RagService --> DocsIndexingPipeline + RagService --> CodeIndexingPipeline + RagService --> RagQueryRouter + RagService --> GigaChatEmbedder + RagRepository --> RagSchemaRepository + RagService --> RagDocumentUpserter +``` + +## 3. Описание классов +- `RagService`: основной application-service модуля. + Методы: `index_snapshot` — индексирует полный набор файлов; `index_changes` — применяет инкрементальные изменения; `retrieve` — возвращает релевантный контекст из `DOCS` или `CODE`. +- `RagRepository`: фасад persistence-слоя RAG. 
+ Методы: `ensure_tables` — создает/обновляет схему; `upsert_session/get_session/session_exists` — операции по `rag_session`; `create_job/update_job/get_job` — операции по index jobs; `replace_documents/apply_document_changes` — операции по документам; `get_cached_documents/cache_documents` — работа с cache; `retrieve/fallback_chunks` — retrieval. +- `RagSchemaRepository`: управление схемой БД для RAG. + Методы: `ensure_tables` — создает таблицы и индексы; `_ensure_columns` — добавляет новые поля; `_ensure_indexes` — поддерживает индексы для retrieval и фильтрации. +- `RagDocumentUpserter`: батчевый writer многослойных `RagDocument`. + Методы: `replace` — полностью заменяет документы сессии; `apply_changes` — применяет upsert/delete по измененным путям. +- `DocsIndexingPipeline`: pipeline индексации документации. + Методы: `supports` — определяет, относится ли файл к docs; `index_file` — строит документы слоев `D1-D4` для одного файла. +- `CodeIndexingPipeline`: pipeline индексации Python-кода. + Методы: `supports` — определяет, относится ли файл к code; `index_file` — строит документы слоев `C0-C3` для одного файла. +- `RagQueryRouter`: выбирает retrieval mode и активные слои. + Методы: `resolve_mode` — определяет `docs` или `code`; `layers_for_mode` — возвращает набор слоев для retrieval. +- `GigaChatEmbedder`: адаптер embeddings-модели. + Методы: `embed` — возвращает embeddings для списка текстов. + +## 4. Сиквенс-диаграммы API и выполнения + +### Индексация snapshot через текущий `rag_session` facade +Назначение: создать/обновить `rag_session` и построить многослойный индекс по переданным файлам проекта. 
+```mermaid +sequenceDiagram + participant Router as RagModule.APIRouter + participant Sessions as RagSessionStore + participant Indexing as IndexingOrchestrator + participant Rag as RagService + participant Docs as DocsIndexingPipeline + participant Code as CodeIndexingPipeline + participant Repo as RagRepository + + Router->>Sessions: create(project_id) + Sessions-->>Router: rag_session_id + Router->>Indexing: enqueue_snapshot(rag_session_id, files) + Indexing->>Rag: index_snapshot(rag_session_id, files) + loop for each file + Rag->>Docs: supports/index_file + Rag->>Code: supports/index_file + Rag->>Repo: cache_documents(...) + end + Rag->>Repo: replace_documents(...) + Indexing-->>Router: index_job_id,status +``` + +### Retrieval для agent/chat +Назначение: вернуть релевантный контекст из нужного семейства слоев. +```mermaid +sequenceDiagram + participant Agent as GraphAgentRuntime + participant Rag as RagService + participant Router as RagQueryRouter + participant Repo as RagRepository + + Agent->>Rag: retrieve(rag_session_id, query) + Rag->>Router: resolve_mode(query) + Router-->>Rag: docs|code + layers + Rag->>Repo: retrieve(query_embedding, query_text, layers) + Repo-->>Rag: ranked items + Rag-->>Agent: items +``` + +## 5. Слои, фиксируемые в RAG + +### 5.1. Слои DOCS + +#### `D1_MODULE_CATALOG` +Назначение: каталог модулей документации и граф связей между ними. + +Основные атрибуты: +- `module_id` +- `type` +- `domain` +- `title` +- `status` +- `version` +- `tags` +- `owners` +- `links` +- `calls_api` +- `called_by` +- `uses_logic` +- `used_by` +- `reads_db` +- `writes_db` +- `integrates_with` +- `emits_events` +- `consumes_events` +- `source_path` +- `summary_text` + +#### `D2_FACT_INDEX` +Назначение: атомарные факты `subject-predicate-object` с evidence. 
+ +Основные атрибуты: +- `fact_id` +- `subject_id` +- `predicate` +- `object` +- `object_ref` +- `source_path` +- `anchor` +- `line_start` +- `line_end` +- `confidence` +- `tags` + +#### `D3_SECTION_INDEX` +Назначение: семантические секции документации, нарезанные по заголовкам. + +Основные атрибуты: +- `chunk_id` +- `module_id` +- `section_path` +- `section_title` +- `content` +- `source_path` +- `order` +- `tags` +- `domain` +- `type` +- `embedding` + +#### `D4_POLICY_INDEX` +Назначение: глобальные правила и конвенции проекта. + +Основные атрибуты: +- `policy_id` +- `applies_to` +- `rules` +- `default_behaviors` +- `source_path` + +### 5.2. Слои CODE + +#### `C0_SOURCE_CHUNKS` +Назначение: сырой код как источник истины для цитирования и evidence. + +Основные атрибуты: +- `lang` +- `repo_id` +- `commit_sha` +- `path` +- `span` +- `title` +- `text` +- `module_or_unit` +- `chunk_type` +- `symbol_id` +- `hash` + +#### `C1_SYMBOL_CATALOG` +Назначение: каталог символов кода и их деклараций. + +Основные атрибуты: +- `lang` +- `repo_id` +- `commit_sha` +- `symbol_id` +- `qname` +- `kind` +- `decl.path` +- `decl.start_line` +- `decl.end_line` +- `text` +- `visibility` +- `signature` +- `decorators_or_annotations` +- `docstring_or_javadoc` +- `parent_symbol_id` +- `package_or_module` +- `is_entry_candidate` +- `lang_payload` + +#### `C2_DEPENDENCY_GRAPH` +Назначение: связи между сущностями кода. + +Основные атрибуты: +- `lang` +- `repo_id` +- `commit_sha` +- `edge_id` +- `edge_type` +- `src_symbol_id` +- `dst_symbol_id` +- `dst_ref` +- `evidence.path` +- `evidence.start_line` +- `evidence.end_line` +- `text` +- `resolution` +- `callsite_kind` +- `lang_payload` + +#### `C3_ENTRYPOINTS` +Назначение: точки входа приложения и их обработчики. 
+ +Основные атрибуты: +- `lang` +- `repo_id` +- `commit_sha` +- `entry_id` +- `entry_type` +- `framework` +- `route_or_command` +- `handler_symbol_id` +- `evidence.path` +- `evidence.start_line` +- `evidence.end_line` +- `text` +- `http.methods` +- `http.auth` +- `request_model` +- `response_model` +- `cli.args_schema` +- `task.queue` +- `task.cron` +- `tags` +- `lang_payload` + +#### `C4_PUBLIC_API` +Назначение: публичная поверхность API/экспортируемых символов. + +Основные атрибуты: +- `api_id` +- `symbol_id` +- `stability` +- `source_of_truth` +- `versioning_tags` +- `lang_payload` + +#### `C5_BEHAVIOR_SUMMARIES` +Назначение: поведенческие summary с обязательными evidence links. + +Основные атрибуты: +- `target_type` +- `target_id` +- `text` +- `claims` +- `evidence_links` +- `confidence` +- `generated_by` +- `generated_at` + +#### `C6_RUNTIME_TRACES` +Назначение: runtime/trace слой для связи кода и реального исполнения. + +Основные атрибуты: +- `env` +- `trace_id` +- `span_id` +- `symbol_id` +- `entry_id` +- `text` +- `timings` +- `service` +- `host` +- `labels` + +## 6. Правила retrieval +- По умолчанию retrieval идет в `DOCS`. +- `CODE` используется только для явных вопросов по реализации, устройству кода, endpoint'ам, handler'ам и документации “из кода”. +- Для `DOCS` приоритет слоев: `D1 -> D2 -> D3 -> D4`. +- Для `CODE` приоритет слоев: `C3 -> C1 -> C2 -> C0`. + +## 7. Текущий статус реализации +- В первой итерации реализованы `DOCS D1-D4`. +- В первой итерации реализованы `CODE C0-C3`. +- `C4-C6` зафиксированы в контракте и зарезервированы под следующие этапы. +- Текущие `rag_session` и `rag_repo` работают как facade/adapter поверх нового пакета `rag`. 
diff --git a/app/modules/rag/__init__.py b/app/modules/rag/__init__.py new file mode 100644 index 0000000..153b59a --- /dev/null +++ b/app/modules/rag/__init__.py @@ -0,0 +1,25 @@ +from app.modules.rag.contracts import ( + DocKind, + EvidenceLink, + EvidenceType, + RagDocument, + RagLayer, + RagSource, + RagSpan, + RetrievalItem, + RetrievalMode, + RetrievalQuery, +) + +__all__ = [ + "DocKind", + "EvidenceLink", + "EvidenceType", + "RagDocument", + "RagLayer", + "RagSource", + "RagSpan", + "RetrievalItem", + "RetrievalMode", + "RetrievalQuery", +] diff --git a/app/modules/rag/__pycache__/__init__.cpython-312.pyc b/app/modules/rag/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..3f08620 Binary files /dev/null and b/app/modules/rag/__pycache__/__init__.cpython-312.pyc differ diff --git a/app/modules/rag/contracts/__init__.py b/app/modules/rag/contracts/__init__.py new file mode 100644 index 0000000..cdb52d6 --- /dev/null +++ b/app/modules/rag/contracts/__init__.py @@ -0,0 +1,17 @@ +from app.modules.rag.contracts.documents import RagDocument, RagSource, RagSpan +from app.modules.rag.contracts.enums import DocKind, EvidenceType, RagLayer, RetrievalMode +from app.modules.rag.contracts.evidence import EvidenceLink +from app.modules.rag.contracts.retrieval import RetrievalItem, RetrievalQuery + +__all__ = [ + "DocKind", + "EvidenceLink", + "EvidenceType", + "RagDocument", + "RagLayer", + "RagSource", + "RagSpan", + "RetrievalItem", + "RetrievalMode", + "RetrievalQuery", +] diff --git a/app/modules/rag/contracts/__pycache__/__init__.cpython-312.pyc b/app/modules/rag/contracts/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..4620de5 Binary files /dev/null and b/app/modules/rag/contracts/__pycache__/__init__.cpython-312.pyc differ diff --git a/app/modules/rag/contracts/__pycache__/documents.cpython-312.pyc b/app/modules/rag/contracts/__pycache__/documents.cpython-312.pyc new file mode 100644 index 0000000..8b54baa Binary files 
/dev/null and b/app/modules/rag/contracts/__pycache__/documents.cpython-312.pyc differ diff --git a/app/modules/rag/contracts/__pycache__/enums.cpython-312.pyc b/app/modules/rag/contracts/__pycache__/enums.cpython-312.pyc new file mode 100644 index 0000000..f63fb22 Binary files /dev/null and b/app/modules/rag/contracts/__pycache__/enums.cpython-312.pyc differ diff --git a/app/modules/rag/contracts/__pycache__/evidence.cpython-312.pyc b/app/modules/rag/contracts/__pycache__/evidence.cpython-312.pyc new file mode 100644 index 0000000..9cc7013 Binary files /dev/null and b/app/modules/rag/contracts/__pycache__/evidence.cpython-312.pyc differ diff --git a/app/modules/rag/contracts/__pycache__/retrieval.cpython-312.pyc b/app/modules/rag/contracts/__pycache__/retrieval.cpython-312.pyc new file mode 100644 index 0000000..6f9f2a5 Binary files /dev/null and b/app/modules/rag/contracts/__pycache__/retrieval.cpython-312.pyc differ diff --git a/app/modules/rag/contracts/documents.py b/app/modules/rag/contracts/documents.py new file mode 100644 index 0000000..e53fa4f --- /dev/null +++ b/app/modules/rag/contracts/documents.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from dataclasses import asdict, dataclass, field +from hashlib import sha256 + +from app.modules.rag.contracts.evidence import EvidenceLink + + +@dataclass(slots=True) +class RagSource: + repo_id: str + commit_sha: str | None + path: str + + def to_dict(self) -> dict: + return asdict(self) + + +@dataclass(slots=True) +class RagSpan: + start_line: int | None = None + end_line: int | None = None + + def to_dict(self) -> dict: + return asdict(self) + + +@dataclass(slots=True) +class RagDocument: + layer: str + source: RagSource + title: str + text: str + metadata: dict = field(default_factory=dict) + links: list[EvidenceLink] = field(default_factory=list) + span: RagSpan | None = None + doc_id: str | None = None + lang: str | None = None + embedding: list[float] | None = None + + def ensure_doc_id(self) -> 
str: + if self.doc_id: + return self.doc_id + span_key = "" + if self.span is not None: + span_key = f":{self.span.start_line}:{self.span.end_line}" + raw = "|".join( + [ + self.layer, + self.lang or "", + self.source.repo_id, + self.source.commit_sha or "", + self.source.path, + self.metadata.get("symbol_id", "") or self.metadata.get("module_id", ""), + self.title, + span_key, + ] + ) + self.doc_id = sha256(raw.encode("utf-8")).hexdigest() + return self.doc_id + + def to_record(self) -> dict: + return { + "doc_id": self.ensure_doc_id(), + "layer": self.layer, + "lang": self.lang, + "repo_id": self.source.repo_id, + "commit_sha": self.source.commit_sha, + "path": self.source.path, + "title": self.title, + "text": self.text, + "metadata": dict(self.metadata), + "links": [link.to_dict() for link in self.links], + "span_start": self.span.start_line if self.span else None, + "span_end": self.span.end_line if self.span else None, + "embedding": self.embedding or [], + } diff --git a/app/modules/rag/contracts/enums.py b/app/modules/rag/contracts/enums.py new file mode 100644 index 0000000..72502c6 --- /dev/null +++ b/app/modules/rag/contracts/enums.py @@ -0,0 +1,35 @@ +from __future__ import annotations + + +class RagLayer: + DOCS_MODULE_CATALOG = "D1_MODULE_CATALOG" + DOCS_FACT_INDEX = "D2_FACT_INDEX" + DOCS_SECTION_INDEX = "D3_SECTION_INDEX" + DOCS_POLICY_INDEX = "D4_POLICY_INDEX" + CODE_SOURCE_CHUNKS = "C0_SOURCE_CHUNKS" + CODE_SYMBOL_CATALOG = "C1_SYMBOL_CATALOG" + CODE_DEPENDENCY_GRAPH = "C2_DEPENDENCY_GRAPH" + CODE_ENTRYPOINTS = "C3_ENTRYPOINTS" + CODE_PUBLIC_API = "C4_PUBLIC_API" + CODE_BEHAVIOR_SUMMARIES = "C5_BEHAVIOR_SUMMARIES" + CODE_RUNTIME_TRACES = "C6_RUNTIME_TRACES" + + +class RetrievalMode: + DOCS = "docs" + CODE = "code" + + +class DocKind: + SPEC = "spec" + RUNBOOK = "runbook" + README = "readme" + MISC = "misc" + + +class EvidenceType: + CODE_SPAN = "code_span" + SYMBOL = "symbol" + EDGE = "edge" + DOC_SECTION = "doc_section" + DOC_FACT = "doc_fact" 
diff --git a/app/modules/rag/contracts/evidence.py b/app/modules/rag/contracts/evidence.py new file mode 100644 index 0000000..0b1e4d0 --- /dev/null +++ b/app/modules/rag/contracts/evidence.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from dataclasses import asdict, dataclass + + +@dataclass(slots=True) +class EvidenceLink: + type: str + target_id: str + path: str | None = None + start_line: int | None = None + end_line: int | None = None + note: str | None = None + + def to_dict(self) -> dict: + return asdict(self) diff --git a/app/modules/rag/contracts/retrieval.py b/app/modules/rag/contracts/retrieval.py new file mode 100644 index 0000000..97c2fe6 --- /dev/null +++ b/app/modules/rag/contracts/retrieval.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass(slots=True) +class RetrievalQuery: + text: str + mode: str + limit: int = 5 + layers: list[str] = field(default_factory=list) + path_prefixes: list[str] = field(default_factory=list) + doc_kind: str | None = None + + +@dataclass(slots=True) +class RetrievalItem: + content: str + path: str + layer: str + title: str + score: float | None = None + metadata: dict | None = None diff --git a/app/modules/rag/indexing/code/__pycache__/file_filter.cpython-312.pyc b/app/modules/rag/indexing/code/__pycache__/file_filter.cpython-312.pyc new file mode 100644 index 0000000..54cd85a Binary files /dev/null and b/app/modules/rag/indexing/code/__pycache__/file_filter.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/__pycache__/pipeline.cpython-312.pyc b/app/modules/rag/indexing/code/__pycache__/pipeline.cpython-312.pyc new file mode 100644 index 0000000..00bf0df Binary files /dev/null and b/app/modules/rag/indexing/code/__pycache__/pipeline.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/code_text/__pycache__/chunker.cpython-312.pyc b/app/modules/rag/indexing/code/code_text/__pycache__/chunker.cpython-312.pyc new file 
mode 100644 index 0000000..980829c Binary files /dev/null and b/app/modules/rag/indexing/code/code_text/__pycache__/chunker.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/code_text/__pycache__/document_builder.cpython-312.pyc b/app/modules/rag/indexing/code/code_text/__pycache__/document_builder.cpython-312.pyc new file mode 100644 index 0000000..7893715 Binary files /dev/null and b/app/modules/rag/indexing/code/code_text/__pycache__/document_builder.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/code_text/chunker.py b/app/modules/rag/indexing/code/code_text/chunker.py new file mode 100644 index 0000000..4c6112d --- /dev/null +++ b/app/modules/rag/indexing/code/code_text/chunker.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import ast +from dataclasses import dataclass + + +@dataclass(slots=True) +class CodeChunk: + title: str + text: str + start_line: int + end_line: int + chunk_type: str + + +class CodeTextChunker: + def chunk(self, path: str, text: str) -> list[CodeChunk]: + try: + tree = ast.parse(text) + except SyntaxError: + return self._window_chunks(path, text) + chunks: list[CodeChunk] = [] + lines = text.splitlines() + for node in tree.body: + if not isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)): + continue + start = int(getattr(node, "lineno", 1)) + end = int(getattr(node, "end_lineno", start)) + body = "\n".join(lines[start - 1 : end]).strip() + if not body: + continue + chunks.append( + CodeChunk( + title=f"{path}:{getattr(node, 'name', 'block')}", + text=body, + start_line=start, + end_line=end, + chunk_type="symbol_block", + ) + ) + return chunks or self._window_chunks(path, text) + + def _window_chunks(self, path: str, text: str) -> list[CodeChunk]: + lines = text.splitlines() + chunks: list[CodeChunk] = [] + size = 80 + overlap = 15 + start = 0 + while start < len(lines): + end = min(len(lines), start + size) + body = "\n".join(lines[start:end]).strip() + if body: + 
chunks.append(CodeChunk(f"{path}:{start + 1}-{end}", body, start + 1, end, "window")) + if end >= len(lines): + break + start = max(0, end - overlap) + return chunks diff --git a/app/modules/rag/indexing/code/code_text/document_builder.py b/app/modules/rag/indexing/code/code_text/document_builder.py new file mode 100644 index 0000000..c42f37e --- /dev/null +++ b/app/modules/rag/indexing/code/code_text/document_builder.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan +from app.modules.rag.indexing.code.code_text.chunker import CodeChunk + + +class CodeTextDocumentBuilder: + def build(self, source: RagSource, chunk: CodeChunk, *, chunk_index: int) -> RagDocument: + return RagDocument( + layer=RagLayer.CODE_SOURCE_CHUNKS, + lang="python", + source=source, + title=chunk.title, + text=chunk.text, + span=RagSpan(chunk.start_line, chunk.end_line), + metadata={ + "chunk_index": chunk_index, + "chunk_type": chunk.chunk_type, + "module_or_unit": source.path.replace("/", ".").removesuffix(".py"), + "artifact_type": "CODE", + }, + ) diff --git a/app/modules/rag/indexing/code/edges/__pycache__/document_builder.cpython-312.pyc b/app/modules/rag/indexing/code/edges/__pycache__/document_builder.cpython-312.pyc new file mode 100644 index 0000000..4e8a6b1 Binary files /dev/null and b/app/modules/rag/indexing/code/edges/__pycache__/document_builder.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/edges/__pycache__/extractor.cpython-312.pyc b/app/modules/rag/indexing/code/edges/__pycache__/extractor.cpython-312.pyc new file mode 100644 index 0000000..d0a3bc7 Binary files /dev/null and b/app/modules/rag/indexing/code/edges/__pycache__/extractor.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/edges/document_builder.py b/app/modules/rag/indexing/code/edges/document_builder.py new file mode 100644 index 0000000..cc6f784 --- /dev/null +++ 
b/app/modules/rag/indexing/code/edges/document_builder.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan +from app.modules.rag.indexing.code.edges.extractor import PyEdge + + +class EdgeDocumentBuilder: + def build(self, source: RagSource, edge: PyEdge) -> RagDocument: + dst = edge.dst_ref or edge.dst_symbol_id or "unknown" + return RagDocument( + layer=RagLayer.CODE_DEPENDENCY_GRAPH, + lang="python", + source=source, + title=f"{edge.src_qname}:{edge.edge_type}", + text=f"{edge.src_qname} {edge.edge_type} {dst}", + span=RagSpan(edge.start_line, edge.end_line), + metadata={ + "edge_id": edge.edge_id, + "edge_type": edge.edge_type, + "src_symbol_id": edge.src_symbol_id, + "src_qname": edge.src_qname, + "dst_symbol_id": edge.dst_symbol_id, + "dst_ref": edge.dst_ref, + "resolution": edge.resolution, + "lang_payload": edge.metadata, + "artifact_type": "CODE", + }, + links=[EvidenceLink(type=EvidenceType.EDGE, target_id=edge.edge_id, path=source.path, start_line=edge.start_line, end_line=edge.end_line)], + ) diff --git a/app/modules/rag/indexing/code/edges/extractor.py b/app/modules/rag/indexing/code/edges/extractor.py new file mode 100644 index 0000000..1eaee75 --- /dev/null +++ b/app/modules/rag/indexing/code/edges/extractor.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import ast +from dataclasses import dataclass, field +from hashlib import sha256 + + +@dataclass(slots=True) +class PyEdge: + edge_id: str + edge_type: str + src_symbol_id: str + src_qname: str + dst_symbol_id: str | None + dst_ref: str | None + path: str + start_line: int + end_line: int + resolution: str = "partial" + metadata: dict = field(default_factory=dict) + + +class EdgeExtractor: + def extract(self, path: str, ast_tree: ast.AST | None, symbols: list) -> list[PyEdge]: + if ast_tree is None: + return [] + qname_map = {symbol.qname: symbol.symbol_id for symbol in 
symbols} + visitor = _EdgeVisitor(path, qname_map) + visitor.visit(ast_tree) + return visitor.edges + + +class _EdgeVisitor(ast.NodeVisitor): + def __init__(self, path: str, qname_map: dict[str, str]) -> None: + self._path = path + self._qname_map = qname_map + self._scope: list[str] = [] + self.edges: list[PyEdge] = [] + + def visit_ClassDef(self, node: ast.ClassDef) -> None: + current = self._enter(node.name) + for base in node.bases: + self._add_edge("inherits", current, self._name(base), base) + self.generic_visit(node) + self._scope.pop() + + def visit_FunctionDef(self, node: ast.FunctionDef) -> None: + self._visit_function(node) + + def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None: + self._visit_function(node) + + def visit_Import(self, node: ast.Import) -> None: + current = self._current_qname() + if not current: + return + for item in node.names: + self._add_edge("imports", current, item.name, node) + + def visit_ImportFrom(self, node: ast.ImportFrom) -> None: + current = self._current_qname() + if not current: + return + module = node.module or "" + for item in node.names: + self._add_edge("imports", current, f"{module}.{item.name}".strip("."), node) + + def _visit_function(self, node) -> None: + current = self._enter(node.name) + for inner in ast.walk(node): + if isinstance(inner, ast.Call): + self._add_edge("calls", current, self._name(inner.func), inner, {"callsite_kind": "function_call"}) + self.generic_visit(node) + self._scope.pop() + + def _enter(self, name: str) -> str: + self._scope.append(name) + return self._current_qname() or name + + def _current_qname(self) -> str | None: + if not self._scope: + return None + return ".".join(self._scope) + + def _add_edge(self, edge_type: str, src_qname: str, dst_ref: str, node, extra: dict | None = None) -> None: + if not dst_ref: + return + src_symbol_id = self._qname_map.get(src_qname, sha256(src_qname.encode("utf-8")).hexdigest()) + dst_symbol_id = self._qname_map.get(dst_ref) + 
edge_id = sha256(f"{self._path}|{src_qname}|{edge_type}|{dst_ref}|{getattr(node, 'lineno', 1)}".encode("utf-8")).hexdigest() + self.edges.append( + PyEdge( + edge_id=edge_id, + edge_type=edge_type, + src_symbol_id=src_symbol_id, + src_qname=src_qname, + dst_symbol_id=dst_symbol_id, + dst_ref=dst_ref, + path=self._path, + start_line=int(getattr(node, "lineno", 1)), + end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))), + resolution="resolved" if dst_symbol_id else "partial", + metadata=extra or {}, + ) + ) + + def _name(self, node) -> str: + if isinstance(node, ast.Name): + return node.id + if isinstance(node, ast.Attribute): + return f"{self._name(node.value)}.{node.attr}" + if isinstance(node, ast.Call): + return self._name(node.func) + return "" diff --git a/app/modules/rag/indexing/code/entrypoints/__pycache__/document_builder.cpython-312.pyc b/app/modules/rag/indexing/code/entrypoints/__pycache__/document_builder.cpython-312.pyc new file mode 100644 index 0000000..4693571 Binary files /dev/null and b/app/modules/rag/indexing/code/entrypoints/__pycache__/document_builder.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/entrypoints/__pycache__/fastapi_detector.cpython-312.pyc b/app/modules/rag/indexing/code/entrypoints/__pycache__/fastapi_detector.cpython-312.pyc new file mode 100644 index 0000000..91effe9 Binary files /dev/null and b/app/modules/rag/indexing/code/entrypoints/__pycache__/fastapi_detector.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/entrypoints/__pycache__/flask_detector.cpython-312.pyc b/app/modules/rag/indexing/code/entrypoints/__pycache__/flask_detector.cpython-312.pyc new file mode 100644 index 0000000..229a71e Binary files /dev/null and b/app/modules/rag/indexing/code/entrypoints/__pycache__/flask_detector.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/entrypoints/__pycache__/registry.cpython-312.pyc 
b/app/modules/rag/indexing/code/entrypoints/__pycache__/registry.cpython-312.pyc new file mode 100644 index 0000000..5e580fa Binary files /dev/null and b/app/modules/rag/indexing/code/entrypoints/__pycache__/registry.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/entrypoints/__pycache__/typer_click_detector.cpython-312.pyc b/app/modules/rag/indexing/code/entrypoints/__pycache__/typer_click_detector.cpython-312.pyc new file mode 100644 index 0000000..97001b7 Binary files /dev/null and b/app/modules/rag/indexing/code/entrypoints/__pycache__/typer_click_detector.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/entrypoints/document_builder.py b/app/modules/rag/indexing/code/entrypoints/document_builder.py new file mode 100644 index 0000000..0315cfe --- /dev/null +++ b/app/modules/rag/indexing/code/entrypoints/document_builder.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan +from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint + + +class EntrypointDocumentBuilder: + def build(self, source: RagSource, entrypoint: Entrypoint) -> RagDocument: + return RagDocument( + layer=RagLayer.CODE_ENTRYPOINTS, + lang="python", + source=source, + title=entrypoint.route_or_command, + text=f"{entrypoint.framework} {entrypoint.entry_type} {entrypoint.route_or_command}", + span=RagSpan(entrypoint.start_line, entrypoint.end_line), + metadata={ + "entry_id": entrypoint.entry_id, + "entry_type": entrypoint.entry_type, + "framework": entrypoint.framework, + "route_or_command": entrypoint.route_or_command, + "handler_symbol_id": entrypoint.handler_symbol_id, + "lang_payload": entrypoint.metadata, + "artifact_type": "CODE", + }, + links=[EvidenceLink(type=EvidenceType.CODE_SPAN, target_id=entrypoint.entry_id, path=source.path, start_line=entrypoint.start_line, end_line=entrypoint.end_line)], + ) diff --git 
a/app/modules/rag/indexing/code/entrypoints/fastapi_detector.py b/app/modules/rag/indexing/code/entrypoints/fastapi_detector.py new file mode 100644 index 0000000..d048f9a --- /dev/null +++ b/app/modules/rag/indexing/code/entrypoints/fastapi_detector.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from hashlib import sha256 + +from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint + + +class FastApiEntrypointDetector: + _METHODS = {"get", "post", "put", "patch", "delete"} + + def detect(self, *, path: str, symbols: list) -> list[Entrypoint]: + items: list[Entrypoint] = [] + for symbol in symbols: + decorators = symbol.decorators or [] + for decorator in decorators: + name = decorator.lower() + tail = name.split(".")[-1] + if tail not in self._METHODS and ".route" not in name: + continue + route = decorator.split("(")[-1].rstrip(")") if "(" in decorator else decorator + items.append( + Entrypoint( + entry_id=sha256(f"{path}|fastapi|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(), + entry_type="http", + framework="fastapi", + route_or_command=route, + handler_symbol_id=symbol.symbol_id, + path=path, + start_line=symbol.start_line, + end_line=symbol.end_line, + metadata={"methods": [tail.upper()] if tail in self._METHODS else []}, + ) + ) + return items diff --git a/app/modules/rag/indexing/code/entrypoints/flask_detector.py b/app/modules/rag/indexing/code/entrypoints/flask_detector.py new file mode 100644 index 0000000..25c0c27 --- /dev/null +++ b/app/modules/rag/indexing/code/entrypoints/flask_detector.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from hashlib import sha256 + +from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint + + +class FlaskEntrypointDetector: + def detect(self, *, path: str, symbols: list) -> list[Entrypoint]: + items: list[Entrypoint] = [] + for symbol in symbols: + for decorator in symbol.decorators or []: + lowered = decorator.lower() + if ".route" not in lowered: + 
continue + items.append( + Entrypoint( + entry_id=sha256(f"{path}|flask|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(), + entry_type="http", + framework="flask", + route_or_command=decorator, + handler_symbol_id=symbol.symbol_id, + path=path, + start_line=symbol.start_line, + end_line=symbol.end_line, + ) + ) + return items diff --git a/app/modules/rag/indexing/code/entrypoints/registry.py b/app/modules/rag/indexing/code/entrypoints/registry.py new file mode 100644 index 0000000..d49f6f8 --- /dev/null +++ b/app/modules/rag/indexing/code/entrypoints/registry.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass(slots=True) +class Entrypoint: + entry_id: str + entry_type: str + framework: str + route_or_command: str + handler_symbol_id: str + path: str + start_line: int + end_line: int + metadata: dict = field(default_factory=dict) + + +class EntrypointDetectorRegistry: + def __init__(self, detectors: list) -> None: + self._detectors = detectors + + def detect_all(self, *, path: str, symbols: list) -> list[Entrypoint]: + items: list[Entrypoint] = [] + for detector in self._detectors: + items.extend(detector.detect(path=path, symbols=symbols)) + return items diff --git a/app/modules/rag/indexing/code/entrypoints/typer_click_detector.py b/app/modules/rag/indexing/code/entrypoints/typer_click_detector.py new file mode 100644 index 0000000..139f948 --- /dev/null +++ b/app/modules/rag/indexing/code/entrypoints/typer_click_detector.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from hashlib import sha256 + +from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint + + +class TyperClickEntrypointDetector: + def detect(self, *, path: str, symbols: list) -> list[Entrypoint]: + items: list[Entrypoint] = [] + for symbol in symbols: + for decorator in symbol.decorators or []: + lowered = decorator.lower() + if ".command" not in lowered and ".callback" not in lowered: + 
continue + framework = "typer" if "typer" in lowered else "click" + items.append( + Entrypoint( + entry_id=sha256(f"{path}|{framework}|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(), + entry_type="cli", + framework=framework, + route_or_command=decorator, + handler_symbol_id=symbol.symbol_id, + path=path, + start_line=symbol.start_line, + end_line=symbol.end_line, + ) + ) + return items diff --git a/app/modules/rag/indexing/code/file_filter.py b/app/modules/rag/indexing/code/file_filter.py new file mode 100644 index 0000000..3c497e6 --- /dev/null +++ b/app/modules/rag/indexing/code/file_filter.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from pathlib import PurePosixPath + + +class PythonFileFilter: + _EXCLUDE_PARTS = {"venv", ".venv", "__pycache__", "node_modules", ".git", "dist", "build"} + + def should_index(self, path: str) -> bool: + candidate = PurePosixPath(path) + if candidate.suffix.lower() != ".py": + return False + return not any(part in self._EXCLUDE_PARTS for part in candidate.parts) diff --git a/app/modules/rag/indexing/code/pipeline.py b/app/modules/rag/indexing/code/pipeline.py new file mode 100644 index 0000000..fe91b01 --- /dev/null +++ b/app/modules/rag/indexing/code/pipeline.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from app.modules.rag.contracts import RagDocument, RagSource +from app.modules.rag.indexing.code.code_text.chunker import CodeTextChunker +from app.modules.rag.indexing.code.code_text.document_builder import CodeTextDocumentBuilder +from app.modules.rag.indexing.code.edges.document_builder import EdgeDocumentBuilder +from app.modules.rag.indexing.code.edges.extractor import EdgeExtractor +from app.modules.rag.indexing.code.entrypoints.document_builder import EntrypointDocumentBuilder +from app.modules.rag.indexing.code.entrypoints.fastapi_detector import FastApiEntrypointDetector +from app.modules.rag.indexing.code.entrypoints.flask_detector import FlaskEntrypointDetector +from 
app.modules.rag.indexing.code.entrypoints.registry import EntrypointDetectorRegistry +from app.modules.rag.indexing.code.entrypoints.typer_click_detector import TyperClickEntrypointDetector +from app.modules.rag.indexing.code.file_filter import PythonFileFilter +from app.modules.rag.indexing.code.symbols.ast_parser import PythonAstParser +from app.modules.rag.indexing.code.symbols.document_builder import SymbolDocumentBuilder +from app.modules.rag.indexing.code.symbols.extractor import SymbolExtractor + + +class CodeIndexingPipeline: + def __init__(self) -> None: + self._filter = PythonFileFilter() + self._chunker = CodeTextChunker() + self._code_builder = CodeTextDocumentBuilder() + self._parser = PythonAstParser() + self._symbols = SymbolExtractor() + self._symbol_builder = SymbolDocumentBuilder() + self._edges = EdgeExtractor() + self._edge_builder = EdgeDocumentBuilder() + self._entrypoints = EntrypointDetectorRegistry( + [FastApiEntrypointDetector(), FlaskEntrypointDetector(), TyperClickEntrypointDetector()] + ) + self._entrypoint_builder = EntrypointDocumentBuilder() + + def supports(self, path: str) -> bool: + return self._filter.should_index(path) + + def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]: + source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path) + docs: list[RagDocument] = [] + code_chunks = self._chunker.chunk(path, content) + for index, chunk in enumerate(code_chunks): + docs.append(self._code_builder.build(source, chunk, chunk_index=index)) + tree = self._parser.parse_module(content) + symbols = self._symbols.extract(path, content, tree) + for symbol in symbols: + docs.append(self._symbol_builder.build(source, symbol)) + edges = self._edges.extract(path, tree, symbols) + for edge in edges: + docs.append(self._edge_builder.build(source, edge)) + for entrypoint in self._entrypoints.detect_all(path=path, symbols=symbols): + docs.append(self._entrypoint_builder.build(source, 
entrypoint)) + return docs diff --git a/app/modules/rag/indexing/code/symbols/__pycache__/ast_parser.cpython-312.pyc b/app/modules/rag/indexing/code/symbols/__pycache__/ast_parser.cpython-312.pyc new file mode 100644 index 0000000..4b0875a Binary files /dev/null and b/app/modules/rag/indexing/code/symbols/__pycache__/ast_parser.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/symbols/__pycache__/document_builder.cpython-312.pyc b/app/modules/rag/indexing/code/symbols/__pycache__/document_builder.cpython-312.pyc new file mode 100644 index 0000000..d32cf32 Binary files /dev/null and b/app/modules/rag/indexing/code/symbols/__pycache__/document_builder.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/symbols/__pycache__/extractor.cpython-312.pyc b/app/modules/rag/indexing/code/symbols/__pycache__/extractor.cpython-312.pyc new file mode 100644 index 0000000..a70e73b Binary files /dev/null and b/app/modules/rag/indexing/code/symbols/__pycache__/extractor.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/code/symbols/ast_parser.py b/app/modules/rag/indexing/code/symbols/ast_parser.py new file mode 100644 index 0000000..4f481c5 --- /dev/null +++ b/app/modules/rag/indexing/code/symbols/ast_parser.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +import ast + + +class PythonAstParser: + def parse_module(self, text: str) -> ast.AST | None: + try: + return ast.parse(text) + except SyntaxError: + return None diff --git a/app/modules/rag/indexing/code/symbols/document_builder.py b/app/modules/rag/indexing/code/symbols/document_builder.py new file mode 100644 index 0000000..22085cc --- /dev/null +++ b/app/modules/rag/indexing/code/symbols/document_builder.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan +from app.modules.rag.indexing.code.symbols.extractor import PySymbol + + +class SymbolDocumentBuilder: + def build(self, source: 
RagSource, symbol: PySymbol) -> RagDocument: + body = [f"{symbol.kind} {symbol.qname}", symbol.signature] + if symbol.docstring: + body.append(symbol.docstring.strip()) + return RagDocument( + layer=RagLayer.CODE_SYMBOL_CATALOG, + lang="python", + source=source, + title=symbol.qname, + text="\n".join(part for part in body if part), + span=RagSpan(symbol.start_line, symbol.end_line), + metadata={ + "symbol_id": symbol.symbol_id, + "qname": symbol.qname, + "kind": symbol.kind, + "signature": symbol.signature, + "decorators_or_annotations": symbol.decorators, + "docstring_or_javadoc": symbol.docstring, + "parent_symbol_id": symbol.parent_symbol_id, + "package_or_module": source.path.replace("/", ".").removesuffix(".py"), + "is_entry_candidate": bool(symbol.decorators), + "lang_payload": symbol.lang_payload, + "artifact_type": "CODE", + }, + ) diff --git a/app/modules/rag/indexing/code/symbols/extractor.py b/app/modules/rag/indexing/code/symbols/extractor.py new file mode 100644 index 0000000..8142699 --- /dev/null +++ b/app/modules/rag/indexing/code/symbols/extractor.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +import ast +from dataclasses import dataclass, field +from hashlib import sha256 + + +@dataclass(slots=True) +class PySymbol: + symbol_id: str + qname: str + kind: str + path: str + start_line: int + end_line: int + signature: str + decorators: list[str] = field(default_factory=list) + docstring: str | None = None + parent_symbol_id: str | None = None + lang_payload: dict = field(default_factory=dict) + + +class SymbolExtractor: + def extract(self, path: str, text: str, ast_tree: ast.AST | None) -> list[PySymbol]: + if ast_tree is None: + return [] + collector = _SymbolVisitor(path) + collector.visit(ast_tree) + return collector.symbols + + +class _SymbolVisitor(ast.NodeVisitor): + def __init__(self, path: str) -> None: + self._path = path + self._stack: list[tuple[str, str]] = [] + self.symbols: list[PySymbol] = [] + + def visit_ImportFrom(self, 
node: ast.ImportFrom) -> None: + if self._stack: + return + module = node.module or "" + for item in node.names: + local_name = item.asname or item.name + imported_name = f"{module}.{item.name}".strip(".") + self.symbols.append( + PySymbol( + symbol_id=sha256(f"{self._path}|{local_name}|import_alias".encode("utf-8")).hexdigest(), + qname=local_name, + kind="const", + path=self._path, + start_line=int(getattr(node, "lineno", 1)), + end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))), + signature=f"{local_name} = {imported_name}", + lang_payload={"imported_from": imported_name, "import_alias": True}, + ) + ) + self.generic_visit(node) + + def visit_Import(self, node: ast.Import) -> None: + if self._stack: + return + for item in node.names: + local_name = item.asname or item.name + self.symbols.append( + PySymbol( + symbol_id=sha256(f"{self._path}|{local_name}|import".encode("utf-8")).hexdigest(), + qname=local_name, + kind="const", + path=self._path, + start_line=int(getattr(node, "lineno", 1)), + end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))), + signature=f"import {item.name}", + lang_payload={"imported_from": item.name, "import_alias": bool(item.asname)}, + ) + ) + self.generic_visit(node) + + def visit_ClassDef(self, node: ast.ClassDef) -> None: + self._add_symbol(node, "class", {"bases": [self._expr_name(base) for base in node.bases]}) + self.generic_visit(node) + self._stack.pop() + + def visit_FunctionDef(self, node: ast.FunctionDef) -> None: + self._add_function(node, is_async=False) + + def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None: + self._add_function(node, is_async=True) + + def _add_function(self, node, *, is_async: bool) -> None: + kind = "method" if self._stack and self._stack[-1][0] == "class" else "function" + self._add_symbol(node, kind, {"async": is_async}) + self.generic_visit(node) + self._stack.pop() + + def _add_symbol(self, node, kind: str, lang_payload: dict) -> None: + names = 
[name for _, name in self._stack] + [node.name] + qname = ".".join(names) + symbol_id = sha256(f"{self._path}|{qname}|{kind}".encode("utf-8")).hexdigest() + signature = self._signature(node) + symbol = PySymbol( + symbol_id=symbol_id, + qname=qname, + kind=kind, + path=self._path, + start_line=int(getattr(node, "lineno", 1)), + end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))), + signature=signature, + decorators=[self._expr_name(item) for item in getattr(node, "decorator_list", [])], + docstring=ast.get_docstring(node), + parent_symbol_id=self._stack[-1][1] if self._stack else None, + lang_payload=lang_payload, + ) + self.symbols.append(symbol) + self._stack.append((kind, qname)) + + def _signature(self, node) -> str: + if isinstance(node, ast.ClassDef): + bases = ", ".join(self._expr_name(base) for base in node.bases) + return f"{node.name}({bases})" if bases else node.name + args = [arg.arg for arg in getattr(node.args, "args", [])] + return f"{node.name}({', '.join(args)})" + + def _expr_name(self, node) -> str: + if isinstance(node, ast.Name): + return node.id + if isinstance(node, ast.Attribute): + return f"{self._expr_name(node.value)}.{node.attr}" + if isinstance(node, ast.Call): + return self._expr_name(node.func) + return ast.dump(node, include_attributes=False) diff --git a/app/modules/rag/indexing/common/document_upserter.py b/app/modules/rag/indexing/common/document_upserter.py new file mode 100644 index 0000000..7bf2dba --- /dev/null +++ b/app/modules/rag/indexing/common/document_upserter.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +from app.modules.rag.contracts import RagDocument +from app.modules.rag.persistence.repository import RagRepository + + +class RagDocumentUpserter: + def __init__(self, repository: RagRepository) -> None: + self._repository = repository + + def replace(self, rag_session_id: str, docs: list[RagDocument]) -> None: + self._repository.replace_documents(rag_session_id, docs) + + def 
apply_changes(self, rag_session_id: str, delete_paths: list[str], docs: list[RagDocument]) -> None: + self._repository.apply_document_changes(rag_session_id, delete_paths, docs) diff --git a/app/modules/rag/indexing/common/report.py b/app/modules/rag/indexing/common/report.py new file mode 100644 index 0000000..53da748 --- /dev/null +++ b/app/modules/rag/indexing/common/report.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass(slots=True) +class IndexReport: + indexed_files: int = 0 + failed_files: int = 0 + cache_hit_files: int = 0 + cache_miss_files: int = 0 + documents: int = 0 + warnings: list[str] = field(default_factory=list) + + def as_tuple(self) -> tuple[int, int, int, int]: + return ( + self.indexed_files, + self.failed_files, + self.cache_hit_files, + self.cache_miss_files, + ) diff --git a/app/modules/rag/indexing/docs/__pycache__/classifier.cpython-312.pyc b/app/modules/rag/indexing/docs/__pycache__/classifier.cpython-312.pyc new file mode 100644 index 0000000..2669216 Binary files /dev/null and b/app/modules/rag/indexing/docs/__pycache__/classifier.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/docs/__pycache__/document_builder.cpython-312.pyc b/app/modules/rag/indexing/docs/__pycache__/document_builder.cpython-312.pyc new file mode 100644 index 0000000..3ea98d1 Binary files /dev/null and b/app/modules/rag/indexing/docs/__pycache__/document_builder.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/docs/__pycache__/file_filter.cpython-312.pyc b/app/modules/rag/indexing/docs/__pycache__/file_filter.cpython-312.pyc new file mode 100644 index 0000000..2e1d398 Binary files /dev/null and b/app/modules/rag/indexing/docs/__pycache__/file_filter.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/docs/__pycache__/pipeline.cpython-312.pyc b/app/modules/rag/indexing/docs/__pycache__/pipeline.cpython-312.pyc new file mode 100644 index 0000000..e3267d8 Binary files 
/dev/null and b/app/modules/rag/indexing/docs/__pycache__/pipeline.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/docs/chunkers/__pycache__/markdown_chunker.cpython-312.pyc b/app/modules/rag/indexing/docs/chunkers/__pycache__/markdown_chunker.cpython-312.pyc new file mode 100644 index 0000000..3932132 Binary files /dev/null and b/app/modules/rag/indexing/docs/chunkers/__pycache__/markdown_chunker.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/docs/chunkers/__pycache__/text_chunker.cpython-312.pyc b/app/modules/rag/indexing/docs/chunkers/__pycache__/text_chunker.cpython-312.pyc new file mode 100644 index 0000000..9f200ab Binary files /dev/null and b/app/modules/rag/indexing/docs/chunkers/__pycache__/text_chunker.cpython-312.pyc differ diff --git a/app/modules/rag/indexing/docs/chunkers/markdown_chunker.py b/app/modules/rag/indexing/docs/chunkers/markdown_chunker.py new file mode 100644 index 0000000..2c218e5 --- /dev/null +++ b/app/modules/rag/indexing/docs/chunkers/markdown_chunker.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker + + +@dataclass(slots=True) +class SectionChunk: + section_path: str + section_title: str + content: str + order: int + + +class MarkdownDocChunker: + def __init__(self, text_chunker: DocTextChunker | None = None) -> None: + self._fallback = text_chunker or DocTextChunker() + + def chunk(self, text: str) -> list[SectionChunk]: + lines = text.splitlines() + sections: list[SectionChunk] = [] + stack: list[tuple[int, str]] = [] + current_title = "Document" + current_lines: list[str] = [] + order = 0 + for line in lines: + heading = self._heading(line) + if heading is None: + current_lines.append(line) + continue + self._flush_section(sections, stack, current_title, current_lines, order) + order += 1 + level, title = heading + stack = [item for item in stack if item[0] < level] + 
stack.append((level, title)) + current_title = title + current_lines = [] + self._flush_section(sections, stack, current_title, current_lines, order) + if sections: + return sections + chunks = self._fallback.split(text) + return [ + SectionChunk(section_path="Document", section_title="Document", content=chunk, order=index) + for index, chunk in enumerate(chunks) + ] + + def _flush_section( + self, + sections: list[SectionChunk], + stack: list[tuple[int, str]], + current_title: str, + current_lines: list[str], + order: int, + ) -> None: + content = "\n".join(current_lines).strip() + if not content: + return + titles = [title for _, title in stack] or [current_title] + sections.append( + SectionChunk( + section_path=" > ".join(titles), + section_title=titles[-1], + content=content, + order=order, + ) + ) + + def _heading(self, line: str) -> tuple[int, str] | None: + stripped = line.strip() + if not stripped.startswith("#"): + return None + level = len(stripped) - len(stripped.lstrip("#")) + title = stripped[level:].strip() + if not title: + return None + return level, title diff --git a/app/modules/rag/indexing/docs/chunkers/text_chunker.py b/app/modules/rag/indexing/docs/chunkers/text_chunker.py new file mode 100644 index 0000000..c0df728 --- /dev/null +++ b/app/modules/rag/indexing/docs/chunkers/text_chunker.py @@ -0,0 +1,21 @@ +from __future__ import annotations + + +class DocTextChunker: + def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None: + self._max_chars = max_chars + self._overlap_chars = overlap_chars + + def split(self, text: str) -> list[str]: + cleaned = text.strip() + if not cleaned: + return [] + chunks: list[str] = [] + start = 0 + while start < len(cleaned): + end = min(len(cleaned), start + self._max_chars) + chunks.append(cleaned[start:end].strip()) + if end >= len(cleaned): + break + start = max(0, end - self._overlap_chars) + return [chunk for chunk in chunks if chunk] diff --git 
class DocsClassifier:
    """Assign a coarse DocKind to a documentation file based on its path."""

    # Tokens come from splitting on non-alphanumerics, so "inspect" no longer
    # matches "spec" the way a plain substring test would.
    _TOKEN_RE = re.compile(r"[^a-z0-9]+")

    def classify(self, path: str) -> str:
        upper = PurePosixPath(path).name.upper()
        tokens = [token for token in self._TOKEN_RE.split(path.lower()) if token]
        if any(token.startswith("runbook") for token in tokens) or upper.startswith("RUNBOOK"):
            return DocKind.RUNBOOK
        if upper.startswith("README"):
            return DocKind.README
        if any(token.startswith(("spec", "architecture")) for token in tokens):
            return DocKind.SPEC
        return DocKind.MISC


class DocsDocumentBuilder:
    """Build RagDocument records for the docs layers (catalog, section, policy, fact)."""

    def build_module_catalog(self, source: RagSource, frontmatter: dict, summary_text: str, doc_kind: str) -> RagDocument | None:
        """Return a module-catalog document, or None when the frontmatter
        lacks any of the mandatory id/type/domain fields."""
        module_id = str(frontmatter.get("id") or "").strip()
        module_type = str(frontmatter.get("type") or "").strip()
        domain = str(frontmatter.get("domain") or "").strip()
        if not module_id or not module_type or not domain:
            return None
        links = frontmatter.get("links") or {}
        metadata = {
            "module_id": module_id,
            "type": module_type,
            "domain": domain,
            "status": frontmatter.get("status"),
            "version": frontmatter.get("version"),
            "tags": frontmatter.get("tags") or [],
            "owners": frontmatter.get("owners") or [],
            "links": links,
            "source_path": source.path,
            "summary_text": summary_text[:4000],
            "doc_kind": doc_kind,
        }
        # Promote each known relation list to a top-level metadata key.
        metadata.update({name: links.get(name, []) for name in (
            "calls_api", "called_by", "uses_logic", "used_by", "reads_db", "writes_db",
            "integrates_with", "emits_events", "consumes_events",
        )})
        return RagDocument(
            layer=RagLayer.DOCS_MODULE_CATALOG,
            source=source,
            title=module_id,
            text=summary_text[:4000] or module_id,
            metadata=metadata,
        )

    def build_section(self, source: RagSource, chunk: SectionChunk, frontmatter: dict, doc_kind: str) -> RagDocument:
        """Return a section-index document for one markdown section."""
        module_id = str(frontmatter.get("id") or source.path)
        metadata = {
            "module_id": module_id,
            "type": frontmatter.get("type"),
            "domain": frontmatter.get("domain"),
            "tags": frontmatter.get("tags") or [],
            "section_path": chunk.section_path,
            "section_title": chunk.section_title,
            "order": chunk.order,
            "doc_kind": doc_kind,
            "source_path": source.path,
            "artifact_type": "DOCS",
        }
        return RagDocument(
            layer=RagLayer.DOCS_SECTION_INDEX,
            source=source,
            title=f"{module_id}:{chunk.section_title}",
            text=chunk.content,
            metadata=metadata,
        )

    def build_policy(self, source: RagSource, frontmatter: dict, chunk: SectionChunk, doc_kind: str) -> RagDocument | None:
        """Return a policy-index document, or None when frontmatter has no id."""
        policy_id = str(frontmatter.get("id") or "").strip()
        applies_to = frontmatter.get("applies_to") or frontmatter.get("type")
        if not policy_id:
            return None
        metadata = {
            "policy_id": policy_id,
            "applies_to": applies_to,
            "rules": chunk.content[:4000],
            "default_behaviors": frontmatter.get("default_behaviors") or [],
            "doc_kind": doc_kind,
            "section_path": chunk.section_path,
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_POLICY_INDEX,
            source=source,
            title=policy_id,
            text=chunk.content[:4000],
            metadata=metadata,
        )

    def build_fact(
        self,
        source: RagSource,
        *,
        subject_id: str,
        predicate: str,
        obj: str,
        object_ref: str | None,
        anchor: str,
        tags: list[str] | None = None,
    ) -> RagDocument:
        """Return a fact-index document for one (subject, predicate, object) triple.

        The fact id is a content hash of the triple plus its location, so the
        same fact re-extracted from the same place is stable across runs.
        """
        fact_id = sha256(f"{subject_id}|{predicate}|{obj}|{source.path}|{anchor}".encode("utf-8")).hexdigest()
        metadata = {
            "fact_id": fact_id,
            "subject_id": subject_id,
            "predicate": predicate,
            "object": obj,
            "object_ref": object_ref,
            "anchor": anchor,
            "tags": tags or [],
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_FACT_INDEX,
            source=source,
            title=f"{subject_id}:{predicate}",
            text=f"{subject_id} {predicate} {obj}".strip(),
            metadata=metadata,
            links=[EvidenceLink(type=EvidenceType.DOC_FACT, target_id=fact_id, path=source.path, note=anchor)],
        )


class DocsFileFilter:
    """Decide whether a repository path looks like documentation worth indexing."""

    _EXTENSIONS = {".md", ".rst", ".txt", ".adoc"}
    _NAMES = ("README", "CHANGELOG", "CONTRIBUTING", "ARCHITECTURE", "SPEC", "RUNBOOK")
    _EXCLUDE_PARTS = {"node_modules", ".git", "vendor", "dist", "build", "target", "__pycache__"}
    _DOC_DIRS = {"docs", "doc", "documentation"}

    def should_index(self, path: str) -> bool:
        candidate = PurePosixPath(path)
        if any(part in self._EXCLUDE_PARTS for part in candidate.parts):
            return False
        if candidate.suffix.lower() in self._EXTENSIONS:
            return True
        upper_name = candidate.name.upper()
        if any(upper_name.startswith(prefix) for prefix in self._NAMES):
            return True
        # Exact directory-name match: the previous substring test wrongly
        # accepted paths such as "src/mydocs/file.bin".
        return any(part.lower() in self._DOC_DIRS for part in candidate.parts[:-1])
class DocsIndexingPipeline:
    """Turn one documentation file into layered RAG documents.

    Produces, per file: an optional module-catalog document, one section
    document per markdown section, optional policy documents, and fact
    documents extracted from frontmatter links, tables and bullet lists.
    """

    # Recognizes "-", "*" and any "N." numbered bullet (the old tuple check
    # only handled 1./2./3.).
    _BULLET_RE = re.compile(r"^(?:[-*]|\d+\.)")

    def __init__(self) -> None:
        self._filter = DocsFileFilter()
        self._classifier = DocsClassifier()
        self._chunker = MarkdownDocChunker()
        self._builder = DocsDocumentBuilder()

    def supports(self, path: str) -> bool:
        """True when the docs file filter accepts this path."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Build every RAG document derivable from a single docs file."""
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        frontmatter, body = self._split_frontmatter(content)
        doc_kind = self._classifier.classify(path)
        sections = self._chunker.chunk(body)
        summary_text = self._summary_from_sections(sections)
        docs: list[RagDocument] = []
        module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind)
        if module_doc is not None:
            docs.append(module_doc)
        for section in sections:
            docs.append(self._builder.build_section(source, section, frontmatter, doc_kind))
        if str(frontmatter.get("type") or "").strip() == "policy":
            # Only the first section carries the policy rules.
            for section in sections[:1]:
                policy = self._builder.build_policy(source, frontmatter, section, doc_kind)
                if policy is not None:
                    docs.append(policy)
        docs.extend(self._extract_facts(source, frontmatter, sections))
        return docs

    def _split_frontmatter(self, content: str) -> tuple[dict, str]:
        """Parse an optional YAML frontmatter block.

        Returns ({}, content) when there is no frontmatter, when the closing
        "---" delimiter is missing, or when the YAML is malformed — a broken
        header must not abort indexing of the whole file.
        """
        if not content.startswith("---\n"):
            return {}, content
        parts = content.split("---", 2)
        if len(parts) < 3:
            # Opening delimiter without a closing one.
            return {}, content
        _, raw, body = parts
        try:
            payload = yaml.safe_load(raw) or {}
        except yaml.YAMLError:
            return {}, content
        return (payload if isinstance(payload, dict) else {}, body.strip())

    def _summary_from_sections(self, sections) -> str:
        """Concatenate the first two sections into a capped summary."""
        text = "\n\n".join(section.content for section in sections[:2]).strip()
        return text[:4000]

    def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]:
        """Collect facts from frontmatter links plus table/list content."""
        subject_id = str(frontmatter.get("id") or source.path)
        docs: list[RagDocument] = []
        links = frontmatter.get("links") or {}
        for predicate, values in links.items():
            for value in values or []:
                docs.append(
                    self._builder.build_fact(
                        source,
                        subject_id=subject_id,
                        predicate=predicate,
                        obj=str(value),
                        object_ref=str(value),
                        anchor="frontmatter.links",
                    )
                )
        for section in sections:
            docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content))
            docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content))
        return docs

    def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Parse a markdown pipe table: header row, separator row, data rows."""
        lines = [line.strip() for line in content.splitlines() if line.strip()]
        if len(lines) < 3 or "|" not in lines[0]:
            return []
        headers = [part.strip().lower() for part in lines[0].strip("|").split("|")]
        if not all(headers):
            return []
        docs: list[RagDocument] = []
        for row in lines[2:]:
            if "|" not in row:
                continue
            values = [part.strip() for part in row.strip("|").split("|")]
            if len(values) != len(headers):
                # Ragged row — cannot align cells to headers reliably.
                continue
            payload = dict(zip(headers, values))
            docs.extend(self._facts_from_row(source, subject_id, title, payload))
        return docs

    def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]:
        """Translate one table row into facts, keyed off the section title."""
        docs: list[RagDocument] = []
        name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code")
        if "request" in title.lower() or "response" in title.lower():
            if name:
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_field", obj=name, object_ref=None, anchor=title))
                if payload.get("required"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_required", obj=f"{name}:{payload['required']}", object_ref=None, anchor=title))
                if payload.get("type"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_type", obj=f"{name}:{payload['type']}", object_ref=None, anchor=title))
                if payload.get("validation"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_validation", obj=f"{name}:{payload['validation']}", object_ref=None, anchor=title))
        if "error" in title.lower():
            if payload.get("status"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_status", obj=payload["status"], object_ref=None, anchor=title))
            if payload.get("error") or payload.get("code"):
                error_value = payload.get("error") or payload.get("code")
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_error", obj=error_value, object_ref=None, anchor=title))
            if payload.get("client action"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="client_action", obj=payload["client action"], object_ref=None, anchor=title))
        if "constraint" in title.lower() and name:
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_constraint", obj=name, object_ref=None, anchor=title))
        return docs

    def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Translate bullet-list items into facts based on known prefixes."""
        docs: list[RagDocument] = []
        for line in content.splitlines():
            item = line.strip()
            if not self._BULLET_RE.match(item):
                continue
            normalized = re.sub(r"^[-*0-9. ]+", "", item).strip()
            if not normalized:
                # Bare bullet or horizontal rule — nothing to record.
                continue
            lowered = normalized.lower()
            if lowered.startswith("metric:"):
                predicate = "emits_metric"
            elif lowered.startswith("event:"):
                predicate = "emits_analytics_event"
            elif lowered.startswith("log:"):
                predicate = "logs_event"
            else:
                predicate = "validates_rule" if "rule" in title.lower() else "client_action"
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title))
        return docs
class RagCacheRepository:
    """Blob-keyed cache of chunked/embedded documents (rag_blob_cache + rag_chunk_cache)."""

    def get_cached_documents(self, repo_id: str, blob_sha: str) -> list[RagDocument]:
        """Rehydrate the cached RagDocuments for one blob, in chunk order."""
        with self._engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT layer, lang, path, title, content, metadata_json, links_json, span_start, span_end,
                           repo_id, commit_sha, embedding::text AS embedding_txt
                    FROM rag_chunk_cache
                    WHERE repo_id = :repo_id AND blob_sha = :blob_sha
                    ORDER BY chunk_index ASC
                    """
                ),
                {"repo_id": repo_id, "blob_sha": blob_sha},
            ).mappings().fetchall()
        docs: list[RagDocument] = []
        for row in rows:
            metadata = self._loads(row.get("metadata_json"))
            docs.append(
                RagDocument(
                    layer=str(row["layer"]),
                    lang=row.get("lang"),
                    source=RagSource(
                        repo_id=str(row["repo_id"]),
                        commit_sha=row.get("commit_sha"),
                        path=str(row["path"]),
                    ),
                    title=str(row["title"] or row["path"]),
                    text=str(row["content"] or ""),
                    metadata=metadata,
                    links=[EvidenceLink(**item) for item in self._loads(row.get("links_json"), default=[])],
                    span=RagSpan(row.get("span_start"), row.get("span_end")),
                    embedding=self._parse_vector(str(row["embedding_txt"] or "")),
                )
            )
        return docs

    def cache_documents(self, repo_id: str, path: str, blob_sha: str, docs: list[RagDocument]) -> None:
        """Upsert the blob-level row, then rewrite every chunk row for the blob.

        The blob row is described by the FIRST document's metadata; chunk rows
        are deleted and re-inserted so the cache never holds stale chunks.
        """
        if not docs:
            return
        with self._engine().connect() as conn:
            first = docs[0].to_record()
            first_meta = first["metadata"]
            conn.execute(
                text(
                    """
                    INSERT INTO rag_blob_cache (
                        repo_id, blob_sha, path, artifact_type, section, doc_id, doc_version, owner,
                        system_component, last_modified, staleness_score, layer, lang, metadata_json
                    )
                    VALUES (
                        :repo_id, :blob_sha, :path, :artifact_type, :section, :doc_id, :doc_version, :owner,
                        :system_component, :last_modified, :staleness_score, :layer, :lang, :metadata_json
                    )
                    ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
                        artifact_type = EXCLUDED.artifact_type,
                        section = EXCLUDED.section,
                        doc_id = EXCLUDED.doc_id,
                        doc_version = EXCLUDED.doc_version,
                        owner = EXCLUDED.owner,
                        system_component = EXCLUDED.system_component,
                        last_modified = EXCLUDED.last_modified,
                        staleness_score = EXCLUDED.staleness_score,
                        layer = EXCLUDED.layer,
                        lang = EXCLUDED.lang,
                        metadata_json = EXCLUDED.metadata_json,
                        updated_at = CURRENT_TIMESTAMP
                    """
                ),
                {
                    "repo_id": repo_id,
                    "blob_sha": blob_sha,
                    "path": path,
                    "artifact_type": first_meta.get("artifact_type"),
                    "section": first_meta.get("section") or first_meta.get("section_title"),
                    "doc_id": first_meta.get("doc_id"),
                    "doc_version": first_meta.get("doc_version"),
                    "owner": first_meta.get("owner"),
                    "system_component": first_meta.get("system_component"),
                    "last_modified": first_meta.get("last_modified"),
                    "staleness_score": first_meta.get("staleness_score"),
                    "layer": first["layer"],
                    "lang": first["lang"],
                    "metadata_json": json.dumps(first_meta, ensure_ascii=True),
                },
            )
            conn.execute(
                text("DELETE FROM rag_chunk_cache WHERE repo_id = :repo_id AND blob_sha = :blob_sha"),
                {"repo_id": repo_id, "blob_sha": blob_sha},
            )
            for idx, doc in enumerate(docs):
                row = doc.to_record()
                metadata = row["metadata"]
                emb = row["embedding"] or []
                # pgvector accepts the "[x,y,...]" text form via CAST(... AS vector).
                emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_chunk_cache (
                            repo_id, blob_sha, chunk_index, content, embedding, section, layer, lang, path, title,
                            metadata_json, links_json, span_start, span_end, commit_sha
                        )
                        VALUES (
                            :repo_id, :blob_sha, :chunk_index, :content, CAST(:embedding AS vector), :section, :layer,
                            :lang, :path, :title, :metadata_json, :links_json, :span_start, :span_end, :commit_sha
                        )
                        """
                    ),
                    {
                        "repo_id": repo_id,
                        "blob_sha": blob_sha,
                        "chunk_index": idx,
                        "content": row["text"],
                        "embedding": emb_str,
                        "section": metadata.get("section") or metadata.get("section_title"),
                        "layer": row["layer"],
                        "lang": row["lang"],
                        "path": row["path"],
                        "title": row["title"],
                        "metadata_json": json.dumps(metadata, ensure_ascii=True),
                        "links_json": json.dumps(row["links"], ensure_ascii=True),
                        "span_start": row["span_start"],
                        "span_end": row["span_end"],
                        "commit_sha": row["commit_sha"],
                    },
                )
            conn.commit()

    def record_repo_cache(
        self,
        *,
        project_id: str,
        commit_sha: str | None,
        changed_files: list[str],
        summary: str,
    ) -> None:
        """Record a lightweight cache entry per file changed by a repo webhook."""
        docs: list[RagDocument] = []
        for idx, path in enumerate(changed_files):
            docs.append(
                RagDocument(
                    layer=RagLayer.CODE_SOURCE_CHUNKS,
                    lang="python" if path.endswith(".py") else None,
                    # Keyword construction, consistent with the rest of the module.
                    source=RagSource(repo_id=project_id, commit_sha=commit_sha, path=path),
                    title=path,
                    text=f"repo_webhook:{path}:{summary[:300]}",
                    metadata={"chunk_index": idx, "artifact_type": "CODE", "section": "repo_webhook"},
                )
            )
        for doc in docs:
            blob_sha = self._blob_sha(commit_sha, doc.source.path)
            doc.metadata["blob_sha"] = blob_sha
            self.cache_documents(project_id, doc.source.path, blob_sha, [doc])

    def _blob_sha(self, commit_sha: str | None, path: str) -> str:
        """Deterministic synthetic blob id for (commit, path)."""
        from hashlib import sha256

        return sha256(f"{commit_sha or 'no-commit'}:{path}".encode("utf-8")).hexdigest()

    def _engine(self):
        # Imported lazily to avoid a module-import cycle with shared.db.
        from app.modules.shared.db import get_engine

        return get_engine()

    def _loads(self, value, default=None):
        """json.loads with a falsy-input fallback (default: empty dict)."""
        if default is None:
            default = {}
        if not value:
            return default
        return json.loads(str(value))

    def _parse_vector(self, value: str) -> list[float]:
        """Parse pgvector's "[x,y,...]" text representation into floats."""
        text_value = value.strip()
        if not text_value:
            return []
        if text_value.startswith("[") and text_value.endswith("]"):
            text_value = text_value[1:-1]
        if not text_value:
            return []
        return [float(part.strip()) for part in text_value.split(",") if part.strip()]
class RagDocumentRepository:
    """Session-scoped chunk storage (rag_chunks + rag_session_chunk_map).

    All methods take an open SQLAlchemy connection so the caller controls the
    transaction boundary (commit/rollback happens in the facade).
    """

    def replace_documents(self, conn, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Drop every chunk for the session, then insert the given documents."""
        conn.execute(text("DELETE FROM rag_chunks WHERE rag_session_id = :sid"), {"sid": rag_session_id})
        conn.execute(text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid"), {"sid": rag_session_id})
        self.insert_documents(conn, rag_session_id, docs)

    def apply_document_changes(
        self,
        conn,
        rag_session_id: str,
        delete_paths: list[str],
        docs: list[RagDocument],
    ) -> None:
        """Incremental update: remove deleted paths, then re-insert changed paths."""
        if delete_paths:
            conn.execute(
                text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                {"sid": rag_session_id, "paths": delete_paths},
            )
            conn.execute(
                text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
                {"sid": rag_session_id, "paths": delete_paths},
            )
        if not docs:
            return
        # Remove stale chunks for every path being re-indexed before inserting.
        paths = sorted({doc.source.path for doc in docs})
        conn.execute(
            text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
            {"sid": rag_session_id, "paths": paths},
        )
        conn.execute(
            text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
            {"sid": rag_session_id, "paths": paths},
        )
        self.insert_documents(conn, rag_session_id, docs)

    def insert_documents(self, conn, rag_session_id: str, docs: list[RagDocument]) -> None:
        """Insert each document as one rag_chunks row (+ cache-map row when possible)."""
        for doc in docs:
            row = doc.to_record()
            metadata = row["metadata"]
            links = row["links"]
            emb = row["embedding"] or []
            emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
            # "or 0" also covers an explicit None stored under chunk_index,
            # which int(metadata.get("chunk_index", 0)) would crash on.
            chunk_index = int(metadata.get("chunk_index") or 0)
            conn.execute(
                text(
                    """
                    INSERT INTO rag_chunks (
                        rag_session_id, path, chunk_index, content, embedding, artifact_type, section, doc_id,
                        doc_version, owner, system_component, last_modified, staleness_score, created_at, updated_at,
                        rag_doc_id, layer, lang, repo_id, commit_sha, title, metadata_json, links_json, span_start,
                        span_end, symbol_id, qname, kind, framework, entrypoint_type, module_id, section_path, doc_kind
                    )
                    VALUES (
                        :sid, :path, :chunk_index, :content, CAST(:emb AS vector), :artifact_type, :section, :doc_id,
                        :doc_version, :owner, :system_component, :last_modified, :staleness_score, CURRENT_TIMESTAMP,
                        CURRENT_TIMESTAMP, :rag_doc_id, :layer, :lang, :repo_id, :commit_sha, :title, :metadata_json,
                        :links_json, :span_start, :span_end, :symbol_id, :qname, :kind, :framework, :entrypoint_type,
                        :module_id, :section_path, :doc_kind
                    )
                    """
                ),
                {
                    "sid": rag_session_id,
                    "path": row["path"],
                    "chunk_index": chunk_index,
                    "content": row["text"],
                    "emb": emb_str,
                    "artifact_type": metadata.get("artifact_type"),
                    "section": metadata.get("section") or metadata.get("section_title"),
                    "doc_id": metadata.get("doc_id"),
                    "doc_version": metadata.get("doc_version"),
                    "owner": metadata.get("owner"),
                    "system_component": metadata.get("system_component"),
                    "last_modified": metadata.get("last_modified"),
                    "staleness_score": metadata.get("staleness_score"),
                    "rag_doc_id": row["doc_id"],
                    "layer": row["layer"],
                    "lang": row["lang"],
                    "repo_id": row["repo_id"],
                    "commit_sha": row["commit_sha"],
                    "title": row["title"],
                    "metadata_json": json.dumps(metadata, ensure_ascii=True),
                    "links_json": json.dumps(links, ensure_ascii=True),
                    "span_start": row["span_start"],
                    "span_end": row["span_end"],
                    "symbol_id": metadata.get("symbol_id"),
                    "qname": metadata.get("qname"),
                    "kind": metadata.get("kind") or metadata.get("type"),
                    "framework": metadata.get("framework"),
                    "entrypoint_type": metadata.get("entry_type") or metadata.get("entrypoint_type"),
                    "module_id": metadata.get("module_id") or metadata.get("policy_id"),
                    "section_path": metadata.get("section_path"),
                    "doc_kind": metadata.get("doc_kind"),
                },
            )
            # Map the session chunk back to the blob cache only when the
            # document carries both identifiers.
            repo_id = str(row["repo_id"] or "").strip()
            blob_sha = str(metadata.get("blob_sha") or "").strip()
            if repo_id and blob_sha:
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_session_chunk_map (
                            rag_session_id, repo_id, blob_sha, chunk_index, path
                        ) VALUES (:sid, :repo_id, :blob_sha, :chunk_index, :path)
                        """
                    ),
                    {
                        "sid": rag_session_id,
                        "repo_id": repo_id,
                        "blob_sha": blob_sha,
                        "chunk_index": chunk_index,
                        "path": row["path"],
                    },
                )
@dataclass
class RagJobRow:
    """One row of rag_index_jobs, as returned by RagJobRepository.get_job."""

    index_job_id: str
    rag_session_id: str
    status: str
    indexed_files: int
    failed_files: int
    cache_hit_files: int
    cache_miss_files: int
    error_code: str | None
    error_desc: str | None
    error_module: str | None


class RagJobRepository:
    """Create, update and read rows of the rag_index_jobs status table."""

    _INSERT_SQL = """
        INSERT INTO rag_index_jobs (index_job_id, rag_session_id, status)
        VALUES (:jid, :sid, :status)
    """

    _UPDATE_SQL = """
        UPDATE rag_index_jobs
        SET status = :status,
            indexed_files = :indexed,
            failed_files = :failed,
            cache_hit_files = :cache_hit_files,
            cache_miss_files = :cache_miss_files,
            error_code = :ecode,
            error_desc = :edesc,
            error_module = :emodule,
            updated_at = CURRENT_TIMESTAMP
        WHERE index_job_id = :jid
    """

    _SELECT_SQL = """
        SELECT index_job_id, rag_session_id, status, indexed_files, failed_files,
               cache_hit_files, cache_miss_files, error_code, error_desc, error_module
        FROM rag_index_jobs
        WHERE index_job_id = :jid
    """

    def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None:
        """Insert a new job row in its initial status."""
        payload = {"jid": index_job_id, "sid": rag_session_id, "status": status}
        with get_engine().connect() as connection:
            connection.execute(text(self._INSERT_SQL), payload)
            connection.commit()

    def update_job(
        self,
        index_job_id: str,
        *,
        status: str,
        indexed_files: int,
        failed_files: int,
        cache_hit_files: int = 0,
        cache_miss_files: int = 0,
        error_code: str | None = None,
        error_desc: str | None = None,
        error_module: str | None = None,
    ) -> None:
        """Overwrite the job's counters, status and (optional) error fields."""
        payload = {
            "jid": index_job_id,
            "status": status,
            "indexed": indexed_files,
            "failed": failed_files,
            "cache_hit_files": cache_hit_files,
            "cache_miss_files": cache_miss_files,
            "ecode": error_code,
            "edesc": error_desc,
            "emodule": error_module,
        }
        with get_engine().connect() as connection:
            connection.execute(text(self._UPDATE_SQL), payload)
            connection.commit()

    def get_job(self, index_job_id: str) -> RagJobRow | None:
        """Fetch one job row, or None when the id is unknown."""
        with get_engine().connect() as connection:
            record = connection.execute(text(self._SELECT_SQL), {"jid": index_job_id}).mappings().fetchone()
        if record is None:
            return None
        return RagJobRow(**dict(record))
class RagQueryRepository:
    """Hybrid retrieval over rag_chunks: lexical rank, layer rank, then vector distance."""

    def retrieve(
        self,
        rag_session_id: str,
        query_embedding: list[float],
        *,
        query_text: str = "",
        limit: int = 5,
        layers: list[str] | None = None,
        path_prefixes: list[str] | None = None,
        prefer_non_tests: bool = False,
    ) -> list[dict]:
        """Rank chunks lexically first, then by layer priority, then by cosine distance."""
        params: dict = {
            "sid": rag_session_id,
            "emb": "[" + ",".join(str(x) for x in query_embedding) + "]",
            "lim": limit,
        }
        filters = ["rag_session_id = :sid"]
        if layers:
            params["layers"] = layers
            filters.append("layer = ANY(:layers)")
        if path_prefixes:
            like_clauses = []
            for pos, prefix in enumerate(path_prefixes):
                key = f"path_{pos}"
                params[key] = f"{prefix}%"
                like_clauses.append(f"path LIKE :{key}")
            filters.append("(" + " OR ".join(like_clauses) + ")")
        # One CASE per extracted query term; the lexical rank is the best
        # (lowest) match quality across all terms.
        term_filters = []
        for pos, term in enumerate(extract_query_terms(query_text)):
            exact_key = f"term_exact_{pos}"
            prefix_key = f"term_prefix_{pos}"
            contains_key = f"term_contains_{pos}"
            params[exact_key] = term
            params[prefix_key] = f"{term}%"
            params[contains_key] = f"%{term}%"
            term_filters.append(
                "CASE "
                f"WHEN lower(COALESCE(qname, '')) = :{exact_key} THEN 0 "
                f"WHEN lower(COALESCE(symbol_id, '')) = :{exact_key} THEN 1 "
                f"WHEN lower(COALESCE(title, '')) = :{exact_key} THEN 2 "
                f"WHEN lower(COALESCE(qname, '')) LIKE :{prefix_key} THEN 3 "
                f"WHEN lower(COALESCE(title, '')) LIKE :{prefix_key} THEN 4 "
                f"WHEN lower(COALESCE(path, '')) LIKE :{contains_key} THEN 5 "
                f"WHEN lower(COALESCE(content, '')) LIKE :{contains_key} THEN 6 "
                "ELSE 100 END"
            )
        lexical_sql = "LEAST(" + ", ".join(term_filters) + ")" if term_filters else "100"
        if prefer_non_tests:
            # Push test files below production code without excluding them.
            test_penalty_sql = (
                "CASE "
                "WHEN lower(path) LIKE 'tests/%' OR lower(path) LIKE '%/tests/%' OR lower(path) LIKE 'test_%' OR lower(path) LIKE '%/test_%' "
                "THEN 1 ELSE 0 END"
            )
        else:
            test_penalty_sql = "0"
        layer_rank_sql = (
            "CASE "
            "WHEN layer = 'C3_ENTRYPOINTS' THEN 0 "
            "WHEN layer = 'C1_SYMBOL_CATALOG' THEN 1 "
            "WHEN layer = 'C2_DEPENDENCY_GRAPH' THEN 2 "
            "WHEN layer = 'C0_SOURCE_CHUNKS' THEN 3 "
            "WHEN layer = 'D1_MODULE_CATALOG' THEN 0 "
            "WHEN layer = 'D2_FACT_INDEX' THEN 1 "
            "WHEN layer = 'D3_SECTION_INDEX' THEN 2 "
            "WHEN layer = 'D4_POLICY_INDEX' THEN 3 "
            "ELSE 10 END"
        )
        sql = f"""
            SELECT path, content, layer, title, metadata_json, span_start, span_end,
                   {lexical_sql} AS lexical_rank,
                   {test_penalty_sql} AS test_penalty,
                   {layer_rank_sql} AS layer_rank,
                   (embedding <=> CAST(:emb AS vector)) AS distance
            FROM rag_chunks
            WHERE {' AND '.join(filters)}
            ORDER BY lexical_rank ASC, test_penalty ASC, layer_rank ASC, embedding <=> CAST(:emb AS vector)
            LIMIT :lim
        """
        with get_engine().connect() as conn:
            fetched = conn.execute(text(sql), params).mappings().fetchall()
        return [self._row_to_dict(item) for item in fetched]

    def fallback_chunks(self, rag_session_id: str, *, limit: int = 5, layers: list[str] | None = None) -> list[dict]:
        """Return the most recently inserted chunks when vector search is unavailable."""
        params: dict = {"sid": rag_session_id, "lim": limit}
        filters = ["rag_session_id = :sid"]
        if layers:
            params["layers"] = layers
            filters.append("layer = ANY(:layers)")
        sql = f"""
            SELECT path, content, layer, title, metadata_json, span_start, span_end
            FROM rag_chunks
            WHERE {' AND '.join(filters)}
            ORDER BY id DESC
            LIMIT :lim
        """
        with get_engine().connect() as conn:
            fetched = conn.execute(text(sql), params).mappings().fetchall()
        return [self._row_to_dict(item) for item in fetched]

    def _row_to_dict(self, row) -> dict:
        """Convert a DB row mapping into a plain dict with parsed metadata."""
        payload = dict(row)
        raw_metadata = payload.pop("metadata_json")
        payload["metadata"] = json.loads(str(raw_metadata or "{}"))
        return payload
self._sessions.get_session(rag_session_id) + + def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None: + self._jobs.create_job(index_job_id, rag_session_id, status) + + def update_job(self, index_job_id: str, **kwargs) -> None: + self._jobs.update_job(index_job_id, **kwargs) + + def get_job(self, index_job_id: str) -> RagJobRow | None: + return self._jobs.get_job(index_job_id) + + def replace_documents(self, rag_session_id: str, docs: list[RagDocument]) -> None: + with get_engine().connect() as conn: + self._documents.replace_documents(conn, rag_session_id, docs) + conn.commit() + + def apply_document_changes(self, rag_session_id: str, delete_paths: list[str], docs: list[RagDocument]) -> None: + with get_engine().connect() as conn: + self._documents.apply_document_changes(conn, rag_session_id, delete_paths, docs) + conn.commit() + + def get_cached_documents(self, repo_id: str, blob_sha: str) -> list[RagDocument]: + return self._cache.get_cached_documents(repo_id, blob_sha) + + def cache_documents(self, repo_id: str, path: str, blob_sha: str, docs: list[RagDocument]) -> None: + self._cache.cache_documents(repo_id, path, blob_sha, docs) + + def record_repo_cache(self, **kwargs) -> None: + self._cache.record_repo_cache(**kwargs) + + def retrieve( + self, + rag_session_id: str, + query_embedding: list[float], + *, + query_text: str = "", + limit: int = 5, + layers: list[str] | None = None, + prefer_non_tests: bool = False, + ) -> list[dict]: + return self._query.retrieve( + rag_session_id, + query_embedding, + query_text=query_text, + limit=limit, + layers=layers, + prefer_non_tests=prefer_non_tests, + ) + + def fallback_chunks(self, rag_session_id: str, limit: int = 5, layers: list[str] | None = None) -> list[dict]: + return self._query.fallback_chunks(rag_session_id, limit=limit, layers=layers) diff --git a/app/modules/rag/persistence/schema_repository.py b/app/modules/rag/persistence/schema_repository.py new file mode 100644 index 
0000000..5648165 --- /dev/null +++ b/app/modules/rag/persistence/schema_repository.py @@ -0,0 +1,179 @@ +from __future__ import annotations + +from sqlalchemy import text + +from app.modules.shared.db import get_engine + + +class RagSchemaRepository: + def ensure_tables(self) -> None: + engine = get_engine() + with engine.connect() as conn: + conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) + conn.execute( + text( + """ + CREATE TABLE IF NOT EXISTS rag_sessions ( + rag_session_id VARCHAR(64) PRIMARY KEY, + project_id VARCHAR(512) NOT NULL, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + ) + conn.execute( + text( + """ + CREATE TABLE IF NOT EXISTS rag_index_jobs ( + index_job_id VARCHAR(64) PRIMARY KEY, + rag_session_id VARCHAR(64) NOT NULL, + status VARCHAR(16) NOT NULL, + indexed_files INTEGER NOT NULL DEFAULT 0, + failed_files INTEGER NOT NULL DEFAULT 0, + cache_hit_files INTEGER NOT NULL DEFAULT 0, + cache_miss_files INTEGER NOT NULL DEFAULT 0, + error_code VARCHAR(128) NULL, + error_desc TEXT NULL, + error_module VARCHAR(64) NULL, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + ) + conn.execute( + text( + """ + CREATE TABLE IF NOT EXISTS rag_chunks ( + id BIGSERIAL PRIMARY KEY, + rag_session_id VARCHAR(64) NOT NULL, + path TEXT NOT NULL, + chunk_index INTEGER NOT NULL, + content TEXT NOT NULL, + embedding vector NULL, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + ) + conn.execute( + text( + """ + CREATE TABLE IF NOT EXISTS rag_blob_cache ( + id BIGSERIAL PRIMARY KEY, + repo_id VARCHAR(512) NOT NULL, + blob_sha VARCHAR(128) NOT NULL, + path TEXT NOT NULL, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + CONSTRAINT uq_rag_blob_cache UNIQUE (repo_id, blob_sha, path) + ) + """ + ) + ) + conn.execute( + text( + """ + CREATE TABLE IF NOT 
EXISTS rag_chunk_cache ( + id BIGSERIAL PRIMARY KEY, + repo_id VARCHAR(512) NOT NULL, + blob_sha VARCHAR(128) NOT NULL, + chunk_index INTEGER NOT NULL, + content TEXT NOT NULL, + embedding vector NULL, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + CONSTRAINT uq_rag_chunk_cache UNIQUE (repo_id, blob_sha, chunk_index) + ) + """ + ) + ) + conn.execute( + text( + """ + CREATE TABLE IF NOT EXISTS rag_session_chunk_map ( + id BIGSERIAL PRIMARY KEY, + rag_session_id VARCHAR(64) NOT NULL, + repo_id VARCHAR(512) NOT NULL, + blob_sha VARCHAR(128) NOT NULL, + chunk_index INTEGER NOT NULL, + path TEXT NOT NULL, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + ) + self._ensure_columns(conn) + self._ensure_indexes(conn) + conn.commit() + + def _ensure_columns(self, conn) -> None: + for statement in ( + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_id TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_version TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS owner TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS rag_doc_id VARCHAR(128) NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS repo_id VARCHAR(512) NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS commit_sha VARCHAR(128) NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS title TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN 
IF NOT EXISTS metadata_json TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS links_json TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS span_start INTEGER NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS span_end INTEGER NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS symbol_id TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS qname TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS kind TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS framework TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS entrypoint_type TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS module_id TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section_path TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_kind TEXT NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS section TEXT NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS doc_id TEXT NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS doc_version TEXT NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS owner TEXT NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS system_component TEXT NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS section TEXT NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL", + "ALTER TABLE 
rag_chunk_cache ADD COLUMN IF NOT EXISTS path TEXT NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS title TEXT NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS links_json TEXT NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS span_start INTEGER NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS span_end INTEGER NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS commit_sha VARCHAR(128) NULL", + "ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_hit_files INTEGER NOT NULL DEFAULT 0", + "ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_miss_files INTEGER NOT NULL DEFAULT 0", + ): + conn.execute(text(statement)) + + def _ensure_indexes(self, conn) -> None: + for statement in ( + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks (rag_session_id)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_layer ON rag_chunks (rag_session_id, layer)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_layer_path ON rag_chunks (rag_session_id, layer, path)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_qname ON rag_chunks (qname)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_symbol_id ON rag_chunks (symbol_id)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_module_id ON rag_chunks (module_id)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_doc_kind ON rag_chunks (doc_kind)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_entrypoint ON rag_chunks (entrypoint_type, framework)", + "CREATE INDEX IF NOT EXISTS idx_rag_blob_cache_repo_blob ON rag_blob_cache (repo_id, blob_sha)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunk_cache_repo_blob ON rag_chunk_cache (repo_id, blob_sha, chunk_index)", + "CREATE INDEX IF NOT EXISTS idx_rag_session_chunk_map_session ON rag_session_chunk_map (rag_session_id, created_at DESC)", + ): + conn.execute(text(statement)) diff --git a/app/modules/rag/persistence/session_repository.py 
b/app/modules/rag/persistence/session_repository.py new file mode 100644 index 0000000..5c36a5d --- /dev/null +++ b/app/modules/rag/persistence/session_repository.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from sqlalchemy import text + +from app.modules.shared.db import get_engine + + +class RagSessionRepository: + def upsert_session(self, rag_session_id: str, project_id: str) -> None: + with get_engine().connect() as conn: + conn.execute( + text( + """ + INSERT INTO rag_sessions (rag_session_id, project_id) + VALUES (:sid, :pid) + ON CONFLICT (rag_session_id) DO UPDATE SET project_id = EXCLUDED.project_id + """ + ), + {"sid": rag_session_id, "pid": project_id}, + ) + conn.commit() + + def session_exists(self, rag_session_id: str) -> bool: + with get_engine().connect() as conn: + row = conn.execute( + text("SELECT 1 FROM rag_sessions WHERE rag_session_id = :sid"), + {"sid": rag_session_id}, + ).fetchone() + return bool(row) + + def get_session(self, rag_session_id: str) -> dict | None: + with get_engine().connect() as conn: + row = conn.execute( + text("SELECT rag_session_id, project_id FROM rag_sessions WHERE rag_session_id = :sid"), + {"sid": rag_session_id}, + ).mappings().fetchone() + return dict(row) if row else None diff --git a/app/modules/rag/retrieval/__pycache__/query_router.cpython-312.pyc b/app/modules/rag/retrieval/__pycache__/query_router.cpython-312.pyc new file mode 100644 index 0000000..9127155 Binary files /dev/null and b/app/modules/rag/retrieval/__pycache__/query_router.cpython-312.pyc differ diff --git a/app/modules/rag/retrieval/__pycache__/query_terms.cpython-312.pyc b/app/modules/rag/retrieval/__pycache__/query_terms.cpython-312.pyc new file mode 100644 index 0000000..74a313e Binary files /dev/null and b/app/modules/rag/retrieval/__pycache__/query_terms.cpython-312.pyc differ diff --git a/app/modules/rag/retrieval/query_router.py b/app/modules/rag/retrieval/query_router.py new file mode 100644 index 0000000..eaa153c --- 
/dev/null +++ b/app/modules/rag/retrieval/query_router.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from app.modules.rag.contracts import RagLayer, RetrievalMode + + +class RagQueryRouter: + _CODE_HINTS = ( + "как работает код", + "explain code", + "explain the code", + "по коду", + "из кода", + "построй документацию по коду", + "документацию по коду", + "where is implemented", + "где реализовано", + "endpoint", + "handler", + "symbol", + "function", + "class", + "method", + ) + + _DOCS_LAYERS = [ + RagLayer.DOCS_MODULE_CATALOG, + RagLayer.DOCS_FACT_INDEX, + RagLayer.DOCS_SECTION_INDEX, + RagLayer.DOCS_POLICY_INDEX, + ] + _CODE_LAYERS = [ + RagLayer.CODE_ENTRYPOINTS, + RagLayer.CODE_SYMBOL_CATALOG, + RagLayer.CODE_DEPENDENCY_GRAPH, + RagLayer.CODE_SOURCE_CHUNKS, + ] + + def resolve_mode(self, query: str) -> str: + lowered = query.lower() + return RetrievalMode.CODE if any(hint in lowered for hint in self._CODE_HINTS) else RetrievalMode.DOCS + + def layers_for_mode(self, mode: str) -> list[str]: + return list(self._CODE_LAYERS if mode == RetrievalMode.CODE else self._DOCS_LAYERS) diff --git a/app/modules/rag/retrieval/query_terms.py b/app/modules/rag/retrieval/query_terms.py new file mode 100644 index 0000000..13c7a25 --- /dev/null +++ b/app/modules/rag/retrieval/query_terms.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import re + + +def extract_query_terms(query_text: str) -> list[str]: + raw_terms = re.findall(r"[A-Za-z_][A-Za-z0-9_]{2,}", query_text or "") + normalized: list[str] = [] + for term in raw_terms: + for variant in _identifier_variants(term): + if variant not in normalized: + normalized.append(variant) + for variant in _intent_variants(query_text): + if variant not in normalized: + normalized.append(variant) + return normalized[:6] + + +def _identifier_variants(term: str) -> list[str]: + lowered = term.lower() + variants = [lowered] + snake = _camel_to_snake(term) + if snake and snake not in variants: + 
variants.append(snake) + if lowered.endswith("manager") and len(lowered) > len("manager"): + manager_split = lowered[: -len("manager")] + "_manager" + if manager_split not in variants: + variants.append(manager_split) + compact = snake.replace("_", "") if snake else "" + if compact and compact not in variants: + variants.append(compact) + return variants + + +def _camel_to_snake(term: str) -> str: + first = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", term) + return re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", first).lower() + + +def _intent_variants(query_text: str) -> list[str]: + lowered = (query_text or "").lower() + variants: list[str] = [] + if any(token in lowered for token in ("управ", "control", "manage", "management")): + variants.extend(["control", "management", "start", "stop", "status"]) + return variants diff --git a/app/modules/rag/services/rag_service.py b/app/modules/rag/services/rag_service.py new file mode 100644 index 0000000..ae2c996 --- /dev/null +++ b/app/modules/rag/services/rag_service.py @@ -0,0 +1,197 @@ +from __future__ import annotations + +import asyncio +import hashlib +import os +from collections.abc import Awaitable, Callable +from inspect import isawaitable + +from app.modules.rag.contracts import RagDocument +from app.modules.rag.indexing.code.pipeline import CodeIndexingPipeline +from app.modules.rag.indexing.common.report import IndexReport +from app.modules.rag.indexing.docs.pipeline import DocsIndexingPipeline +from app.modules.rag.persistence.repository import RagRepository +from app.modules.rag.retrieval.query_router import RagQueryRouter +from app.modules.rag_session.embedding.gigachat_embedder import GigaChatEmbedder + + +class RagService: + def __init__( + self, + embedder: GigaChatEmbedder, + repository: RagRepository, + chunker=None, + ) -> None: + self._embedder = embedder + self._repo = repository + self._docs = DocsIndexingPipeline() + self._code = CodeIndexingPipeline() + self._queries = RagQueryRouter() + + async def 
index_snapshot( + self, + rag_session_id: str, + files: list[dict], + progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, + ) -> tuple[int, int, int, int]: + report = await self._index_files(rag_session_id, files, progress_cb=progress_cb) + self._repo.replace_documents(rag_session_id, report.documents_list) + return report.as_tuple() + + async def index_changes( + self, + rag_session_id: str, + changed_files: list[dict], + progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, + ) -> tuple[int, int, int, int]: + delete_paths: list[str] = [] + upserts: list[dict] = [] + for item in changed_files: + if str(item.get("op")) == "delete": + delete_paths.append(str(item.get("path", ""))) + else: + upserts.append(item) + report = await self._index_files(rag_session_id, upserts, progress_cb=progress_cb) + self._repo.apply_document_changes(rag_session_id, delete_paths, report.documents_list) + return report.as_tuple() + + async def retrieve(self, rag_session_id: str, query: str) -> list[dict]: + mode = self._queries.resolve_mode(query) + layers = self._queries.layers_for_mode(mode) + prefer_non_tests = mode == "code" and "test" not in query.lower() and "тест" not in query.lower() + try: + query_embedding = self._embedder.embed([query])[0] + rows = self._repo.retrieve( + rag_session_id, + query_embedding, + query_text=query, + limit=8, + layers=layers, + prefer_non_tests=prefer_non_tests, + ) + except Exception: + rows = self._repo.fallback_chunks(rag_session_id, limit=8, layers=layers) + if not rows and mode != "docs": + rows = self._repo.fallback_chunks(rag_session_id, limit=8, layers=self._queries.layers_for_mode("docs")) + return [ + { + "source": row["path"], + "content": row["content"], + "layer": row.get("layer"), + "title": row.get("title"), + "metadata": row.get("metadata", {}), + "score": row.get("distance"), + } + for row in rows + ] + + async def _index_files( + self, + rag_session_id: str, + files: list[dict], + 
progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, + ) -> "_PipelineReport": + total_files = len(files) + report = _PipelineReport() + repo_id = self._resolve_repo_id(rag_session_id) + for index, file in enumerate(files, start=1): + path = str(file.get("path", "")) + try: + blob_sha = self._blob_sha(file) + cached = await asyncio.to_thread(self._repo.get_cached_documents, repo_id, blob_sha) + if cached: + report.documents_list.extend(self._with_file_metadata(cached, file, repo_id, blob_sha)) + report.cache_hit_files += 1 + else: + built = self._build_documents(repo_id, path, file) + embedded = await asyncio.to_thread(self._embed_documents, built, file, repo_id, blob_sha) + report.documents_list.extend(embedded) + await asyncio.to_thread(self._repo.cache_documents, repo_id, path, blob_sha, embedded) + report.cache_miss_files += 1 + report.indexed_files += 1 + except Exception as exc: + report.failed_files += 1 + report.warnings.append(f"{path}: {exc}") + await self._notify_progress(progress_cb, index, total_files, path) + report.documents = len(report.documents_list) + return report + + def _build_documents(self, repo_id: str, path: str, file: dict) -> list[RagDocument]: + content = str(file.get("content") or "") + commit_sha = file.get("commit_sha") + docs: list[RagDocument] = [] + if self._docs.supports(path): + docs.extend(self._docs.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content)) + if self._code.supports(path): + docs.extend(self._code.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content)) + if not docs: + docs.extend(self._docs.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content)) + return docs + + def _embed_documents(self, docs: list[RagDocument], file: dict, repo_id: str, blob_sha: str) -> list[RagDocument]: + if not docs: + return [] + batch_size = max(1, int(os.getenv("RAG_EMBED_BATCH_SIZE", "16"))) + metadata = self._document_metadata(file, 
repo_id, blob_sha) + for doc in docs: + doc.metadata.update(metadata) + for start in range(0, len(docs), batch_size): + batch = docs[start : start + batch_size] + vectors = self._embedder.embed([doc.text for doc in batch]) + for doc, vector in zip(batch, vectors): + doc.embedding = vector + return docs + + def _with_file_metadata(self, docs: list[RagDocument], file: dict, repo_id: str, blob_sha: str) -> list[RagDocument]: + metadata = self._document_metadata(file, repo_id, blob_sha) + for doc in docs: + doc.metadata.update(metadata) + doc.source.repo_id = repo_id + doc.source.path = str(file.get("path", doc.source.path)) + return docs + + def _document_metadata(self, file: dict, repo_id: str, blob_sha: str) -> dict: + return { + "blob_sha": blob_sha, + "repo_id": repo_id, + "artifact_type": file.get("artifact_type"), + "section": file.get("section"), + "doc_id": file.get("doc_id"), + "doc_version": file.get("doc_version"), + "owner": file.get("owner"), + "system_component": file.get("system_component"), + "last_modified": file.get("last_modified"), + "staleness_score": file.get("staleness_score"), + } + + def _resolve_repo_id(self, rag_session_id: str) -> str: + session = self._repo.get_session(rag_session_id) + if not session: + return rag_session_id + return str(session.get("project_id") or rag_session_id) + + def _blob_sha(self, file: dict) -> str: + raw = str(file.get("content_hash") or "").strip() + if raw: + return raw + content = str(file.get("content") or "") + return hashlib.sha256(content.encode("utf-8")).hexdigest() + + async def _notify_progress( + self, + progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None, + current_file_index: int, + total_files: int, + current_file_name: str, + ) -> None: + if not progress_cb: + return + result = progress_cb(current_file_index, total_files, current_file_name) + if isawaitable(result): + await result + + +class _PipelineReport(IndexReport): + def __init__(self) -> None: + super().__init__() + 
self.documents_list: list[RagDocument] = [] diff --git a/app/modules/rag_repo/module.py b/app/modules/rag_repo/module.py index 15f7c86..0f3c095 100644 --- a/app/modules/rag_repo/module.py +++ b/app/modules/rag_repo/module.py @@ -3,8 +3,8 @@ from __future__ import annotations from fastapi import APIRouter, Request from app.modules.agent.story_context_repository import StoryContextRepository +from app.modules.rag.persistence.repository import RagRepository from app.modules.rag_repo.webhook_service import RepoWebhookService -from app.modules.rag_session.repository import RagRepository class RagRepoModule: diff --git a/app/modules/rag_session/repository.py b/app/modules/rag_session/repository.py index bdeca25..a90d628 100644 --- a/app/modules/rag_session/repository.py +++ b/app/modules/rag_session/repository.py @@ -1,660 +1,4 @@ -from __future__ import annotations +from app.modules.rag.persistence.job_repository import RagJobRow +from app.modules.rag.persistence.repository import RagRepository -from dataclasses import dataclass -import hashlib -from sqlalchemy import text - -from app.modules.shared.db import get_engine - - -@dataclass -class RagJobRow: - index_job_id: str - rag_session_id: str - status: str - indexed_files: int - failed_files: int - cache_hit_files: int - cache_miss_files: int - error_code: str | None - error_desc: str | None - error_module: str | None - - -class RagRepository: - def ensure_tables(self) -> None: - engine = get_engine() - with engine.connect() as conn: - conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) - conn.execute( - text( - """ - CREATE TABLE IF NOT EXISTS rag_sessions ( - rag_session_id VARCHAR(64) PRIMARY KEY, - project_id VARCHAR(512) NOT NULL, - created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - ) - conn.execute( - text( - """ - CREATE TABLE IF NOT EXISTS rag_index_jobs ( - index_job_id VARCHAR(64) PRIMARY KEY, - rag_session_id VARCHAR(64) NOT NULL, - status VARCHAR(16) NOT NULL, - indexed_files INTEGER 
NOT NULL DEFAULT 0, - failed_files INTEGER NOT NULL DEFAULT 0, - cache_hit_files INTEGER NOT NULL DEFAULT 0, - cache_miss_files INTEGER NOT NULL DEFAULT 0, - error_code VARCHAR(128) NULL, - error_desc TEXT NULL, - error_module VARCHAR(64) NULL, - created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - ) - conn.execute( - text( - """ - CREATE TABLE IF NOT EXISTS rag_chunks ( - id BIGSERIAL PRIMARY KEY, - rag_session_id VARCHAR(64) NOT NULL, - path TEXT NOT NULL, - chunk_index INTEGER NOT NULL, - content TEXT NOT NULL, - embedding vector NULL, - created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - ) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section TEXT NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_id TEXT NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_version TEXT NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS owner TEXT NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL")) - conn.execute( - text( - """ - ALTER TABLE rag_chunks - ADD COLUMN IF NOT EXISTS created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP - """ - ) - ) - conn.execute( - text( - """ - ALTER TABLE rag_chunks - ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP - """ - ) - ) - conn.execute( - text( - """ - CREATE TABLE IF NOT EXISTS rag_blob_cache ( - id BIGSERIAL PRIMARY KEY, - repo_id VARCHAR(512) NOT NULL, - blob_sha VARCHAR(128) NOT NULL, - path TEXT NOT NULL, - 
artifact_type VARCHAR(16) NULL, - section TEXT NULL, - doc_id TEXT NULL, - doc_version TEXT NULL, - owner TEXT NULL, - system_component TEXT NULL, - last_modified TIMESTAMPTZ NULL, - staleness_score DOUBLE PRECISION NULL, - created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, - CONSTRAINT uq_rag_blob_cache UNIQUE (repo_id, blob_sha, path) - ) - """ - ) - ) - conn.execute( - text( - """ - CREATE TABLE IF NOT EXISTS rag_chunk_cache ( - id BIGSERIAL PRIMARY KEY, - repo_id VARCHAR(512) NOT NULL, - blob_sha VARCHAR(128) NOT NULL, - chunk_index INTEGER NOT NULL, - content TEXT NOT NULL, - embedding vector NULL, - section TEXT NULL, - created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, - CONSTRAINT uq_rag_chunk_cache UNIQUE (repo_id, blob_sha, chunk_index) - ) - """ - ) - ) - conn.execute( - text( - """ - CREATE TABLE IF NOT EXISTS rag_session_chunk_map ( - id BIGSERIAL PRIMARY KEY, - rag_session_id VARCHAR(64) NOT NULL, - repo_id VARCHAR(512) NOT NULL, - blob_sha VARCHAR(128) NOT NULL, - chunk_index INTEGER NOT NULL, - path TEXT NOT NULL, - created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - ) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks (rag_session_id)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_artifact_type ON rag_chunks (artifact_type)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_doc ON rag_chunks (doc_id, doc_version)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_component ON rag_chunks (system_component)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_path ON rag_chunks (path)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_blob_cache_repo_blob ON rag_blob_cache (repo_id, blob_sha)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunk_cache_repo_blob ON rag_chunk_cache (repo_id, blob_sha, chunk_index)")) - 
conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_session_chunk_map_session ON rag_session_chunk_map (rag_session_id, created_at DESC)")) - conn.execute(text("ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_hit_files INTEGER NOT NULL DEFAULT 0")) - conn.execute(text("ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_miss_files INTEGER NOT NULL DEFAULT 0")) - conn.commit() - - def upsert_session(self, rag_session_id: str, project_id: str) -> None: - with get_engine().connect() as conn: - conn.execute( - text( - """ - INSERT INTO rag_sessions (rag_session_id, project_id) - VALUES (:sid, :pid) - ON CONFLICT (rag_session_id) DO UPDATE SET project_id = EXCLUDED.project_id - """ - ), - {"sid": rag_session_id, "pid": project_id}, - ) - conn.commit() - - def session_exists(self, rag_session_id: str) -> bool: - with get_engine().connect() as conn: - row = conn.execute( - text("SELECT 1 FROM rag_sessions WHERE rag_session_id = :sid"), - {"sid": rag_session_id}, - ).fetchone() - return bool(row) - - def get_session(self, rag_session_id: str) -> dict | None: - with get_engine().connect() as conn: - row = conn.execute( - text("SELECT rag_session_id, project_id FROM rag_sessions WHERE rag_session_id = :sid"), - {"sid": rag_session_id}, - ).mappings().fetchone() - return dict(row) if row else None - - def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None: - with get_engine().connect() as conn: - conn.execute( - text( - """ - INSERT INTO rag_index_jobs (index_job_id, rag_session_id, status) - VALUES (:jid, :sid, :status) - """ - ), - {"jid": index_job_id, "sid": rag_session_id, "status": status}, - ) - conn.commit() - - def update_job( - self, - index_job_id: str, - *, - status: str, - indexed_files: int, - failed_files: int, - cache_hit_files: int = 0, - cache_miss_files: int = 0, - error_code: str | None = None, - error_desc: str | None = None, - error_module: str | None = None, - ) -> None: - with get_engine().connect() as conn: 
- conn.execute( - text( - """ - UPDATE rag_index_jobs - SET status = :status, - indexed_files = :indexed, - failed_files = :failed, - cache_hit_files = :cache_hit_files, - cache_miss_files = :cache_miss_files, - error_code = :ecode, - error_desc = :edesc, - error_module = :emodule, - updated_at = CURRENT_TIMESTAMP - WHERE index_job_id = :jid - """ - ), - { - "jid": index_job_id, - "status": status, - "indexed": indexed_files, - "failed": failed_files, - "cache_hit_files": cache_hit_files, - "cache_miss_files": cache_miss_files, - "ecode": error_code, - "edesc": error_desc, - "emodule": error_module, - }, - ) - conn.commit() - - def get_job(self, index_job_id: str) -> RagJobRow | None: - with get_engine().connect() as conn: - row = conn.execute( - text( - """ - SELECT index_job_id, rag_session_id, status, indexed_files, failed_files, - cache_hit_files, cache_miss_files, error_code, error_desc, error_module - FROM rag_index_jobs - WHERE index_job_id = :jid - """ - ), - {"jid": index_job_id}, - ).mappings().fetchone() - if not row: - return None - return RagJobRow(**dict(row)) - - def replace_chunks(self, rag_session_id: str, items: list[dict]) -> None: - with get_engine().connect() as conn: - conn.execute(text("DELETE FROM rag_chunks WHERE rag_session_id = :sid"), {"sid": rag_session_id}) - conn.execute(text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid"), {"sid": rag_session_id}) - self._insert_chunks(conn, rag_session_id, items) - conn.commit() - - def apply_changes(self, rag_session_id: str, delete_paths: list[str], upserts: list[dict]) -> None: - with get_engine().connect() as conn: - if delete_paths: - conn.execute( - text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"), - {"sid": rag_session_id, "paths": delete_paths}, - ) - conn.execute( - text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"), - {"sid": rag_session_id, "paths": delete_paths}, - ) - if upserts: - paths = 
sorted({str(x["path"]) for x in upserts}) - conn.execute( - text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"), - {"sid": rag_session_id, "paths": paths}, - ) - conn.execute( - text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"), - {"sid": rag_session_id, "paths": paths}, - ) - self._insert_chunks(conn, rag_session_id, upserts) - conn.commit() - - def get_cached_chunks(self, repo_id: str, blob_sha: str) -> list[dict]: - with get_engine().connect() as conn: - rows = conn.execute( - text( - """ - SELECT chunk_index, content, embedding::text AS embedding_txt, section - FROM rag_chunk_cache - WHERE repo_id = :repo_id AND blob_sha = :blob_sha - ORDER BY chunk_index ASC - """ - ), - {"repo_id": repo_id, "blob_sha": blob_sha}, - ).mappings().fetchall() - output: list[dict] = [] - for row in rows: - output.append( - { - "chunk_index": int(row["chunk_index"]), - "content": str(row["content"] or ""), - "embedding": self._parse_vector(str(row["embedding_txt"] or "")), - "section": row.get("section"), - } - ) - return output - - def record_repo_cache( - self, - *, - project_id: str, - commit_sha: str | None, - changed_files: list[str], - summary: str, - ) -> None: - repo_session_id = f"repo:{project_id}" - with get_engine().connect() as conn: - for path in changed_files: - key = f"{commit_sha or 'no-commit'}:{path}" - blob_sha = hashlib.sha256(key.encode("utf-8")).hexdigest() - conn.execute( - text( - """ - INSERT INTO rag_blob_cache ( - repo_id, - blob_sha, - path, - artifact_type, - section - ) - VALUES ( - :repo_id, - :blob_sha, - :path, - :artifact_type, - :section - ) - ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET - updated_at = CURRENT_TIMESTAMP - """ - ), - { - "repo_id": project_id, - "blob_sha": blob_sha, - "path": path, - "artifact_type": "CODE", - "section": "repo_webhook", - }, - ) - conn.execute( - text( - """ - INSERT INTO rag_chunk_cache ( - repo_id, - blob_sha, - chunk_index, - 
content, - embedding, - section - ) - VALUES ( - :repo_id, - :blob_sha, - 0, - :content, - NULL, - :section - ) - ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET - content = EXCLUDED.content, - section = EXCLUDED.section, - updated_at = CURRENT_TIMESTAMP - """ - ), - { - "repo_id": project_id, - "blob_sha": blob_sha, - "content": f"repo_webhook:{path}:{summary[:300]}", - "section": "repo_webhook", - }, - ) - conn.execute( - text( - """ - INSERT INTO rag_session_chunk_map ( - rag_session_id, - repo_id, - blob_sha, - chunk_index, - path - ) - VALUES ( - :rag_session_id, - :repo_id, - :blob_sha, - 0, - :path - ) - """ - ), - { - "rag_session_id": repo_session_id, - "repo_id": project_id, - "blob_sha": blob_sha, - "path": path, - }, - ) - conn.commit() - - def cache_file_chunks(self, repo_id: str, path: str, blob_sha: str, items: list[dict]) -> None: - if not items: - return - meta = items[0] - with get_engine().connect() as conn: - conn.execute( - text( - """ - INSERT INTO rag_blob_cache ( - repo_id, - blob_sha, - path, - artifact_type, - section, - doc_id, - doc_version, - owner, - system_component, - last_modified, - staleness_score - ) - VALUES ( - :repo_id, - :blob_sha, - :path, - :artifact_type, - :section, - :doc_id, - :doc_version, - :owner, - :system_component, - :last_modified, - :staleness_score - ) - ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET - artifact_type = EXCLUDED.artifact_type, - section = EXCLUDED.section, - doc_id = EXCLUDED.doc_id, - doc_version = EXCLUDED.doc_version, - owner = EXCLUDED.owner, - system_component = EXCLUDED.system_component, - last_modified = EXCLUDED.last_modified, - staleness_score = EXCLUDED.staleness_score, - updated_at = CURRENT_TIMESTAMP - """ - ), - { - "repo_id": repo_id, - "blob_sha": blob_sha, - "path": path, - "artifact_type": meta.get("artifact_type"), - "section": meta.get("section"), - "doc_id": meta.get("doc_id"), - "doc_version": meta.get("doc_version"), - "owner": meta.get("owner"), - 
"system_component": meta.get("system_component"), - "last_modified": meta.get("last_modified"), - "staleness_score": meta.get("staleness_score"), - }, - ) - for item in items: - emb = item.get("embedding") or [] - emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None - conn.execute( - text( - """ - INSERT INTO rag_chunk_cache ( - repo_id, - blob_sha, - chunk_index, - content, - embedding, - section - ) - VALUES ( - :repo_id, - :blob_sha, - :chunk_index, - :content, - CAST(:embedding AS vector), - :section - ) - ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET - content = EXCLUDED.content, - embedding = EXCLUDED.embedding, - section = EXCLUDED.section, - updated_at = CURRENT_TIMESTAMP - """ - ), - { - "repo_id": repo_id, - "blob_sha": blob_sha, - "chunk_index": int(item["chunk_index"]), - "content": item["content"], - "embedding": emb_str, - "section": item.get("section"), - }, - ) - conn.commit() - - def retrieve(self, rag_session_id: str, query_embedding: list[float], limit: int = 5) -> list[dict]: - emb = "[" + ",".join(str(x) for x in query_embedding) + "]" - with get_engine().connect() as conn: - rows = conn.execute( - text( - """ - SELECT path, content - FROM rag_chunks - WHERE rag_session_id = :sid - ORDER BY embedding <=> CAST(:emb AS vector) - LIMIT :lim - """ - ), - {"sid": rag_session_id, "emb": emb, "lim": limit}, - ).mappings().fetchall() - return [dict(x) for x in rows] - - def fallback_chunks(self, rag_session_id: str, limit: int = 5) -> list[dict]: - with get_engine().connect() as conn: - rows = conn.execute( - text( - """ - SELECT path, content - FROM rag_chunks - WHERE rag_session_id = :sid - ORDER BY id DESC - LIMIT :lim - """ - ), - {"sid": rag_session_id, "lim": limit}, - ).mappings().fetchall() - return [dict(x) for x in rows] - - def _insert_chunks(self, conn, rag_session_id: str, items: list[dict]) -> None: - for item in items: - emb = item.get("embedding") or [] - emb_str = "[" + ",".join(str(x) for x in emb) + "]" 
if emb else None - conn.execute( - text( - """ - INSERT INTO rag_chunks ( - rag_session_id, - path, - chunk_index, - content, - embedding, - artifact_type, - section, - doc_id, - doc_version, - owner, - system_component, - last_modified, - staleness_score, - created_at, - updated_at - ) - VALUES ( - :sid, - :path, - :idx, - :content, - CAST(:emb AS vector), - :artifact_type, - :section, - :doc_id, - :doc_version, - :owner, - :system_component, - :last_modified, - :staleness_score, - CURRENT_TIMESTAMP, - CURRENT_TIMESTAMP - ) - """ - ), - { - "sid": rag_session_id, - "path": item["path"], - "idx": int(item["chunk_index"]), - "content": item["content"], - "emb": emb_str, - "artifact_type": item.get("artifact_type"), - "section": item.get("section"), - "doc_id": item.get("doc_id"), - "doc_version": item.get("doc_version"), - "owner": item.get("owner"), - "system_component": item.get("system_component"), - "last_modified": item.get("last_modified"), - "staleness_score": item.get("staleness_score"), - }, - ) - repo_id = str(item.get("repo_id") or "").strip() - blob_sha = str(item.get("blob_sha") or "").strip() - if repo_id and blob_sha: - conn.execute( - text( - """ - INSERT INTO rag_session_chunk_map ( - rag_session_id, - repo_id, - blob_sha, - chunk_index, - path - ) VALUES ( - :sid, - :repo_id, - :blob_sha, - :chunk_index, - :path - ) - """ - ), - { - "sid": rag_session_id, - "repo_id": repo_id, - "blob_sha": blob_sha, - "chunk_index": int(item["chunk_index"]), - "path": item["path"], - }, - ) - - def _parse_vector(self, value: str) -> list[float]: - text_value = value.strip() - if not text_value: - return [] - if text_value.startswith("[") and text_value.endswith("]"): - text_value = text_value[1:-1] - if not text_value: - return [] - return [float(part.strip()) for part in text_value.split(",") if part.strip()] +__all__ = ["RagJobRow", "RagRepository"] diff --git a/app/modules/rag_session/service.py b/app/modules/rag_session/service.py index 89bcf84..acae846 100644 
--- a/app/modules/rag_session/service.py +++ b/app/modules/rag_session/service.py @@ -1,211 +1,3 @@ -import asyncio -import hashlib -import os -from collections.abc import Awaitable, Callable -from inspect import isawaitable +from app.modules.rag.services.rag_service import RagService -from app.modules.rag_session.embedding.gigachat_embedder import GigaChatEmbedder -from app.modules.rag_session.repository import RagRepository -from app.modules.rag_session.retrieval.chunker import TextChunker - - -class RagService: - def __init__( - self, - embedder: GigaChatEmbedder, - repository: RagRepository, - chunker: TextChunker | None = None, - ) -> None: - self._embedder = embedder - self._repo = repository - self._chunker = chunker or TextChunker() - - async def index_snapshot( - self, - rag_session_id: str, - files: list[dict], - progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, - ) -> tuple[int, int, int, int]: - total_files = len(files) - indexed_files = 0 - failed_files = 0 - cache_hit_files = 0 - cache_miss_files = 0 - all_chunks: list[dict] = [] - repo_id = self._resolve_repo_id(rag_session_id) - for index, file in enumerate(files, start=1): - path = str(file.get("path", "")) - try: - blob_sha = self._blob_sha(file) - cached = await asyncio.to_thread(self._repo.get_cached_chunks, repo_id, blob_sha) - if cached: - all_chunks.extend(self._build_cached_items(path, file, repo_id, blob_sha, cached)) - cache_hit_files += 1 - else: - chunks = self._build_chunks_for_file(file) - embedded_chunks = await asyncio.to_thread(self._embed_chunks, chunks, file, repo_id, blob_sha) - all_chunks.extend(embedded_chunks) - await asyncio.to_thread(self._repo.cache_file_chunks, repo_id, path, blob_sha, embedded_chunks) - cache_miss_files += 1 - indexed_files += 1 - except Exception: - failed_files += 1 - await self._notify_progress(progress_cb, index, total_files, path) - await asyncio.to_thread(self._repo.replace_chunks, rag_session_id, all_chunks) - return 
indexed_files, failed_files, cache_hit_files, cache_miss_files - - async def index_changes( - self, - rag_session_id: str, - changed_files: list[dict], - progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, - ) -> tuple[int, int, int, int]: - total_files = len(changed_files) - indexed_files = 0 - failed_files = 0 - cache_hit_files = 0 - cache_miss_files = 0 - delete_paths: list[str] = [] - upsert_chunks: list[dict] = [] - repo_id = self._resolve_repo_id(rag_session_id) - - for index, file in enumerate(changed_files, start=1): - path = str(file.get("path", "")) - op = str(file.get("op", "")) - try: - if op == "delete": - delete_paths.append(path) - indexed_files += 1 - await self._notify_progress(progress_cb, index, total_files, path) - continue - if op == "upsert" and file.get("content") is not None: - blob_sha = self._blob_sha(file) - cached = await asyncio.to_thread(self._repo.get_cached_chunks, repo_id, blob_sha) - if cached: - upsert_chunks.extend(self._build_cached_items(path, file, repo_id, blob_sha, cached)) - cache_hit_files += 1 - else: - chunks = self._build_chunks_for_file(file) - embedded_chunks = await asyncio.to_thread(self._embed_chunks, chunks, file, repo_id, blob_sha) - upsert_chunks.extend(embedded_chunks) - await asyncio.to_thread(self._repo.cache_file_chunks, repo_id, path, blob_sha, embedded_chunks) - cache_miss_files += 1 - indexed_files += 1 - await self._notify_progress(progress_cb, index, total_files, path) - continue - failed_files += 1 - except Exception: - failed_files += 1 - await self._notify_progress(progress_cb, index, total_files, path) - - await asyncio.to_thread( - self._repo.apply_changes, - rag_session_id, - delete_paths, - upsert_chunks, - ) - return indexed_files, failed_files, cache_hit_files, cache_miss_files - - async def retrieve(self, rag_session_id: str, query: str) -> list[dict]: - try: - query_embedding = self._embedder.embed([query])[0] - rows = self._repo.retrieve(rag_session_id, 
query_embedding, limit=5) - except Exception: - rows = self._repo.fallback_chunks(rag_session_id, limit=5) - return [{"source": row["path"], "content": row["content"]} for row in rows] - - def _build_chunks_for_file(self, file: dict) -> list[tuple[str, int, str]]: - path = str(file.get("path", "")) - content = str(file.get("content", "")) - output: list[tuple[str, int, str]] = [] - for idx, chunk in enumerate(self._chunker.chunk(content)): - output.append((path, idx, chunk)) - return output - - def _embed_chunks(self, raw_chunks: list[tuple[str, int, str]], file: dict, repo_id: str, blob_sha: str) -> list[dict]: - if not raw_chunks: - return [] - batch_size = max(1, int(os.getenv("RAG_EMBED_BATCH_SIZE", "16"))) - metadata = self._chunk_metadata(file) - - indexed: list[dict] = [] - for i in range(0, len(raw_chunks), batch_size): - batch = raw_chunks[i : i + batch_size] - texts = [x[2] for x in batch] - vectors = self._embedder.embed(texts) - for (path, chunk_index, content), vector in zip(batch, vectors): - indexed.append( - { - "path": path, - "chunk_index": chunk_index, - "content": content, - "embedding": vector, - "repo_id": repo_id, - "blob_sha": blob_sha, - **metadata, - } - ) - return indexed - - def _build_cached_items( - self, - path: str, - file: dict, - repo_id: str, - blob_sha: str, - cached: list[dict], - ) -> list[dict]: - metadata = self._chunk_metadata(file) - output: list[dict] = [] - for item in cached: - output.append( - { - "path": path, - "chunk_index": int(item["chunk_index"]), - "content": str(item["content"]), - "embedding": item.get("embedding") or [], - "repo_id": repo_id, - "blob_sha": blob_sha, - **metadata, - "section": item.get("section") or metadata.get("section"), - } - ) - return output - - def _resolve_repo_id(self, rag_session_id: str) -> str: - session = self._repo.get_session(rag_session_id) - if not session: - return rag_session_id - return str(session.get("project_id") or rag_session_id) - - def _blob_sha(self, file: dict) -> 
str: - raw = str(file.get("content_hash") or "").strip() - if raw: - return raw - content = str(file.get("content") or "") - return hashlib.sha256(content.encode("utf-8")).hexdigest() - - def _chunk_metadata(self, file: dict) -> dict: - return { - "artifact_type": file.get("artifact_type"), - "section": file.get("section"), - "doc_id": file.get("doc_id"), - "doc_version": file.get("doc_version"), - "owner": file.get("owner"), - "system_component": file.get("system_component"), - "last_modified": file.get("last_modified"), - "staleness_score": file.get("staleness_score"), - } - - async def _notify_progress( - self, - progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None, - current_file_index: int, - total_files: int, - current_file_name: str, - ) -> None: - if not progress_cb: - return - result = progress_cb(current_file_index, total_files, current_file_name) - if isawaitable(result): - await result +__all__ = ["RagService"] diff --git a/tests/agent/orchestrator/__pycache__/test_explain_actions.cpython-312-pytest-9.0.2.pyc b/tests/agent/orchestrator/__pycache__/test_explain_actions.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..29323aa Binary files /dev/null and b/tests/agent/orchestrator/__pycache__/test_explain_actions.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/agent/orchestrator/test_explain_actions.py b/tests/agent/orchestrator/test_explain_actions.py new file mode 100644 index 0000000..4cd41c9 --- /dev/null +++ b/tests/agent/orchestrator/test_explain_actions.py @@ -0,0 +1,131 @@ +from app.modules.agent.engine.orchestrator.actions.explain_actions import ExplainActions +from app.modules.agent.engine.orchestrator.execution_context import ExecutionContext +from app.modules.agent.engine.orchestrator.models import ( + ExecutionPlan, + OutputContract, + RoutingMeta, + Scenario, + TaskConstraints, + TaskSpec, +) + + +def _ctx(rag_items: list[dict]) -> ExecutionContext: + task = TaskSpec( + task_id="task-1", + 
dialog_session_id="dialog-1", + rag_session_id="rag-1", + user_message="Объясни по коду как работает task_processor", + scenario=Scenario.EXPLAIN_PART, + routing=RoutingMeta(domain_id="project", process_id="qa", confidence=0.9, reason="test"), + constraints=TaskConstraints(), + output_contract=OutputContract(result_type="answer"), + metadata={ + "rag_items": rag_items, + "rag_context": "", + "confluence_context": "", + "files_map": {}, + }, + ) + plan = ExecutionPlan( + plan_id="plan-1", + task_id="task-1", + scenario=Scenario.EXPLAIN_PART, + template_id="tpl", + template_version="1", + steps=[], + ) + return ExecutionContext(task=task, plan=plan, graph_resolver=lambda *_: None, graph_invoker=lambda *_: {}) + + +def test_explain_actions_switch_to_code_profile_when_code_layers_present() -> None: + ctx = _ctx( + [ + { + "source": "app/task_processor.py", + "layer": "C1_SYMBOL_CATALOG", + "title": "task_processor.process_task", + "content": "function task_processor.process_task(task)", + "metadata": {"qname": "task_processor.process_task", "kind": "function"}, + }, + { + "source": "app/task_processor.py", + "layer": "C2_DEPENDENCY_GRAPH", + "title": "task_processor.process_task:calls", + "content": "task_processor.process_task calls queue.publish", + "metadata": {"edge_type": "calls"}, + }, + ] + ) + actions = ExplainActions() + + actions.collect_sources(ctx) + actions.extract_logic(ctx) + actions.summarize(ctx) + + sources = ctx.artifacts.get_content("sources", {}) + assert sources["source_profile"] == "code" + answer = str(ctx.artifacts.get_content("final_answer", "")) + assert "кодовых слоев индекса" not in answer + assert "CodeRAG" not in answer + assert "app/task_processor.py" in answer + assert "requirements/docs context" not in answer + + +def test_explain_actions_add_code_details_block() -> None: + ctx = _ctx( + [ + { + "source": "src/config_manager/__init__.py", + "layer": "C1_SYMBOL_CATALOG", + "title": "ConfigManager", + "content": "const 
ConfigManager\nConfigManager = config_manager.v2.ConfigManagerV2", + "metadata": { + "qname": "ConfigManager", + "kind": "const", + "lang_payload": {"imported_from": "v2.ConfigManagerV2", "import_alias": True}, + }, + }, + { + "source": "src/config_manager/v2/control/base.py", + "layer": "C1_SYMBOL_CATALOG", + "title": "ControlChannel", + "content": "class ControlChannel\nControlChannel(ABC)", + "metadata": {"qname": "ControlChannel", "kind": "class"}, + }, + { + "source": "src/config_manager/v2/core/control_bridge.py", + "layer": "C1_SYMBOL_CATALOG", + "title": "ControlChannelBridge", + "content": "class ControlChannelBridge\nПредоставляет halt и status как обработчики start/stop/status", + "metadata": {"qname": "ControlChannelBridge", "kind": "class"}, + }, + { + "source": "src/config_manager/v2/core/control_bridge.py", + "layer": "C2_DEPENDENCY_GRAPH", + "title": "ControlChannelBridge.on_start:calls", + "content": "ControlChannelBridge.on_start calls self._start_runtime", + "metadata": {"src_qname": "ControlChannelBridge.on_start", "dst_ref": "self._start_runtime"}, + }, + { + "source": "src/config_manager/v2/__init__.py", + "layer": "C0_SOURCE_CHUNKS", + "title": "src/config_manager/v2/__init__.py:1-6", + "content": '"""Контракт: управление через API (config.yaml, секция management)."""', + "metadata": {}, + }, + ] + ) + actions = ExplainActions() + + actions.collect_sources(ctx) + actions.extract_logic(ctx) + actions.summarize(ctx) + + answer = str(ctx.artifacts.get_content("final_answer", "")) + assert "### Что видно по коду" in answer + assert "ConfigManager` в проекте доступен как alias" in answer + assert "ControlChannelBridge.on_start" in answer + assert "### Где смотреть в проекте" in answer + assert "В индексе нет точного символа" not in answer + assert "отдельный интерфейс управления" in answer diff --git a/tests/rag/__pycache__/test_code_indexing_pipeline.cpython-312-pytest-9.0.2.pyc 
b/tests/rag/__pycache__/test_code_indexing_pipeline.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..7db10f9 Binary files /dev/null and b/tests/rag/__pycache__/test_code_indexing_pipeline.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/rag/__pycache__/test_docs_indexing_pipeline.cpython-312-pytest-9.0.2.pyc b/tests/rag/__pycache__/test_docs_indexing_pipeline.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..35a6e3e Binary files /dev/null and b/tests/rag/__pycache__/test_docs_indexing_pipeline.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/rag/__pycache__/test_local_project_reader.cpython-312-pytest-9.0.2.pyc b/tests/rag/__pycache__/test_local_project_reader.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..2ec8529 Binary files /dev/null and b/tests/rag/__pycache__/test_local_project_reader.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/rag/__pycache__/test_query_repository_terms.cpython-312-pytest-9.0.2.pyc b/tests/rag/__pycache__/test_query_repository_terms.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..a800eba Binary files /dev/null and b/tests/rag/__pycache__/test_query_repository_terms.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/rag/__pycache__/test_query_router.cpython-312-pytest-9.0.2.pyc b/tests/rag/__pycache__/test_query_router.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..26edf90 Binary files /dev/null and b/tests/rag/__pycache__/test_query_router.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/rag/__pycache__/test_query_terms.cpython-312-pytest-9.0.2.pyc b/tests/rag/__pycache__/test_query_terms.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..88d1597 Binary files /dev/null and b/tests/rag/__pycache__/test_query_terms.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/rag/test_code_indexing_pipeline.py b/tests/rag/test_code_indexing_pipeline.py new file mode 100644 index 0000000..9ba4bf9 --- /dev/null +++ 
b/tests/rag/test_code_indexing_pipeline.py @@ -0,0 +1,57 @@ +from app.modules.rag.contracts.enums import RagLayer +from app.modules.rag.indexing.code.pipeline import CodeIndexingPipeline + + +def test_code_pipeline_builds_source_symbols_edges_and_entrypoints() -> None: + pipeline = CodeIndexingPipeline() + content = """ +from fastapi import APIRouter + +router = APIRouter() + +class UserService: + def get_user(self, user_id): + return user_id + +@router.get("/users/{user_id}") +async def get_user(user_id: str): + service = UserService() + return service.get_user(user_id) +""" + docs = pipeline.index_file( + repo_id="acme/proj", + commit_sha="abc123", + path="app/api/users.py", + content=content, + ) + + layers = {doc.layer for doc in docs} + assert RagLayer.CODE_SOURCE_CHUNKS in layers + assert RagLayer.CODE_SYMBOL_CATALOG in layers + assert RagLayer.CODE_DEPENDENCY_GRAPH in layers + assert RagLayer.CODE_ENTRYPOINTS in layers + + symbol_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_SYMBOL_CATALOG and doc.metadata["kind"] == "function") + assert "get_user" in symbol_doc.metadata["qname"] + + edge_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_DEPENDENCY_GRAPH) + assert edge_doc.metadata["edge_type"] in {"calls", "imports", "inherits"} + + entry_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_ENTRYPOINTS) + assert entry_doc.metadata["framework"] == "fastapi" + + +def test_code_pipeline_indexes_import_alias_as_symbol() -> None: + pipeline = CodeIndexingPipeline() + content = "from .v2 import ConfigManagerV2 as ConfigManager\n" + + docs = pipeline.index_file( + repo_id="acme/proj", + commit_sha="abc123", + path="src/config_manager/__init__.py", + content=content, + ) + + alias_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_SYMBOL_CATALOG and doc.metadata["qname"] == "ConfigManager") + assert alias_doc.metadata["kind"] == "const" + assert alias_doc.metadata["lang_payload"]["import_alias"] is True diff --git 
a/tests/rag/test_docs_indexing_pipeline.py b/tests/rag/test_docs_indexing_pipeline.py new file mode 100644 index 0000000..675d3aa --- /dev/null +++ b/tests/rag/test_docs_indexing_pipeline.py @@ -0,0 +1,63 @@ +from app.modules.rag.contracts.enums import RagLayer +from app.modules.rag.indexing.docs.pipeline import DocsIndexingPipeline + + +def test_docs_pipeline_builds_catalog_facts_sections_and_policy() -> None: + pipeline = DocsIndexingPipeline() + content = """--- +id: api.billing.create_invoice +type: policy +domain: billing +links: + calls_api: + - api.billing.validate_invoice +tags: [billing] +status: active +--- +# Create Invoice + +## Spec Summary + +Creates an invoice in billing. + +## Request Contract + +| field | type | required | validation | +| --- | --- | --- | --- | +| amount | decimal | yes | > 0 | + +## Error Matrix + +| status | error | client action | +| --- | --- | --- | +| 400 | invalid_amount | fix request | + +## Rules + +- metric: billing.invoice.created +- rule: amount must be positive +""" + docs = pipeline.index_file( + repo_id="acme/proj", + commit_sha="abc123", + path="docs/billing/create_invoice.md", + content=content, + ) + + layers = {doc.layer for doc in docs} + assert RagLayer.DOCS_MODULE_CATALOG in layers + assert RagLayer.DOCS_FACT_INDEX in layers + assert RagLayer.DOCS_SECTION_INDEX in layers + assert RagLayer.DOCS_POLICY_INDEX in layers + + module_doc = next(doc for doc in docs if doc.layer == RagLayer.DOCS_MODULE_CATALOG) + assert module_doc.metadata["module_id"] == "api.billing.create_invoice" + assert module_doc.metadata["type"] == "policy" + + fact_texts = [doc.text for doc in docs if doc.layer == RagLayer.DOCS_FACT_INDEX] + assert any("calls_api" in text for text in fact_texts) + assert any("has_field" in text for text in fact_texts) + assert any("returns_error" in text for text in fact_texts) + + section_doc = next(doc for doc in docs if doc.layer == RagLayer.DOCS_SECTION_INDEX) + assert section_doc.metadata["section_path"] 
diff --git a/tests/rag/test_query_router.py b/tests/rag/test_query_router.py new file mode 100644 index 0000000..8da84d7 --- /dev/null +++ b/tests/rag/test_query_router.py @@ -0,0 +1,12 @@ +from app.modules.rag.contracts.enums import RetrievalMode +from app.modules.rag.retrieval.query_router import RagQueryRouter + + +def test_query_router_uses_docs_by_default() -> None: + router = RagQueryRouter() + assert router.resolve_mode("Какие есть требования по биллингу?") == RetrievalMode.DOCS + + +def test_query_router_switches_to_code_on_explicit_code_requests() -> None: + router = RagQueryRouter() + assert router.resolve_mode("Объясни как работает код endpoint create invoice") == RetrievalMode.CODE diff --git a/tests/rag/test_query_terms.py b/tests/rag/test_query_terms.py new file mode 100644 index 0000000..654ec3c --- /dev/null +++ b/tests/rag/test_query_terms.py @@ -0,0 +1,9 @@ +from app.modules.rag.retrieval.query_terms import extract_query_terms + + +def test_extract_query_terms_from_code_question() -> None: + terms = extract_query_terms("Объясни по коду как можно управлять COnfigmanager?") + + assert "configmanager" in terms + assert "config_manager" in terms + assert "control" in terms