Новый RAG

This commit is contained in:
2026-03-01 14:21:33 +03:00
parent 2728c07ba9
commit 1ef0b4d68c
95 changed files with 3145 additions and 927 deletions

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
import ast
from dataclasses import dataclass
@dataclass(slots=True)
class CodeChunk:
    """A contiguous slice of a source file prepared for RAG indexing."""

    # Human-readable identifier, e.g. "path/to/file.py:symbol_name" or
    # "path/to/file.py:10-90" for window chunks.
    title: str
    # Raw source text of the chunk (stripped of surrounding blank lines).
    text: str
    # 1-based inclusive line range of the chunk within the original file.
    start_line: int
    end_line: int
    # "symbol_block" for AST-derived chunks, "window" for the fallback.
    chunk_type: str
class CodeTextChunker:
    """Split Python source text into retrievable chunks.

    When the module parses, each top-level class/function definition becomes
    one chunk; otherwise — or when nothing was extracted — the text falls
    back to fixed-size line windows with a small overlap.
    """

    _WINDOW_SIZE = 80
    _WINDOW_OVERLAP = 15

    def chunk(self, path: str, text: str) -> list[CodeChunk]:
        """Return chunks for *text*; *path* is used only in chunk titles."""
        try:
            module = ast.parse(text)
        except SyntaxError:
            # Unparseable source still gets indexed, just without structure.
            return self._window_chunks(path, text)
        source_lines = text.splitlines()
        wanted = (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)
        result: list[CodeChunk] = []
        for node in module.body:
            if not isinstance(node, wanted):
                continue
            first = int(getattr(node, "lineno", 1))
            last = int(getattr(node, "end_lineno", first))
            snippet = "\n".join(source_lines[first - 1 : last]).strip()
            if not snippet:
                continue
            result.append(
                CodeChunk(
                    title=f"{path}:{getattr(node, 'name', 'block')}",
                    text=snippet,
                    start_line=first,
                    end_line=last,
                    chunk_type="symbol_block",
                )
            )
        return result if result else self._window_chunks(path, text)

    def _window_chunks(self, path: str, text: str) -> list[CodeChunk]:
        """Fallback: overlapping fixed-size line windows over the raw text."""
        source_lines = text.splitlines()
        total = len(source_lines)
        result: list[CodeChunk] = []
        lo = 0
        while lo < total:
            hi = min(total, lo + self._WINDOW_SIZE)
            snippet = "\n".join(source_lines[lo:hi]).strip()
            if snippet:
                result.append(CodeChunk(f"{path}:{lo + 1}-{hi}", snippet, lo + 1, hi, "window"))
            if hi >= total:
                break
            # Step back so consecutive windows share some context.
            lo = max(0, hi - self._WINDOW_OVERLAP)
        return result

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.code_text.chunker import CodeChunk
class CodeTextDocumentBuilder:
    """Wrap a CodeChunk into a RagDocument on the CODE_SOURCE_CHUNKS layer."""

    def build(self, source: RagSource, chunk: CodeChunk, *, chunk_index: int) -> RagDocument:
        """Build a document; *chunk_index* is the chunk's position in its file."""
        # Derive a dotted module name from the repo-relative file path.
        module_name = source.path.replace("/", ".").removesuffix(".py")
        metadata = {
            "chunk_index": chunk_index,
            "chunk_type": chunk.chunk_type,
            "module_or_unit": module_name,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_SOURCE_CHUNKS,
            lang="python",
            source=source,
            title=chunk.title,
            text=chunk.text,
            span=RagSpan(chunk.start_line, chunk.end_line),
            metadata=metadata,
        )

View File

@@ -0,0 +1,29 @@
from __future__ import annotations
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.edges.extractor import PyEdge
class EdgeDocumentBuilder:
    """Wrap a PyEdge into a RagDocument on the dependency-graph layer."""

    def build(self, source: RagSource, edge: PyEdge) -> RagDocument:
        """Build a document; unresolved targets fall back to "unknown"."""
        target = edge.dst_ref or edge.dst_symbol_id or "unknown"
        evidence = EvidenceLink(
            type=EvidenceType.EDGE,
            target_id=edge.edge_id,
            path=source.path,
            start_line=edge.start_line,
            end_line=edge.end_line,
        )
        metadata = {
            "edge_id": edge.edge_id,
            "edge_type": edge.edge_type,
            "src_symbol_id": edge.src_symbol_id,
            "src_qname": edge.src_qname,
            "dst_symbol_id": edge.dst_symbol_id,
            "dst_ref": edge.dst_ref,
            "resolution": edge.resolution,
            "lang_payload": edge.metadata,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_DEPENDENCY_GRAPH,
            lang="python",
            source=source,
            title=f"{edge.src_qname}:{edge.edge_type}",
            text=f"{edge.src_qname} {edge.edge_type} {target}",
            span=RagSpan(edge.start_line, edge.end_line),
            metadata=metadata,
            links=[evidence],
        )

View File

@@ -0,0 +1,114 @@
from __future__ import annotations
import ast
from dataclasses import dataclass, field
from hashlib import sha256
@dataclass(slots=True)
class PyEdge:
    """One dependency edge between Python symbols in a single file."""

    # Stable sha256 digest of path, endpoints and line number.
    edge_id: str
    # "imports", "inherits" or "calls" (produced by _EdgeVisitor).
    edge_type: str
    # Source symbol: id from the symbol catalog, plus its qualified name.
    src_symbol_id: str
    src_qname: str
    # Target symbol id when resolvable within the same file, else None.
    dst_symbol_id: str | None
    # Textual reference to the target (dotted name) when available.
    dst_ref: str | None
    # Repo-relative path of the file the edge was found in.
    path: str
    # 1-based inclusive span of the originating AST node.
    start_line: int
    end_line: int
    # "resolved" when dst_symbol_id was found, otherwise "partial".
    resolution: str = "partial"
    metadata: dict = field(default_factory=dict)
class EdgeExtractor:
    """Extract dependency edges from a parsed Python module."""

    def extract(self, path: str, ast_tree: ast.AST | None, symbols: list) -> list[PyEdge]:
        """Return edges for *path*; empty when the module failed to parse."""
        if ast_tree is None:
            return []
        # Map qualified names to symbol ids so edge targets can be resolved.
        known = {item.qname: item.symbol_id for item in symbols}
        walker = _EdgeVisitor(path, known)
        walker.visit(ast_tree)
        return walker.edges
class _EdgeVisitor(ast.NodeVisitor):
    """AST visitor that records import / inheritance / call edges.

    Scope is tracked as a stack of names so edge sources carry qualified
    names like "Outer.method".  Module-level imports are skipped because
    there is no enclosing symbol to attach them to.
    """

    def __init__(self, path: str, qname_map: dict[str, str]) -> None:
        self._path = path
        # qname -> symbol_id for symbols extracted from the same file.
        self._qname_map = qname_map
        self._scope: list[str] = []
        self.edges: list[PyEdge] = []

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        current = self._enter(node.name)
        # One "inherits" edge per base-class expression.
        for base in node.bases:
            self._add_edge("inherits", current, self._name(base), base)
        self.generic_visit(node)
        self._scope.pop()

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        self._visit_function(node)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        self._visit_function(node)

    def visit_Import(self, node: ast.Import) -> None:
        current = self._current_qname()
        if not current:
            # Module-level import: no source symbol to attach the edge to.
            return
        for item in node.names:
            self._add_edge("imports", current, item.name, node)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        current = self._current_qname()
        if not current:
            return
        module = node.module or ""
        for item in node.names:
            self._add_edge("imports", current, f"{module}.{item.name}".strip("."), node)

    def _visit_function(self, node) -> None:
        current = self._enter(node.name)
        # Record call edges only for calls made directly in this function's
        # own body.  The previous ast.walk() also descended into nested
        # defs, while generic_visit() re-visited those defs too — so every
        # call inside a nested function produced duplicate edges attributed
        # to each enclosing function.  generic_visit() below still recurses,
        # so nested defs get their own correctly-scoped edges.
        for inner in self._local_nodes(node):
            if isinstance(inner, ast.Call):
                self._add_edge("calls", current, self._name(inner.func), inner, {"callsite_kind": "function_call"})
        self.generic_visit(node)
        self._scope.pop()

    @staticmethod
    def _local_nodes(node):
        """Yield descendants of *node* without entering nested def/class scopes."""
        pending = list(ast.iter_child_nodes(node))
        while pending:
            child = pending.pop()
            yield child
            # Stop at scope boundaries; their bodies are handled by their
            # own visit_* calls.  Lambdas are deliberately traversed so
            # calls inside them are still attributed to this function.
            if isinstance(child, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            pending.extend(ast.iter_child_nodes(child))

    def _enter(self, name: str) -> str:
        """Push *name* onto the scope stack and return the new qualified name."""
        self._scope.append(name)
        return self._current_qname() or name

    def _current_qname(self) -> str | None:
        if not self._scope:
            return None
        return ".".join(self._scope)

    def _add_edge(self, edge_type: str, src_qname: str, dst_ref: str, node, extra: dict | None = None) -> None:
        """Append a PyEdge; resolution is "resolved" only for same-file targets."""
        if not dst_ref:
            return
        # Fall back to a hash of the qname when the source symbol is unknown.
        src_symbol_id = self._qname_map.get(src_qname, sha256(src_qname.encode("utf-8")).hexdigest())
        dst_symbol_id = self._qname_map.get(dst_ref)
        edge_id = sha256(f"{self._path}|{src_qname}|{edge_type}|{dst_ref}|{getattr(node, 'lineno', 1)}".encode("utf-8")).hexdigest()
        self.edges.append(
            PyEdge(
                edge_id=edge_id,
                edge_type=edge_type,
                src_symbol_id=src_symbol_id,
                src_qname=src_qname,
                dst_symbol_id=dst_symbol_id,
                dst_ref=dst_ref,
                path=self._path,
                start_line=int(getattr(node, "lineno", 1)),
                end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
                resolution="resolved" if dst_symbol_id else "partial",
                metadata=extra or {},
            )
        )

    def _name(self, node) -> str:
        """Best-effort dotted name for an expression; "" when not nameable."""
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            return f"{self._name(node.value)}.{node.attr}"
        if isinstance(node, ast.Call):
            return self._name(node.func)
        return ""

View File

@@ -0,0 +1,26 @@
from __future__ import annotations
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class EntrypointDocumentBuilder:
    """Wrap an Entrypoint into a RagDocument on the entrypoints layer."""

    def build(self, source: RagSource, entrypoint: Entrypoint) -> RagDocument:
        """Build a searchable document describing one HTTP/CLI entrypoint."""
        evidence = EvidenceLink(
            type=EvidenceType.CODE_SPAN,
            target_id=entrypoint.entry_id,
            path=source.path,
            start_line=entrypoint.start_line,
            end_line=entrypoint.end_line,
        )
        metadata = {
            "entry_id": entrypoint.entry_id,
            "entry_type": entrypoint.entry_type,
            "framework": entrypoint.framework,
            "route_or_command": entrypoint.route_or_command,
            "handler_symbol_id": entrypoint.handler_symbol_id,
            "lang_payload": entrypoint.metadata,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_ENTRYPOINTS,
            lang="python",
            source=source,
            title=entrypoint.route_or_command,
            text=f"{entrypoint.framework} {entrypoint.entry_type} {entrypoint.route_or_command}",
            span=RagSpan(entrypoint.start_line, entrypoint.end_line),
            metadata=metadata,
            links=[evidence],
        )

View File

@@ -0,0 +1,34 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class FastApiEntrypointDetector:
    """Detect FastAPI HTTP handlers by their routing decorators."""

    _METHODS = {"get", "post", "put", "patch", "delete"}

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one HTTP Entrypoint per routing decorator on *symbols*."""
        found: list[Entrypoint] = []
        for symbol in symbols:
            for decorator in symbol.decorators or []:
                lowered = decorator.lower()
                method = lowered.split(".")[-1]
                is_method_call = method in self._METHODS
                # Accept either app.<verb>(...) or a generic *.route(...).
                if not is_method_call and ".route" not in lowered:
                    continue
                # Keep argument text when the decorator string looks like a call.
                if "(" in decorator:
                    route = decorator.split("(")[-1].rstrip(")")
                else:
                    route = decorator
                found.append(
                    Entrypoint(
                        entry_id=sha256(f"{path}|fastapi|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(),
                        entry_type="http",
                        framework="fastapi",
                        route_or_command=route,
                        handler_symbol_id=symbol.symbol_id,
                        path=path,
                        start_line=symbol.start_line,
                        end_line=symbol.end_line,
                        metadata={"methods": [method.upper()] if is_method_call else []},
                    )
                )
        return found

View File

@@ -0,0 +1,28 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class FlaskEntrypointDetector:
    """Detect Flask HTTP handlers via "*.route(...)" decorators."""

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one HTTP Entrypoint per route decorator on *symbols*."""
        found: list[Entrypoint] = []
        for symbol in symbols:
            # Only decorators mentioning ".route" qualify.
            route_decorators = [item for item in (symbol.decorators or []) if ".route" in item.lower()]
            for decorator in route_decorators:
                found.append(
                    Entrypoint(
                        entry_id=sha256(f"{path}|flask|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(),
                        entry_type="http",
                        framework="flask",
                        route_or_command=decorator,
                        handler_symbol_id=symbol.symbol_id,
                        path=path,
                        start_line=symbol.start_line,
                        end_line=symbol.end_line,
                    )
                )
        return found

View File

@@ -0,0 +1,27 @@
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass(slots=True)
class Entrypoint:
    """A framework entrypoint (HTTP route or CLI command) found in source."""

    # Stable sha256 digest of path, framework, handler id and decorator text.
    entry_id: str
    # "http" or "cli" (set by the concrete detectors).
    entry_type: str
    # Detector that produced it: "fastapi", "flask", "typer" or "click".
    framework: str
    # Decorator-derived text naming the route or command.
    route_or_command: str
    # symbol_id of the decorated handler function.
    handler_symbol_id: str
    # Repo-relative path of the defining file.
    path: str
    # 1-based inclusive line span of the handler symbol.
    start_line: int
    end_line: int
    metadata: dict = field(default_factory=dict)
class EntrypointDetectorRegistry:
    """Fan a detection request out to every registered detector."""

    def __init__(self, detectors: list) -> None:
        self._detectors = detectors

    def detect_all(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Concatenate each detector's results, in registration order."""
        return [
            entry
            for detector in self._detectors
            for entry in detector.detect(path=path, symbols=symbols)
        ]

View File

@@ -0,0 +1,29 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class TyperClickEntrypointDetector:
    """Detect Typer/Click CLI commands by their decorator names."""

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one CLI Entrypoint per command/callback decorator."""
        found: list[Entrypoint] = []
        for symbol in symbols:
            for decorator in symbol.decorators or []:
                lowered = decorator.lower()
                if ".command" not in lowered and ".callback" not in lowered:
                    continue
                # Decorators mentioning "typer" win; everything else is click.
                framework = "typer" if "typer" in lowered else "click"
                found.append(
                    Entrypoint(
                        entry_id=sha256(f"{path}|{framework}|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(),
                        entry_type="cli",
                        framework=framework,
                        route_or_command=decorator,
                        handler_symbol_id=symbol.symbol_id,
                        path=path,
                        start_line=symbol.start_line,
                        end_line=symbol.end_line,
                    )
                )
        return found

View File

@@ -0,0 +1,13 @@
from __future__ import annotations
from pathlib import PurePosixPath
class PythonFileFilter:
    """Decide which repository paths are Python sources worth indexing."""

    _EXCLUDE_PARTS = {"venv", ".venv", "__pycache__", "node_modules", ".git", "dist", "build"}

    def should_index(self, path: str) -> bool:
        """True for *.py paths that do not pass through an excluded directory."""
        candidate = PurePosixPath(path)
        if candidate.suffix.lower() != ".py":
            return False
        # Reject the path if any component is a known vendored/cache dir.
        return self._EXCLUDE_PARTS.isdisjoint(candidate.parts)

View File

@@ -0,0 +1,52 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagSource
from app.modules.rag.indexing.code.code_text.chunker import CodeTextChunker
from app.modules.rag.indexing.code.code_text.document_builder import CodeTextDocumentBuilder
from app.modules.rag.indexing.code.edges.document_builder import EdgeDocumentBuilder
from app.modules.rag.indexing.code.edges.extractor import EdgeExtractor
from app.modules.rag.indexing.code.entrypoints.document_builder import EntrypointDocumentBuilder
from app.modules.rag.indexing.code.entrypoints.fastapi_detector import FastApiEntrypointDetector
from app.modules.rag.indexing.code.entrypoints.flask_detector import FlaskEntrypointDetector
from app.modules.rag.indexing.code.entrypoints.registry import EntrypointDetectorRegistry
from app.modules.rag.indexing.code.entrypoints.typer_click_detector import TyperClickEntrypointDetector
from app.modules.rag.indexing.code.file_filter import PythonFileFilter
from app.modules.rag.indexing.code.symbols.ast_parser import PythonAstParser
from app.modules.rag.indexing.code.symbols.document_builder import SymbolDocumentBuilder
from app.modules.rag.indexing.code.symbols.extractor import SymbolExtractor
class CodeIndexingPipeline:
    """Turn one Python file into RAG documents across four layers:
    raw code chunks, symbol catalog, dependency edges and entrypoints."""

    def __init__(self) -> None:
        self._filter = PythonFileFilter()
        self._chunker = CodeTextChunker()
        self._code_builder = CodeTextDocumentBuilder()
        self._parser = PythonAstParser()
        self._symbols = SymbolExtractor()
        self._symbol_builder = SymbolDocumentBuilder()
        self._edges = EdgeExtractor()
        self._edge_builder = EdgeDocumentBuilder()
        detectors = [FastApiEntrypointDetector(), FlaskEntrypointDetector(), TyperClickEntrypointDetector()]
        self._entrypoints = EntrypointDetectorRegistry(detectors)
        self._entrypoint_builder = EntrypointDocumentBuilder()

    def supports(self, path: str) -> bool:
        """True when *path* looks like an indexable Python source file."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Produce every document derivable from a single file's content."""
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        # Raw text chunks first; they exist even when the file fails to parse.
        docs: list[RagDocument] = [
            self._code_builder.build(source, chunk, chunk_index=index)
            for index, chunk in enumerate(self._chunker.chunk(path, content))
        ]
        tree = self._parser.parse_module(content)
        symbols = self._symbols.extract(path, content, tree)
        docs.extend(self._symbol_builder.build(source, symbol) for symbol in symbols)
        docs.extend(self._edge_builder.build(source, edge) for edge in self._edges.extract(path, tree, symbols))
        docs.extend(
            self._entrypoint_builder.build(source, entry)
            for entry in self._entrypoints.detect_all(path=path, symbols=symbols)
        )
        return docs

View File

@@ -0,0 +1,11 @@
from __future__ import annotations
import ast
class PythonAstParser:
def parse_module(self, text: str) -> ast.AST | None:
try:
return ast.parse(text)
except SyntaxError:
return None

View File

@@ -0,0 +1,32 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.symbols.extractor import PySymbol
class SymbolDocumentBuilder:
    """Wrap a PySymbol into a RagDocument on the symbol-catalog layer."""

    def build(self, source: RagSource, symbol: PySymbol) -> RagDocument:
        """Build a document whose text is kind+qname, signature and docstring."""
        parts = [f"{symbol.kind} {symbol.qname}", symbol.signature]
        if symbol.docstring:
            parts.append(symbol.docstring.strip())
        text = "\n".join(part for part in parts if part)
        # Derive a dotted module name from the repo-relative file path.
        module_name = source.path.replace("/", ".").removesuffix(".py")
        metadata = {
            "symbol_id": symbol.symbol_id,
            "qname": symbol.qname,
            "kind": symbol.kind,
            "signature": symbol.signature,
            "decorators_or_annotations": symbol.decorators,
            "docstring_or_javadoc": symbol.docstring,
            "parent_symbol_id": symbol.parent_symbol_id,
            "package_or_module": module_name,
            # Decorated symbols may be framework entrypoints.
            "is_entry_candidate": bool(symbol.decorators),
            "lang_payload": symbol.lang_payload,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_SYMBOL_CATALOG,
            lang="python",
            source=source,
            title=symbol.qname,
            text=text,
            span=RagSpan(symbol.start_line, symbol.end_line),
            metadata=metadata,
        )

View File

@@ -0,0 +1,130 @@
from __future__ import annotations
import ast
from dataclasses import dataclass, field
from hashlib import sha256
@dataclass(slots=True)
class PySymbol:
    """One Python symbol (class, function, method or import alias)."""

    # Stable sha256 digest of path, qname and kind.
    symbol_id: str
    # Dotted qualified name within the module (e.g. "ClassName.method").
    qname: str
    # "class", "function", "method" or "const" (used for import aliases).
    kind: str
    # Repo-relative path of the defining file.
    path: str
    # 1-based inclusive line span of the definition.
    start_line: int
    end_line: int
    # Rendered signature, e.g. "func(a, b)" or "Cls(Base)".
    signature: str
    # Decorator expressions rendered as dotted names.
    decorators: list[str] = field(default_factory=list)
    docstring: str | None = None
    # Id of the enclosing symbol, when nested.
    parent_symbol_id: str | None = None
    # Language-specific extras (bases, async flag, import origin, ...).
    lang_payload: dict = field(default_factory=dict)
class SymbolExtractor:
    """Extract symbol records from a parsed Python module."""

    def extract(self, path: str, text: str, ast_tree: ast.AST | None) -> list[PySymbol]:
        """Return symbols for *path*; empty when the module failed to parse."""
        if ast_tree is None:
            return []
        visitor = _SymbolVisitor(path)
        visitor.visit(ast_tree)
        return visitor.symbols
class _SymbolVisitor(ast.NodeVisitor):
    """Collect module-level imports plus class/function/method symbols.

    A stack of (kind, qname, symbol_id) tuples tracks the enclosing scope so
    nested definitions get dotted qualified names and a parent link.
    """

    def __init__(self, path: str) -> None:
        self._path = path
        self._stack: list[tuple[str, str, str]] = []
        self.symbols: list[PySymbol] = []

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        # Only record imports at module level; imports inside defs are local.
        if self._stack:
            return
        module = node.module or ""
        for item in node.names:
            local_name = item.asname or item.name
            imported_name = f"{module}.{item.name}".strip(".")
            self.symbols.append(
                PySymbol(
                    symbol_id=sha256(f"{self._path}|{local_name}|import_alias".encode("utf-8")).hexdigest(),
                    qname=local_name,
                    kind="const",
                    path=self._path,
                    start_line=int(getattr(node, "lineno", 1)),
                    end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
                    signature=f"{local_name} = {imported_name}",
                    lang_payload={"imported_from": imported_name, "import_alias": True},
                )
            )
        self.generic_visit(node)

    def visit_Import(self, node: ast.Import) -> None:
        if self._stack:
            return
        for item in node.names:
            local_name = item.asname or item.name
            self.symbols.append(
                PySymbol(
                    symbol_id=sha256(f"{self._path}|{local_name}|import".encode("utf-8")).hexdigest(),
                    qname=local_name,
                    kind="const",
                    path=self._path,
                    start_line=int(getattr(node, "lineno", 1)),
                    end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
                    signature=f"import {item.name}",
                    lang_payload={"imported_from": item.name, "import_alias": bool(item.asname)},
                )
            )
        self.generic_visit(node)

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        self._add_symbol(node, "class", {"bases": [self._expr_name(base) for base in node.bases]})
        self.generic_visit(node)
        self._stack.pop()

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        self._add_function(node, is_async=False)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        self._add_function(node, is_async=True)

    def _add_function(self, node, *, is_async: bool) -> None:
        # A def directly inside a class body is a method; otherwise a function.
        kind = "method" if self._stack and self._stack[-1][0] == "class" else "function"
        self._add_symbol(node, kind, {"async": is_async})
        self.generic_visit(node)
        self._stack.pop()

    def _add_symbol(self, node, kind: str, lang_payload: dict) -> None:
        """Record one symbol and push it onto the scope stack."""
        parent_qname = self._stack[-1][1] if self._stack else None
        # Previously every stack entry held a *full* dotted qname and all of
        # them were joined again, so a symbol nested two levels deep came out
        # as e.g. "A.A.B.g".  Extending the parent's qname keeps it "A.B.g"
        # and matches the qnames produced by the edge extractor.
        qname = f"{parent_qname}.{node.name}" if parent_qname else node.name
        symbol_id = sha256(f"{self._path}|{qname}|{kind}".encode("utf-8")).hexdigest()
        signature = self._signature(node)
        symbol = PySymbol(
            symbol_id=symbol_id,
            qname=qname,
            kind=kind,
            path=self._path,
            start_line=int(getattr(node, "lineno", 1)),
            end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
            signature=signature,
            decorators=[self._expr_name(item) for item in getattr(node, "decorator_list", [])],
            docstring=ast.get_docstring(node),
            # Link to the parent's *id* — previously this stored the parent's
            # qname, which is inconsistent with every other sha256 id field.
            parent_symbol_id=self._stack[-1][2] if self._stack else None,
            lang_payload=lang_payload,
        )
        self.symbols.append(symbol)
        self._stack.append((kind, qname, symbol_id))

    def _signature(self, node) -> str:
        """Render a compact signature for a class or function node."""
        if isinstance(node, ast.ClassDef):
            bases = ", ".join(self._expr_name(base) for base in node.bases)
            return f"{node.name}({bases})" if bases else node.name
        # NOTE(review): positional-only/keyword-only args and *args/**kwargs
        # are not included — confirm this is acceptable for the catalog.
        args = [arg.arg for arg in getattr(node.args, "args", [])]
        return f"{node.name}({', '.join(args)})"

    def _expr_name(self, node) -> str:
        """Best-effort dotted name for an expression; ast.dump as last resort."""
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            return f"{self._expr_name(node.value)}.{node.attr}"
        if isinstance(node, ast.Call):
            return self._expr_name(node.func)
        return ast.dump(node, include_attributes=False)

View File

@@ -0,0 +1,15 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument
from app.modules.rag.persistence.repository import RagRepository
class RagDocumentUpserter:
    """Thin facade for persisting RAG documents through the repository."""

    def __init__(self, repository: RagRepository) -> None:
        self._repository = repository

    def replace(self, rag_session_id: str, docs: list[RagDocument]) -> None:
        # Pure delegation; presumably replaces the whole document set for the
        # session — confirm semantics against RagRepository.replace_documents.
        self._repository.replace_documents(rag_session_id, docs)

    def apply_changes(self, rag_session_id: str, delete_paths: list[str], docs: list[RagDocument]) -> None:
        # Incremental path: deletions by path plus upserts, delegated as one
        # repository call (ordering/atomicity defined by the repository).
        self._repository.apply_document_changes(rag_session_id, delete_paths, docs)

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass(slots=True)
class IndexReport:
indexed_files: int = 0
failed_files: int = 0
cache_hit_files: int = 0
cache_miss_files: int = 0
documents: int = 0
warnings: list[str] = field(default_factory=list)
def as_tuple(self) -> tuple[int, int, int, int]:
return (
self.indexed_files,
self.failed_files,
self.cache_hit_files,
self.cache_miss_files,
)

View File

@@ -0,0 +1,77 @@
from __future__ import annotations
from dataclasses import dataclass
from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker
@dataclass(slots=True)
class SectionChunk:
    """One heading-delimited section of a markdown document."""

    # Breadcrumb of ancestor headings, joined with " > ".
    section_path: str
    # Innermost heading title (or "Document" for heading-less content).
    section_title: str
    # Section body text, without the heading line itself.
    content: str
    # Monotonic position of the section within the document.
    order: int
class MarkdownDocChunker:
    """Split markdown text into per-heading sections.

    Each section records the heading hierarchy it lives under
    ("H1 > H2 > ...").  Documents with no headings fall back to the plain
    text chunker.
    """

    def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
        self._fallback = text_chunker or DocTextChunker()

    def chunk(self, text: str) -> list[SectionChunk]:
        """Return heading-delimited sections, or windowed fallback chunks."""
        lines = text.splitlines()
        sections: list[SectionChunk] = []
        stack: list[tuple[int, str]] = []
        current_title = "Document"
        current_lines: list[str] = []
        order = 0
        in_fence = False
        for line in lines:
            # Lines inside a ``` / ~~~ code fence must never be treated as
            # headings even if they start with '#' (the previous version
            # split sections on commented-out '#' lines in code samples).
            # Heuristic: any fence marker toggles the state; mismatched
            # opening/closing markers are not tracked.
            if line.lstrip().startswith(("```", "~~~")):
                in_fence = not in_fence
                current_lines.append(line)
                continue
            heading = None if in_fence else self._heading(line)
            if heading is None:
                current_lines.append(line)
                continue
            self._flush_section(sections, stack, current_title, current_lines, order)
            order += 1
            level, title = heading
            # Drop deeper/equal headings so the stack holds only ancestors.
            stack = [item for item in stack if item[0] < level]
            stack.append((level, title))
            current_title = title
            current_lines = []
        self._flush_section(sections, stack, current_title, current_lines, order)
        if sections:
            return sections
        chunks = self._fallback.split(text)
        return [
            SectionChunk(section_path="Document", section_title="Document", content=chunk, order=index)
            for index, chunk in enumerate(chunks)
        ]

    def _flush_section(
        self,
        sections: list[SectionChunk],
        stack: list[tuple[int, str]],
        current_title: str,
        current_lines: list[str],
        order: int,
    ) -> None:
        """Append the accumulated lines as a section; empty content is dropped."""
        content = "\n".join(current_lines).strip()
        if not content:
            return
        titles = [title for _, title in stack] or [current_title]
        sections.append(
            SectionChunk(
                section_path=" > ".join(titles),
                section_title=titles[-1],
                content=content,
                order=order,
            )
        )

    def _heading(self, line: str) -> tuple[int, str] | None:
        """Return (level, title) for an ATX '#' heading line, else None."""
        stripped = line.strip()
        if not stripped.startswith("#"):
            return None
        level = len(stripped) - len(stripped.lstrip("#"))
        title = stripped[level:].strip()
        if not title:
            return None
        return level, title

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
class DocTextChunker:
    """Fixed-size character windows with overlap, for unstructured text."""

    def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None:
        self._max_chars = max_chars
        self._overlap_chars = overlap_chars

    def split(self, text: str) -> list[str]:
        """Split *text* into stripped windows; empty input yields no chunks."""
        stripped = text.strip()
        if not stripped:
            return []
        total = len(stripped)
        windows: list[str] = []
        cursor = 0
        while cursor < total:
            limit = min(total, cursor + self._max_chars)
            windows.append(stripped[cursor:limit].strip())
            if limit >= total:
                break
            # Step back so consecutive windows share a little context.
            cursor = max(0, limit - self._overlap_chars)
        return [window for window in windows if window]

View File

@@ -0,0 +1,18 @@
from __future__ import annotations
from pathlib import PurePosixPath
from app.modules.rag.contracts import DocKind
class DocsClassifier:
    """Heuristically assign a DocKind to a documentation file path."""

    def classify(self, path: str) -> str:
        """Classify by filename prefix and path substrings; order matters."""
        filename = PurePosixPath(path).name.upper()
        lowered_path = path.lower()
        if "runbook" in lowered_path or filename.startswith("RUNBOOK"):
            return DocKind.RUNBOOK
        if filename.startswith("README"):
            return DocKind.README
        # NOTE(review): substring match, so e.g. "inspect.md" also hits
        # "spec" — confirm that is acceptable.
        if "spec" in lowered_path or "architecture" in lowered_path:
            return DocKind.SPEC
        return DocKind.MISC

View File

@@ -0,0 +1,115 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import SectionChunk
class DocsDocumentBuilder:
    """Builds RagDocuments for the four docs layers: module catalog,
    section index, policy index and fact index."""

    def build_module_catalog(self, source: RagSource, frontmatter: dict, summary_text: str, doc_kind: str) -> RagDocument | None:
        """Build a catalog entry from YAML frontmatter.

        Returns None unless the frontmatter carries all three required
        fields: id, type and domain.
        """
        module_id = str(frontmatter.get("id") or "").strip()
        module_type = str(frontmatter.get("type") or "").strip()
        domain = str(frontmatter.get("domain") or "").strip()
        if not module_id or not module_type or not domain:
            return None
        links = frontmatter.get("links") or {}
        metadata = {
            "module_id": module_id,
            "type": module_type,
            "domain": domain,
            "status": frontmatter.get("status"),
            "version": frontmatter.get("version"),
            "tags": frontmatter.get("tags") or [],
            "owners": frontmatter.get("owners") or [],
            "links": links,
            "source_path": source.path,
            # Summary capped to 4000 chars to bound document size.
            "summary_text": summary_text[:4000],
            "doc_kind": doc_kind,
        }
        # Promote each known relation to a top-level metadata key (empty
        # list when the frontmatter does not declare it).
        metadata.update({name: links.get(name, []) for name in (
            "calls_api", "called_by", "uses_logic", "used_by", "reads_db", "writes_db",
            "integrates_with", "emits_events", "consumes_events",
        )})
        return RagDocument(
            layer=RagLayer.DOCS_MODULE_CATALOG,
            source=source,
            title=module_id,
            # Fall back to the module id so text is never empty.
            text=summary_text[:4000] or module_id,
            metadata=metadata,
        )

    def build_section(self, source: RagSource, chunk: SectionChunk, frontmatter: dict, doc_kind: str) -> RagDocument:
        """Build one section-index document from a markdown section chunk."""
        # Without a frontmatter id the file path serves as the module id.
        module_id = str(frontmatter.get("id") or source.path)
        metadata = {
            "module_id": module_id,
            "type": frontmatter.get("type"),
            "domain": frontmatter.get("domain"),
            "tags": frontmatter.get("tags") or [],
            "section_path": chunk.section_path,
            "section_title": chunk.section_title,
            "order": chunk.order,
            "doc_kind": doc_kind,
            "source_path": source.path,
            "artifact_type": "DOCS",
        }
        return RagDocument(
            layer=RagLayer.DOCS_SECTION_INDEX,
            source=source,
            title=f"{module_id}:{chunk.section_title}",
            text=chunk.content,
            metadata=metadata,
        )

    def build_policy(self, source: RagSource, frontmatter: dict, chunk: SectionChunk, doc_kind: str) -> RagDocument | None:
        """Build a policy-index document; None when frontmatter has no id."""
        policy_id = str(frontmatter.get("id") or "").strip()
        applies_to = frontmatter.get("applies_to") or frontmatter.get("type")
        if not policy_id:
            return None
        metadata = {
            "policy_id": policy_id,
            "applies_to": applies_to,
            # The section body is treated as the rule text, capped at 4000.
            "rules": chunk.content[:4000],
            "default_behaviors": frontmatter.get("default_behaviors") or [],
            "doc_kind": doc_kind,
            "section_path": chunk.section_path,
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_POLICY_INDEX,
            source=source,
            title=policy_id,
            text=chunk.content[:4000],
            metadata=metadata,
        )

    def build_fact(
        self,
        source: RagSource,
        *,
        subject_id: str,
        predicate: str,
        obj: str,
        object_ref: str | None,
        anchor: str,
        tags: list[str] | None = None,
    ) -> RagDocument:
        """Build a fact-index document for one (subject, predicate, object).

        The fact id is a sha256 over the triple plus path and anchor, so the
        same fact stated in two places yields two distinct documents.
        """
        fact_id = sha256(f"{subject_id}|{predicate}|{obj}|{source.path}|{anchor}".encode("utf-8")).hexdigest()
        metadata = {
            "fact_id": fact_id,
            "subject_id": subject_id,
            "predicate": predicate,
            "object": obj,
            "object_ref": object_ref,
            "anchor": anchor,
            "tags": tags or [],
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_FACT_INDEX,
            source=source,
            title=f"{subject_id}:{predicate}",
            text=f"{subject_id} {predicate} {obj}".strip(),
            metadata=metadata,
            links=[EvidenceLink(type=EvidenceType.DOC_FACT, target_id=fact_id, path=source.path, note=anchor)],
        )

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
from pathlib import PurePosixPath
class DocsFileFilter:
    """Select documentation-like files for indexing."""

    _EXTENSIONS = {".md", ".rst", ".txt", ".adoc"}
    _NAMES = ("README", "CHANGELOG", "CONTRIBUTING", "ARCHITECTURE", "SPEC", "RUNBOOK")
    _EXCLUDE_PARTS = {"node_modules", ".git", "vendor", "dist", "build", "target", "__pycache__"}

    def should_index(self, path: str) -> bool:
        """True for doc extensions, well-known doc filenames, or docs/ trees."""
        candidate = PurePosixPath(path)
        if not self._EXCLUDE_PARTS.isdisjoint(candidate.parts):
            return False
        if candidate.suffix.lower() in self._EXTENSIONS:
            return True
        # str.startswith accepts a tuple of prefixes.
        if candidate.name.upper().startswith(self._NAMES):
            return True
        # Anything under a docs-like directory counts as documentation.
        joined = "/".join(candidate.parts).lower()
        return any(marker in joined for marker in ("docs/", "doc/", "documentation/"))

View File

@@ -0,0 +1,135 @@
from __future__ import annotations
import re
import yaml
from app.modules.rag.contracts import RagDocument, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker
from app.modules.rag.indexing.docs.classifier import DocsClassifier
from app.modules.rag.indexing.docs.document_builder import DocsDocumentBuilder
from app.modules.rag.indexing.docs.file_filter import DocsFileFilter
class DocsIndexingPipeline:
    """Turn one documentation file into RAG documents.

    Produces (when applicable): a module-catalog document from YAML
    frontmatter, one document per markdown section, a policy document for
    "type: policy" files, and fact documents mined from frontmatter links,
    markdown tables and bullet lists.
    """

    def __init__(self) -> None:
        self._filter = DocsFileFilter()
        self._classifier = DocsClassifier()
        self._chunker = MarkdownDocChunker()
        self._builder = DocsDocumentBuilder()

    def supports(self, path: str) -> bool:
        """True when *path* looks like an indexable documentation file."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Build every document derivable from one doc file's content."""
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        frontmatter, body = self._split_frontmatter(content)
        doc_kind = self._classifier.classify(path)
        sections = self._chunker.chunk(body)
        summary_text = self._summary_from_sections(sections)
        docs: list[RagDocument] = []
        module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind)
        if module_doc is not None:
            docs.append(module_doc)
        for section in sections:
            docs.append(self._builder.build_section(source, section, frontmatter, doc_kind))
        if str(frontmatter.get("type") or "").strip() == "policy":
            # Only the first section is treated as the policy body.
            for section in sections[:1]:
                policy = self._builder.build_policy(source, frontmatter, section, doc_kind)
                if policy is not None:
                    docs.append(policy)
        docs.extend(self._extract_facts(source, frontmatter, sections))
        return docs

    def _split_frontmatter(self, content: str) -> tuple[dict, str]:
        """Split leading YAML frontmatter from the body.

        Returns ({}, content) when there is no, unterminated, or malformed
        frontmatter.  The previous version raised ValueError on a file that
        opened a "---" fence without closing it, and let yaml parse errors
        propagate out of the pipeline.
        """
        if not content.startswith("---\n"):
            return {}, content
        parts = content.split("---", 2)
        if len(parts) < 3:
            # Opening fence without a closing fence: treat as plain body.
            return {}, content
        _, raw, body = parts
        try:
            payload = yaml.safe_load(raw) or {}
        except yaml.YAMLError:
            payload = {}
        return payload if isinstance(payload, dict) else {}, body.strip()

    def _summary_from_sections(self, sections) -> str:
        """First two sections, capped to 4000 chars, as a catalog summary."""
        text = "\n\n".join(section.content for section in sections[:2]).strip()
        return text[:4000]

    def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]:
        """Mine structured facts from frontmatter links, tables and lists."""
        subject_id = str(frontmatter.get("id") or source.path)
        docs: list[RagDocument] = []
        links = frontmatter.get("links") or {}
        for predicate, values in links.items():
            for value in values or []:
                docs.append(
                    self._builder.build_fact(
                        source,
                        subject_id=subject_id,
                        predicate=predicate,
                        obj=str(value),
                        object_ref=str(value),
                        anchor="frontmatter.links",
                    )
                )
        for section in sections:
            docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content))
            docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content))
        return docs

    def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Parse a markdown table whose first non-empty line is the header."""
        lines = [line.strip() for line in content.splitlines() if line.strip()]
        # Need at least header, separator and one data row.
        if len(lines) < 3 or "|" not in lines[0]:
            return []
        headers = [part.strip().lower() for part in lines[0].strip("|").split("|")]
        if not all(headers):
            return []
        docs: list[RagDocument] = []
        for row in lines[2:]:
            if "|" not in row:
                continue
            values = [part.strip() for part in row.strip("|").split("|")]
            if len(values) != len(headers):
                # Skip ragged rows rather than guessing the alignment.
                continue
            payload = dict(zip(headers, values))
            docs.extend(self._facts_from_row(source, subject_id, title, payload))
        return docs

    def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]:
        """Translate one table row into facts, keyed on the section title."""
        docs: list[RagDocument] = []
        name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code")
        if "request" in title.lower() or "response" in title.lower():
            if name:
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_field", obj=name, object_ref=None, anchor=title))
            if payload.get("required"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_required", obj=f"{name}:{payload['required']}", object_ref=None, anchor=title))
            if payload.get("type"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_type", obj=f"{name}:{payload['type']}", object_ref=None, anchor=title))
            if payload.get("validation"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_validation", obj=f"{name}:{payload['validation']}", object_ref=None, anchor=title))
        if "error" in title.lower():
            if payload.get("status"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_status", obj=payload["status"], object_ref=None, anchor=title))
            if payload.get("error") or payload.get("code"):
                error_value = payload.get("error") or payload.get("code")
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_error", obj=error_value, object_ref=None, anchor=title))
            if payload.get("client action"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="client_action", obj=payload["client action"], object_ref=None, anchor=title))
        if "constraint" in title.lower() and name:
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_constraint", obj=name, object_ref=None, anchor=title))
        return docs

    def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Turn bullet / numbered list items into facts by prefix keyword."""
        docs: list[RagDocument] = []
        for line in content.splitlines():
            item = line.strip()
            if not item.startswith(("-", "*", "1.", "2.", "3.")):
                continue
            # Drop the list marker and any leading numbering.
            normalized = re.sub(r"^[-*0-9. ]+", "", item).strip()
            lowered = normalized.lower()
            if lowered.startswith("metric:"):
                predicate = "emits_metric"
            elif lowered.startswith("event:"):
                predicate = "emits_analytics_event"
            elif lowered.startswith("log:"):
                predicate = "logs_event"
            else:
                predicate = "validates_rule" if "rule" in title.lower() else "client_action"
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title))
        return docs