Новый RAG

This commit is contained in:
2026-03-01 14:21:33 +03:00
parent 2728c07ba9
commit 1ef0b4d68c
95 changed files with 3145 additions and 927 deletions

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
import ast
from dataclasses import dataclass
@dataclass(slots=True)
class CodeChunk:
    """A contiguous slice of a source file prepared for RAG indexing."""

    # Human-readable identifier, e.g. "path/to/file.py:symbol_name" or
    # "path/to/file.py:10-90" for window chunks.
    title: str
    # Raw source text of the chunk (stripped of surrounding blank lines).
    text: str
    # 1-based inclusive line range of the chunk within the original file.
    start_line: int
    end_line: int
    # "symbol_block" for AST-derived chunks, "window" for the fallback.
    chunk_type: str
class CodeTextChunker:
    """Split Python source text into retrievable chunks.

    When the module parses, each top-level class/function definition becomes
    one chunk; otherwise — or when nothing was extracted — the text falls
    back to fixed-size line windows with a small overlap.
    """

    _WINDOW_SIZE = 80
    _WINDOW_OVERLAP = 15

    def chunk(self, path: str, text: str) -> list[CodeChunk]:
        """Return chunks for *text*; *path* is used only in chunk titles."""
        try:
            module = ast.parse(text)
        except SyntaxError:
            # Unparseable source still gets indexed, just without structure.
            return self._window_chunks(path, text)
        source_lines = text.splitlines()
        wanted = (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)
        result: list[CodeChunk] = []
        for node in module.body:
            if not isinstance(node, wanted):
                continue
            first = int(getattr(node, "lineno", 1))
            last = int(getattr(node, "end_lineno", first))
            snippet = "\n".join(source_lines[first - 1 : last]).strip()
            if not snippet:
                continue
            result.append(
                CodeChunk(
                    title=f"{path}:{getattr(node, 'name', 'block')}",
                    text=snippet,
                    start_line=first,
                    end_line=last,
                    chunk_type="symbol_block",
                )
            )
        return result if result else self._window_chunks(path, text)

    def _window_chunks(self, path: str, text: str) -> list[CodeChunk]:
        """Fallback: overlapping fixed-size line windows over the raw text."""
        source_lines = text.splitlines()
        total = len(source_lines)
        result: list[CodeChunk] = []
        lo = 0
        while lo < total:
            hi = min(total, lo + self._WINDOW_SIZE)
            snippet = "\n".join(source_lines[lo:hi]).strip()
            if snippet:
                result.append(CodeChunk(f"{path}:{lo + 1}-{hi}", snippet, lo + 1, hi, "window"))
            if hi >= total:
                break
            # Step back so consecutive windows share some context.
            lo = max(0, hi - self._WINDOW_OVERLAP)
        return result

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.code_text.chunker import CodeChunk
class CodeTextDocumentBuilder:
    """Wrap a CodeChunk into a RagDocument on the CODE_SOURCE_CHUNKS layer."""

    def build(self, source: RagSource, chunk: CodeChunk, *, chunk_index: int) -> RagDocument:
        """Build a document; *chunk_index* is the chunk's position in its file."""
        # Derive a dotted module name from the repo-relative file path.
        module_name = source.path.replace("/", ".").removesuffix(".py")
        metadata = {
            "chunk_index": chunk_index,
            "chunk_type": chunk.chunk_type,
            "module_or_unit": module_name,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_SOURCE_CHUNKS,
            lang="python",
            source=source,
            title=chunk.title,
            text=chunk.text,
            span=RagSpan(chunk.start_line, chunk.end_line),
            metadata=metadata,
        )

View File

@@ -0,0 +1,29 @@
from __future__ import annotations
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.edges.extractor import PyEdge
class EdgeDocumentBuilder:
    """Wrap a PyEdge into a RagDocument on the dependency-graph layer."""

    def build(self, source: RagSource, edge: PyEdge) -> RagDocument:
        """Build a document; unresolved targets fall back to "unknown"."""
        target = edge.dst_ref or edge.dst_symbol_id or "unknown"
        evidence = EvidenceLink(
            type=EvidenceType.EDGE,
            target_id=edge.edge_id,
            path=source.path,
            start_line=edge.start_line,
            end_line=edge.end_line,
        )
        metadata = {
            "edge_id": edge.edge_id,
            "edge_type": edge.edge_type,
            "src_symbol_id": edge.src_symbol_id,
            "src_qname": edge.src_qname,
            "dst_symbol_id": edge.dst_symbol_id,
            "dst_ref": edge.dst_ref,
            "resolution": edge.resolution,
            "lang_payload": edge.metadata,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_DEPENDENCY_GRAPH,
            lang="python",
            source=source,
            title=f"{edge.src_qname}:{edge.edge_type}",
            text=f"{edge.src_qname} {edge.edge_type} {target}",
            span=RagSpan(edge.start_line, edge.end_line),
            metadata=metadata,
            links=[evidence],
        )

View File

@@ -0,0 +1,114 @@
from __future__ import annotations
import ast
from dataclasses import dataclass, field
from hashlib import sha256
@dataclass(slots=True)
class PyEdge:
    """One dependency edge between Python symbols in a single file."""

    # Stable sha256 digest of path, endpoints and line number.
    edge_id: str
    # "imports", "inherits" or "calls" (produced by _EdgeVisitor).
    edge_type: str
    # Source symbol: id from the symbol catalog, plus its qualified name.
    src_symbol_id: str
    src_qname: str
    # Target symbol id when resolvable within the same file, else None.
    dst_symbol_id: str | None
    # Textual reference to the target (dotted name) when available.
    dst_ref: str | None
    # Repo-relative path of the file the edge was found in.
    path: str
    # 1-based inclusive span of the originating AST node.
    start_line: int
    end_line: int
    # "resolved" when dst_symbol_id was found, otherwise "partial".
    resolution: str = "partial"
    metadata: dict = field(default_factory=dict)
class EdgeExtractor:
    """Extract dependency edges from a parsed Python module."""

    def extract(self, path: str, ast_tree: ast.AST | None, symbols: list) -> list[PyEdge]:
        """Return edges for *path*; empty when the module failed to parse."""
        if ast_tree is None:
            return []
        # Map qualified names to symbol ids so edge targets can be resolved.
        known = {item.qname: item.symbol_id for item in symbols}
        walker = _EdgeVisitor(path, known)
        walker.visit(ast_tree)
        return walker.edges
class _EdgeVisitor(ast.NodeVisitor):
    """AST visitor that records import / inheritance / call edges.

    Scope is tracked as a stack of names so edge sources carry qualified
    names like "Outer.method".  Module-level imports are skipped because
    there is no enclosing symbol to attach them to.
    """

    def __init__(self, path: str, qname_map: dict[str, str]) -> None:
        self._path = path
        # qname -> symbol_id for symbols extracted from the same file.
        self._qname_map = qname_map
        self._scope: list[str] = []
        self.edges: list[PyEdge] = []

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        current = self._enter(node.name)
        # One "inherits" edge per base-class expression.
        for base in node.bases:
            self._add_edge("inherits", current, self._name(base), base)
        self.generic_visit(node)
        self._scope.pop()

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        self._visit_function(node)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        self._visit_function(node)

    def visit_Import(self, node: ast.Import) -> None:
        current = self._current_qname()
        if not current:
            # Module-level import: no source symbol to attach the edge to.
            return
        for item in node.names:
            self._add_edge("imports", current, item.name, node)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        current = self._current_qname()
        if not current:
            return
        module = node.module or ""
        for item in node.names:
            self._add_edge("imports", current, f"{module}.{item.name}".strip("."), node)

    def _visit_function(self, node) -> None:
        current = self._enter(node.name)
        # Record call edges only for calls made directly in this function's
        # own body.  The previous ast.walk() also descended into nested
        # defs, while generic_visit() re-visited those defs too — so every
        # call inside a nested function produced duplicate edges attributed
        # to each enclosing function.  generic_visit() below still recurses,
        # so nested defs get their own correctly-scoped edges.
        for inner in self._local_nodes(node):
            if isinstance(inner, ast.Call):
                self._add_edge("calls", current, self._name(inner.func), inner, {"callsite_kind": "function_call"})
        self.generic_visit(node)
        self._scope.pop()

    @staticmethod
    def _local_nodes(node):
        """Yield descendants of *node* without entering nested def/class scopes."""
        pending = list(ast.iter_child_nodes(node))
        while pending:
            child = pending.pop()
            yield child
            # Stop at scope boundaries; their bodies are handled by their
            # own visit_* calls.  Lambdas are deliberately traversed so
            # calls inside them are still attributed to this function.
            if isinstance(child, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            pending.extend(ast.iter_child_nodes(child))

    def _enter(self, name: str) -> str:
        """Push *name* onto the scope stack and return the new qualified name."""
        self._scope.append(name)
        return self._current_qname() or name

    def _current_qname(self) -> str | None:
        if not self._scope:
            return None
        return ".".join(self._scope)

    def _add_edge(self, edge_type: str, src_qname: str, dst_ref: str, node, extra: dict | None = None) -> None:
        """Append a PyEdge; resolution is "resolved" only for same-file targets."""
        if not dst_ref:
            return
        # Fall back to a hash of the qname when the source symbol is unknown.
        src_symbol_id = self._qname_map.get(src_qname, sha256(src_qname.encode("utf-8")).hexdigest())
        dst_symbol_id = self._qname_map.get(dst_ref)
        edge_id = sha256(f"{self._path}|{src_qname}|{edge_type}|{dst_ref}|{getattr(node, 'lineno', 1)}".encode("utf-8")).hexdigest()
        self.edges.append(
            PyEdge(
                edge_id=edge_id,
                edge_type=edge_type,
                src_symbol_id=src_symbol_id,
                src_qname=src_qname,
                dst_symbol_id=dst_symbol_id,
                dst_ref=dst_ref,
                path=self._path,
                start_line=int(getattr(node, "lineno", 1)),
                end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
                resolution="resolved" if dst_symbol_id else "partial",
                metadata=extra or {},
            )
        )

    def _name(self, node) -> str:
        """Best-effort dotted name for an expression; "" when not nameable."""
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            return f"{self._name(node.value)}.{node.attr}"
        if isinstance(node, ast.Call):
            return self._name(node.func)
        return ""

View File

@@ -0,0 +1,26 @@
from __future__ import annotations
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class EntrypointDocumentBuilder:
    """Wrap an Entrypoint into a RagDocument on the entrypoints layer."""

    def build(self, source: RagSource, entrypoint: Entrypoint) -> RagDocument:
        """Build a searchable document describing one HTTP/CLI entrypoint."""
        evidence = EvidenceLink(
            type=EvidenceType.CODE_SPAN,
            target_id=entrypoint.entry_id,
            path=source.path,
            start_line=entrypoint.start_line,
            end_line=entrypoint.end_line,
        )
        metadata = {
            "entry_id": entrypoint.entry_id,
            "entry_type": entrypoint.entry_type,
            "framework": entrypoint.framework,
            "route_or_command": entrypoint.route_or_command,
            "handler_symbol_id": entrypoint.handler_symbol_id,
            "lang_payload": entrypoint.metadata,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_ENTRYPOINTS,
            lang="python",
            source=source,
            title=entrypoint.route_or_command,
            text=f"{entrypoint.framework} {entrypoint.entry_type} {entrypoint.route_or_command}",
            span=RagSpan(entrypoint.start_line, entrypoint.end_line),
            metadata=metadata,
            links=[evidence],
        )

View File

@@ -0,0 +1,34 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class FastApiEntrypointDetector:
    """Detect FastAPI HTTP handlers by their routing decorators."""

    _METHODS = {"get", "post", "put", "patch", "delete"}

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one HTTP Entrypoint per routing decorator on *symbols*."""
        found: list[Entrypoint] = []
        for symbol in symbols:
            for decorator in symbol.decorators or []:
                lowered = decorator.lower()
                method = lowered.split(".")[-1]
                is_method_call = method in self._METHODS
                # Accept either app.<verb>(...) or a generic *.route(...).
                if not is_method_call and ".route" not in lowered:
                    continue
                # Keep argument text when the decorator string looks like a call.
                if "(" in decorator:
                    route = decorator.split("(")[-1].rstrip(")")
                else:
                    route = decorator
                found.append(
                    Entrypoint(
                        entry_id=sha256(f"{path}|fastapi|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(),
                        entry_type="http",
                        framework="fastapi",
                        route_or_command=route,
                        handler_symbol_id=symbol.symbol_id,
                        path=path,
                        start_line=symbol.start_line,
                        end_line=symbol.end_line,
                        metadata={"methods": [method.upper()] if is_method_call else []},
                    )
                )
        return found

View File

@@ -0,0 +1,28 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class FlaskEntrypointDetector:
    """Detect Flask HTTP handlers via "*.route(...)" decorators."""

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one HTTP Entrypoint per route decorator on *symbols*."""
        found: list[Entrypoint] = []
        for symbol in symbols:
            # Only decorators mentioning ".route" qualify.
            route_decorators = [item for item in (symbol.decorators or []) if ".route" in item.lower()]
            for decorator in route_decorators:
                found.append(
                    Entrypoint(
                        entry_id=sha256(f"{path}|flask|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(),
                        entry_type="http",
                        framework="flask",
                        route_or_command=decorator,
                        handler_symbol_id=symbol.symbol_id,
                        path=path,
                        start_line=symbol.start_line,
                        end_line=symbol.end_line,
                    )
                )
        return found

View File

@@ -0,0 +1,27 @@
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass(slots=True)
class Entrypoint:
    """A framework entrypoint (HTTP route or CLI command) found in source."""

    # Stable sha256 digest of path, framework, handler id and decorator text.
    entry_id: str
    # "http" or "cli" (set by the concrete detectors).
    entry_type: str
    # Detector that produced it: "fastapi", "flask", "typer" or "click".
    framework: str
    # Decorator-derived text naming the route or command.
    route_or_command: str
    # symbol_id of the decorated handler function.
    handler_symbol_id: str
    # Repo-relative path of the defining file.
    path: str
    # 1-based inclusive line span of the handler symbol.
    start_line: int
    end_line: int
    metadata: dict = field(default_factory=dict)
class EntrypointDetectorRegistry:
    """Fan a detection request out to every registered detector."""

    def __init__(self, detectors: list) -> None:
        self._detectors = detectors

    def detect_all(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Concatenate each detector's results, in registration order."""
        return [
            entry
            for detector in self._detectors
            for entry in detector.detect(path=path, symbols=symbols)
        ]

View File

@@ -0,0 +1,29 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class TyperClickEntrypointDetector:
    """Detect Typer/Click CLI commands by their decorator names."""

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one CLI Entrypoint per command/callback decorator."""
        found: list[Entrypoint] = []
        for symbol in symbols:
            for decorator in symbol.decorators or []:
                lowered = decorator.lower()
                if ".command" not in lowered and ".callback" not in lowered:
                    continue
                # Decorators mentioning "typer" win; everything else is click.
                framework = "typer" if "typer" in lowered else "click"
                found.append(
                    Entrypoint(
                        entry_id=sha256(f"{path}|{framework}|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(),
                        entry_type="cli",
                        framework=framework,
                        route_or_command=decorator,
                        handler_symbol_id=symbol.symbol_id,
                        path=path,
                        start_line=symbol.start_line,
                        end_line=symbol.end_line,
                    )
                )
        return found

View File

@@ -0,0 +1,13 @@
from __future__ import annotations
from pathlib import PurePosixPath
class PythonFileFilter:
    """Decide which repository paths are Python sources worth indexing."""

    _EXCLUDE_PARTS = {"venv", ".venv", "__pycache__", "node_modules", ".git", "dist", "build"}

    def should_index(self, path: str) -> bool:
        """True for *.py paths that do not pass through an excluded directory."""
        candidate = PurePosixPath(path)
        if candidate.suffix.lower() != ".py":
            return False
        # Reject the path if any component is a known vendored/cache dir.
        return self._EXCLUDE_PARTS.isdisjoint(candidate.parts)

View File

@@ -0,0 +1,52 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagSource
from app.modules.rag.indexing.code.code_text.chunker import CodeTextChunker
from app.modules.rag.indexing.code.code_text.document_builder import CodeTextDocumentBuilder
from app.modules.rag.indexing.code.edges.document_builder import EdgeDocumentBuilder
from app.modules.rag.indexing.code.edges.extractor import EdgeExtractor
from app.modules.rag.indexing.code.entrypoints.document_builder import EntrypointDocumentBuilder
from app.modules.rag.indexing.code.entrypoints.fastapi_detector import FastApiEntrypointDetector
from app.modules.rag.indexing.code.entrypoints.flask_detector import FlaskEntrypointDetector
from app.modules.rag.indexing.code.entrypoints.registry import EntrypointDetectorRegistry
from app.modules.rag.indexing.code.entrypoints.typer_click_detector import TyperClickEntrypointDetector
from app.modules.rag.indexing.code.file_filter import PythonFileFilter
from app.modules.rag.indexing.code.symbols.ast_parser import PythonAstParser
from app.modules.rag.indexing.code.symbols.document_builder import SymbolDocumentBuilder
from app.modules.rag.indexing.code.symbols.extractor import SymbolExtractor
class CodeIndexingPipeline:
    """Turn one Python file into RAG documents across four layers:
    raw code chunks, symbol catalog, dependency edges and entrypoints."""

    def __init__(self) -> None:
        self._filter = PythonFileFilter()
        self._chunker = CodeTextChunker()
        self._code_builder = CodeTextDocumentBuilder()
        self._parser = PythonAstParser()
        self._symbols = SymbolExtractor()
        self._symbol_builder = SymbolDocumentBuilder()
        self._edges = EdgeExtractor()
        self._edge_builder = EdgeDocumentBuilder()
        detectors = [FastApiEntrypointDetector(), FlaskEntrypointDetector(), TyperClickEntrypointDetector()]
        self._entrypoints = EntrypointDetectorRegistry(detectors)
        self._entrypoint_builder = EntrypointDocumentBuilder()

    def supports(self, path: str) -> bool:
        """True when *path* looks like an indexable Python source file."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Produce every document derivable from a single file's content."""
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        # Raw text chunks first; they exist even when the file fails to parse.
        docs: list[RagDocument] = [
            self._code_builder.build(source, chunk, chunk_index=index)
            for index, chunk in enumerate(self._chunker.chunk(path, content))
        ]
        tree = self._parser.parse_module(content)
        symbols = self._symbols.extract(path, content, tree)
        docs.extend(self._symbol_builder.build(source, symbol) for symbol in symbols)
        docs.extend(self._edge_builder.build(source, edge) for edge in self._edges.extract(path, tree, symbols))
        docs.extend(
            self._entrypoint_builder.build(source, entry)
            for entry in self._entrypoints.detect_all(path=path, symbols=symbols)
        )
        return docs

View File

@@ -0,0 +1,11 @@
from __future__ import annotations
import ast
class PythonAstParser:
def parse_module(self, text: str) -> ast.AST | None:
try:
return ast.parse(text)
except SyntaxError:
return None

View File

@@ -0,0 +1,32 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.symbols.extractor import PySymbol
class SymbolDocumentBuilder:
    """Wrap a PySymbol into a RagDocument on the symbol-catalog layer."""

    def build(self, source: RagSource, symbol: PySymbol) -> RagDocument:
        """Build a document whose text is kind+qname, signature and docstring."""
        parts = [f"{symbol.kind} {symbol.qname}", symbol.signature]
        if symbol.docstring:
            parts.append(symbol.docstring.strip())
        text = "\n".join(part for part in parts if part)
        # Derive a dotted module name from the repo-relative file path.
        module_name = source.path.replace("/", ".").removesuffix(".py")
        metadata = {
            "symbol_id": symbol.symbol_id,
            "qname": symbol.qname,
            "kind": symbol.kind,
            "signature": symbol.signature,
            "decorators_or_annotations": symbol.decorators,
            "docstring_or_javadoc": symbol.docstring,
            "parent_symbol_id": symbol.parent_symbol_id,
            "package_or_module": module_name,
            # Decorated symbols may be framework entrypoints.
            "is_entry_candidate": bool(symbol.decorators),
            "lang_payload": symbol.lang_payload,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_SYMBOL_CATALOG,
            lang="python",
            source=source,
            title=symbol.qname,
            text=text,
            span=RagSpan(symbol.start_line, symbol.end_line),
            metadata=metadata,
        )

View File

@@ -0,0 +1,130 @@
from __future__ import annotations
import ast
from dataclasses import dataclass, field
from hashlib import sha256
@dataclass(slots=True)
class PySymbol:
    """One Python symbol (class, function, method or import alias)."""

    # Stable sha256 digest of path, qname and kind.
    symbol_id: str
    # Dotted qualified name within the module (e.g. "ClassName.method").
    qname: str
    # "class", "function", "method" or "const" (used for import aliases).
    kind: str
    # Repo-relative path of the defining file.
    path: str
    # 1-based inclusive line span of the definition.
    start_line: int
    end_line: int
    # Rendered signature, e.g. "func(a, b)" or "Cls(Base)".
    signature: str
    # Decorator expressions rendered as dotted names.
    decorators: list[str] = field(default_factory=list)
    docstring: str | None = None
    # Id of the enclosing symbol, when nested.
    parent_symbol_id: str | None = None
    # Language-specific extras (bases, async flag, import origin, ...).
    lang_payload: dict = field(default_factory=dict)
class SymbolExtractor:
    """Extract symbol records from a parsed Python module."""

    def extract(self, path: str, text: str, ast_tree: ast.AST | None) -> list[PySymbol]:
        """Return symbols for *path*; empty when the module failed to parse."""
        if ast_tree is None:
            return []
        visitor = _SymbolVisitor(path)
        visitor.visit(ast_tree)
        return visitor.symbols
class _SymbolVisitor(ast.NodeVisitor):
    """Collect module-level imports plus class/function/method symbols.

    A stack of (kind, qname, symbol_id) tuples tracks the enclosing scope so
    nested definitions get dotted qualified names and a parent link.
    """

    def __init__(self, path: str) -> None:
        self._path = path
        self._stack: list[tuple[str, str, str]] = []
        self.symbols: list[PySymbol] = []

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        # Only record imports at module level; imports inside defs are local.
        if self._stack:
            return
        module = node.module or ""
        for item in node.names:
            local_name = item.asname or item.name
            imported_name = f"{module}.{item.name}".strip(".")
            self.symbols.append(
                PySymbol(
                    symbol_id=sha256(f"{self._path}|{local_name}|import_alias".encode("utf-8")).hexdigest(),
                    qname=local_name,
                    kind="const",
                    path=self._path,
                    start_line=int(getattr(node, "lineno", 1)),
                    end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
                    signature=f"{local_name} = {imported_name}",
                    lang_payload={"imported_from": imported_name, "import_alias": True},
                )
            )
        self.generic_visit(node)

    def visit_Import(self, node: ast.Import) -> None:
        if self._stack:
            return
        for item in node.names:
            local_name = item.asname or item.name
            self.symbols.append(
                PySymbol(
                    symbol_id=sha256(f"{self._path}|{local_name}|import".encode("utf-8")).hexdigest(),
                    qname=local_name,
                    kind="const",
                    path=self._path,
                    start_line=int(getattr(node, "lineno", 1)),
                    end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
                    signature=f"import {item.name}",
                    lang_payload={"imported_from": item.name, "import_alias": bool(item.asname)},
                )
            )
        self.generic_visit(node)

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        self._add_symbol(node, "class", {"bases": [self._expr_name(base) for base in node.bases]})
        self.generic_visit(node)
        self._stack.pop()

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        self._add_function(node, is_async=False)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        self._add_function(node, is_async=True)

    def _add_function(self, node, *, is_async: bool) -> None:
        # A def directly inside a class body is a method; otherwise a function.
        kind = "method" if self._stack and self._stack[-1][0] == "class" else "function"
        self._add_symbol(node, kind, {"async": is_async})
        self.generic_visit(node)
        self._stack.pop()

    def _add_symbol(self, node, kind: str, lang_payload: dict) -> None:
        """Record one symbol and push it onto the scope stack."""
        parent_qname = self._stack[-1][1] if self._stack else None
        # Previously every stack entry held a *full* dotted qname and all of
        # them were joined again, so a symbol nested two levels deep came out
        # as e.g. "A.A.B.g".  Extending the parent's qname keeps it "A.B.g"
        # and matches the qnames produced by the edge extractor.
        qname = f"{parent_qname}.{node.name}" if parent_qname else node.name
        symbol_id = sha256(f"{self._path}|{qname}|{kind}".encode("utf-8")).hexdigest()
        signature = self._signature(node)
        symbol = PySymbol(
            symbol_id=symbol_id,
            qname=qname,
            kind=kind,
            path=self._path,
            start_line=int(getattr(node, "lineno", 1)),
            end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
            signature=signature,
            decorators=[self._expr_name(item) for item in getattr(node, "decorator_list", [])],
            docstring=ast.get_docstring(node),
            # Link to the parent's *id* — previously this stored the parent's
            # qname, which is inconsistent with every other sha256 id field.
            parent_symbol_id=self._stack[-1][2] if self._stack else None,
            lang_payload=lang_payload,
        )
        self.symbols.append(symbol)
        self._stack.append((kind, qname, symbol_id))

    def _signature(self, node) -> str:
        """Render a compact signature for a class or function node."""
        if isinstance(node, ast.ClassDef):
            bases = ", ".join(self._expr_name(base) for base in node.bases)
            return f"{node.name}({bases})" if bases else node.name
        # NOTE(review): positional-only/keyword-only args and *args/**kwargs
        # are not included — confirm this is acceptable for the catalog.
        args = [arg.arg for arg in getattr(node.args, "args", [])]
        return f"{node.name}({', '.join(args)})"

    def _expr_name(self, node) -> str:
        """Best-effort dotted name for an expression; ast.dump as last resort."""
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            return f"{self._expr_name(node.value)}.{node.attr}"
        if isinstance(node, ast.Call):
            return self._expr_name(node.func)
        return ast.dump(node, include_attributes=False)

View File

@@ -0,0 +1,15 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument
from app.modules.rag.persistence.repository import RagRepository
class RagDocumentUpserter:
    """Thin facade for persisting RAG documents through the repository."""

    def __init__(self, repository: RagRepository) -> None:
        self._repository = repository

    def replace(self, rag_session_id: str, docs: list[RagDocument]) -> None:
        # Pure delegation; presumably replaces the whole document set for the
        # session — confirm semantics against RagRepository.replace_documents.
        self._repository.replace_documents(rag_session_id, docs)

    def apply_changes(self, rag_session_id: str, delete_paths: list[str], docs: list[RagDocument]) -> None:
        # Incremental path: deletions by path plus upserts, delegated as one
        # repository call (ordering/atomicity defined by the repository).
        self._repository.apply_document_changes(rag_session_id, delete_paths, docs)

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass(slots=True)
class IndexReport:
indexed_files: int = 0
failed_files: int = 0
cache_hit_files: int = 0
cache_miss_files: int = 0
documents: int = 0
warnings: list[str] = field(default_factory=list)
def as_tuple(self) -> tuple[int, int, int, int]:
return (
self.indexed_files,
self.failed_files,
self.cache_hit_files,
self.cache_miss_files,
)

View File

@@ -0,0 +1,77 @@
from __future__ import annotations
from dataclasses import dataclass
from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker
@dataclass(slots=True)
class SectionChunk:
    """One heading-delimited section of a markdown document."""

    # Breadcrumb of ancestor headings, joined with " > ".
    section_path: str
    # Innermost heading title (or "Document" for heading-less content).
    section_title: str
    # Section body text, without the heading line itself.
    content: str
    # Monotonic position of the section within the document.
    order: int
class MarkdownDocChunker:
    """Split markdown text into per-heading sections.

    Each section records the heading hierarchy it lives under
    ("H1 > H2 > ...").  Documents with no headings fall back to the plain
    text chunker.
    """

    def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
        self._fallback = text_chunker or DocTextChunker()

    def chunk(self, text: str) -> list[SectionChunk]:
        """Return heading-delimited sections, or windowed fallback chunks."""
        lines = text.splitlines()
        sections: list[SectionChunk] = []
        stack: list[tuple[int, str]] = []
        current_title = "Document"
        current_lines: list[str] = []
        order = 0
        in_fence = False
        for line in lines:
            # Lines inside a ``` / ~~~ code fence must never be treated as
            # headings even if they start with '#' (the previous version
            # split sections on commented-out '#' lines in code samples).
            # Heuristic: any fence marker toggles the state; mismatched
            # opening/closing markers are not tracked.
            if line.lstrip().startswith(("```", "~~~")):
                in_fence = not in_fence
                current_lines.append(line)
                continue
            heading = None if in_fence else self._heading(line)
            if heading is None:
                current_lines.append(line)
                continue
            self._flush_section(sections, stack, current_title, current_lines, order)
            order += 1
            level, title = heading
            # Drop deeper/equal headings so the stack holds only ancestors.
            stack = [item for item in stack if item[0] < level]
            stack.append((level, title))
            current_title = title
            current_lines = []
        self._flush_section(sections, stack, current_title, current_lines, order)
        if sections:
            return sections
        chunks = self._fallback.split(text)
        return [
            SectionChunk(section_path="Document", section_title="Document", content=chunk, order=index)
            for index, chunk in enumerate(chunks)
        ]

    def _flush_section(
        self,
        sections: list[SectionChunk],
        stack: list[tuple[int, str]],
        current_title: str,
        current_lines: list[str],
        order: int,
    ) -> None:
        """Append the accumulated lines as a section; empty content is dropped."""
        content = "\n".join(current_lines).strip()
        if not content:
            return
        titles = [title for _, title in stack] or [current_title]
        sections.append(
            SectionChunk(
                section_path=" > ".join(titles),
                section_title=titles[-1],
                content=content,
                order=order,
            )
        )

    def _heading(self, line: str) -> tuple[int, str] | None:
        """Return (level, title) for an ATX '#' heading line, else None."""
        stripped = line.strip()
        if not stripped.startswith("#"):
            return None
        level = len(stripped) - len(stripped.lstrip("#"))
        title = stripped[level:].strip()
        if not title:
            return None
        return level, title

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
class DocTextChunker:
    """Fixed-size character windows with overlap, for unstructured text."""

    def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None:
        self._max_chars = max_chars
        self._overlap_chars = overlap_chars

    def split(self, text: str) -> list[str]:
        """Split *text* into stripped windows; empty input yields no chunks."""
        stripped = text.strip()
        if not stripped:
            return []
        total = len(stripped)
        windows: list[str] = []
        cursor = 0
        while cursor < total:
            limit = min(total, cursor + self._max_chars)
            windows.append(stripped[cursor:limit].strip())
            if limit >= total:
                break
            # Step back so consecutive windows share a little context.
            cursor = max(0, limit - self._overlap_chars)
        return [window for window in windows if window]

View File

@@ -0,0 +1,18 @@
from __future__ import annotations
from pathlib import PurePosixPath
from app.modules.rag.contracts import DocKind
class DocsClassifier:
    """Heuristically assign a DocKind to a documentation file path."""

    def classify(self, path: str) -> str:
        """Classify by filename prefix and path substrings; order matters."""
        filename = PurePosixPath(path).name.upper()
        lowered_path = path.lower()
        if "runbook" in lowered_path or filename.startswith("RUNBOOK"):
            return DocKind.RUNBOOK
        if filename.startswith("README"):
            return DocKind.README
        # NOTE(review): substring match, so e.g. "inspect.md" also hits
        # "spec" — confirm that is acceptable.
        if "spec" in lowered_path or "architecture" in lowered_path:
            return DocKind.SPEC
        return DocKind.MISC

View File

@@ -0,0 +1,115 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import SectionChunk
class DocsDocumentBuilder:
    """Builds RagDocuments for the four docs layers: module catalog,
    section index, policy index and fact index."""

    def build_module_catalog(self, source: RagSource, frontmatter: dict, summary_text: str, doc_kind: str) -> RagDocument | None:
        """Build a catalog entry from YAML frontmatter.

        Returns None unless the frontmatter carries all three required
        fields: id, type and domain.
        """
        module_id = str(frontmatter.get("id") or "").strip()
        module_type = str(frontmatter.get("type") or "").strip()
        domain = str(frontmatter.get("domain") or "").strip()
        if not module_id or not module_type or not domain:
            return None
        links = frontmatter.get("links") or {}
        metadata = {
            "module_id": module_id,
            "type": module_type,
            "domain": domain,
            "status": frontmatter.get("status"),
            "version": frontmatter.get("version"),
            "tags": frontmatter.get("tags") or [],
            "owners": frontmatter.get("owners") or [],
            "links": links,
            "source_path": source.path,
            # Summary capped to 4000 chars to bound document size.
            "summary_text": summary_text[:4000],
            "doc_kind": doc_kind,
        }
        # Promote each known relation to a top-level metadata key (empty
        # list when the frontmatter does not declare it).
        metadata.update({name: links.get(name, []) for name in (
            "calls_api", "called_by", "uses_logic", "used_by", "reads_db", "writes_db",
            "integrates_with", "emits_events", "consumes_events",
        )})
        return RagDocument(
            layer=RagLayer.DOCS_MODULE_CATALOG,
            source=source,
            title=module_id,
            # Fall back to the module id so text is never empty.
            text=summary_text[:4000] or module_id,
            metadata=metadata,
        )

    def build_section(self, source: RagSource, chunk: SectionChunk, frontmatter: dict, doc_kind: str) -> RagDocument:
        """Build one section-index document from a markdown section chunk."""
        # Without a frontmatter id the file path serves as the module id.
        module_id = str(frontmatter.get("id") or source.path)
        metadata = {
            "module_id": module_id,
            "type": frontmatter.get("type"),
            "domain": frontmatter.get("domain"),
            "tags": frontmatter.get("tags") or [],
            "section_path": chunk.section_path,
            "section_title": chunk.section_title,
            "order": chunk.order,
            "doc_kind": doc_kind,
            "source_path": source.path,
            "artifact_type": "DOCS",
        }
        return RagDocument(
            layer=RagLayer.DOCS_SECTION_INDEX,
            source=source,
            title=f"{module_id}:{chunk.section_title}",
            text=chunk.content,
            metadata=metadata,
        )

    def build_policy(self, source: RagSource, frontmatter: dict, chunk: SectionChunk, doc_kind: str) -> RagDocument | None:
        """Build a policy-index document; None when frontmatter has no id."""
        policy_id = str(frontmatter.get("id") or "").strip()
        applies_to = frontmatter.get("applies_to") or frontmatter.get("type")
        if not policy_id:
            return None
        metadata = {
            "policy_id": policy_id,
            "applies_to": applies_to,
            # The section body is treated as the rule text, capped at 4000.
            "rules": chunk.content[:4000],
            "default_behaviors": frontmatter.get("default_behaviors") or [],
            "doc_kind": doc_kind,
            "section_path": chunk.section_path,
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_POLICY_INDEX,
            source=source,
            title=policy_id,
            text=chunk.content[:4000],
            metadata=metadata,
        )

    def build_fact(
        self,
        source: RagSource,
        *,
        subject_id: str,
        predicate: str,
        obj: str,
        object_ref: str | None,
        anchor: str,
        tags: list[str] | None = None,
    ) -> RagDocument:
        """Build a fact-index document for one (subject, predicate, object).

        The fact id is a sha256 over the triple plus path and anchor, so the
        same fact stated in two places yields two distinct documents.
        """
        fact_id = sha256(f"{subject_id}|{predicate}|{obj}|{source.path}|{anchor}".encode("utf-8")).hexdigest()
        metadata = {
            "fact_id": fact_id,
            "subject_id": subject_id,
            "predicate": predicate,
            "object": obj,
            "object_ref": object_ref,
            "anchor": anchor,
            "tags": tags or [],
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_FACT_INDEX,
            source=source,
            title=f"{subject_id}:{predicate}",
            text=f"{subject_id} {predicate} {obj}".strip(),
            metadata=metadata,
            links=[EvidenceLink(type=EvidenceType.DOC_FACT, target_id=fact_id, path=source.path, note=anchor)],
        )

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
from pathlib import PurePosixPath
class DocsFileFilter:
    """Select documentation-like files for indexing."""

    _EXTENSIONS = {".md", ".rst", ".txt", ".adoc"}
    _NAMES = ("README", "CHANGELOG", "CONTRIBUTING", "ARCHITECTURE", "SPEC", "RUNBOOK")
    _EXCLUDE_PARTS = {"node_modules", ".git", "vendor", "dist", "build", "target", "__pycache__"}

    def should_index(self, path: str) -> bool:
        """True for doc extensions, well-known doc filenames, or docs/ trees."""
        candidate = PurePosixPath(path)
        if not self._EXCLUDE_PARTS.isdisjoint(candidate.parts):
            return False
        if candidate.suffix.lower() in self._EXTENSIONS:
            return True
        # str.startswith accepts a tuple of prefixes.
        if candidate.name.upper().startswith(self._NAMES):
            return True
        # Anything under a docs-like directory counts as documentation.
        joined = "/".join(candidate.parts).lower()
        return any(marker in joined for marker in ("docs/", "doc/", "documentation/"))

View File

@@ -0,0 +1,135 @@
from __future__ import annotations
import re
import yaml
from app.modules.rag.contracts import RagDocument, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker
from app.modules.rag.indexing.docs.classifier import DocsClassifier
from app.modules.rag.indexing.docs.document_builder import DocsDocumentBuilder
from app.modules.rag.indexing.docs.file_filter import DocsFileFilter
class DocsIndexingPipeline:
    """Turn one documentation file into RAG documents.

    Produces (when applicable): a module-catalog document from YAML
    frontmatter, one document per markdown section, a policy document for
    "type: policy" files, and fact documents mined from frontmatter links,
    markdown tables and bullet lists.
    """

    def __init__(self) -> None:
        self._filter = DocsFileFilter()
        self._classifier = DocsClassifier()
        self._chunker = MarkdownDocChunker()
        self._builder = DocsDocumentBuilder()

    def supports(self, path: str) -> bool:
        """True when *path* looks like an indexable documentation file."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Build every document derivable from one doc file's content."""
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        frontmatter, body = self._split_frontmatter(content)
        doc_kind = self._classifier.classify(path)
        sections = self._chunker.chunk(body)
        summary_text = self._summary_from_sections(sections)
        docs: list[RagDocument] = []
        module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind)
        if module_doc is not None:
            docs.append(module_doc)
        for section in sections:
            docs.append(self._builder.build_section(source, section, frontmatter, doc_kind))
        if str(frontmatter.get("type") or "").strip() == "policy":
            # Only the first section is treated as the policy body.
            for section in sections[:1]:
                policy = self._builder.build_policy(source, frontmatter, section, doc_kind)
                if policy is not None:
                    docs.append(policy)
        docs.extend(self._extract_facts(source, frontmatter, sections))
        return docs

    def _split_frontmatter(self, content: str) -> tuple[dict, str]:
        """Split leading YAML frontmatter from the body.

        Returns ({}, content) when there is no, unterminated, or malformed
        frontmatter.  The previous version raised ValueError on a file that
        opened a "---" fence without closing it, and let yaml parse errors
        propagate out of the pipeline.
        """
        if not content.startswith("---\n"):
            return {}, content
        parts = content.split("---", 2)
        if len(parts) < 3:
            # Opening fence without a closing fence: treat as plain body.
            return {}, content
        _, raw, body = parts
        try:
            payload = yaml.safe_load(raw) or {}
        except yaml.YAMLError:
            payload = {}
        return payload if isinstance(payload, dict) else {}, body.strip()

    def _summary_from_sections(self, sections) -> str:
        """First two sections, capped to 4000 chars, as a catalog summary."""
        text = "\n\n".join(section.content for section in sections[:2]).strip()
        return text[:4000]

    def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]:
        """Mine structured facts from frontmatter links, tables and lists."""
        subject_id = str(frontmatter.get("id") or source.path)
        docs: list[RagDocument] = []
        links = frontmatter.get("links") or {}
        for predicate, values in links.items():
            for value in values or []:
                docs.append(
                    self._builder.build_fact(
                        source,
                        subject_id=subject_id,
                        predicate=predicate,
                        obj=str(value),
                        object_ref=str(value),
                        anchor="frontmatter.links",
                    )
                )
        for section in sections:
            docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content))
            docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content))
        return docs

    def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Parse a markdown table whose first non-empty line is the header."""
        lines = [line.strip() for line in content.splitlines() if line.strip()]
        # Need at least header, separator and one data row.
        if len(lines) < 3 or "|" not in lines[0]:
            return []
        headers = [part.strip().lower() for part in lines[0].strip("|").split("|")]
        if not all(headers):
            return []
        docs: list[RagDocument] = []
        for row in lines[2:]:
            if "|" not in row:
                continue
            values = [part.strip() for part in row.strip("|").split("|")]
            if len(values) != len(headers):
                # Skip ragged rows rather than guessing the alignment.
                continue
            payload = dict(zip(headers, values))
            docs.extend(self._facts_from_row(source, subject_id, title, payload))
        return docs

    def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]:
        """Translate one table row into facts, keyed on the section title."""
        docs: list[RagDocument] = []
        name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code")
        if "request" in title.lower() or "response" in title.lower():
            if name:
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_field", obj=name, object_ref=None, anchor=title))
            if payload.get("required"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_required", obj=f"{name}:{payload['required']}", object_ref=None, anchor=title))
            if payload.get("type"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_type", obj=f"{name}:{payload['type']}", object_ref=None, anchor=title))
            if payload.get("validation"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_validation", obj=f"{name}:{payload['validation']}", object_ref=None, anchor=title))
        if "error" in title.lower():
            if payload.get("status"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_status", obj=payload["status"], object_ref=None, anchor=title))
            if payload.get("error") or payload.get("code"):
                error_value = payload.get("error") or payload.get("code")
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_error", obj=error_value, object_ref=None, anchor=title))
            if payload.get("client action"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="client_action", obj=payload["client action"], object_ref=None, anchor=title))
        if "constraint" in title.lower() and name:
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_constraint", obj=name, object_ref=None, anchor=title))
        return docs

    def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Turn bullet / numbered list items into facts by prefix keyword."""
        docs: list[RagDocument] = []
        for line in content.splitlines():
            item = line.strip()
            if not item.startswith(("-", "*", "1.", "2.", "3.")):
                continue
            # Drop the list marker and any leading numbering.
            normalized = re.sub(r"^[-*0-9. ]+", "", item).strip()
            lowered = normalized.lower()
            if lowered.startswith("metric:"):
                predicate = "emits_metric"
            elif lowered.startswith("event:"):
                predicate = "emits_analytics_event"
            elif lowered.startswith("log:"):
                predicate = "logs_event"
            else:
                predicate = "validates_rule" if "rule" in title.lower() else "client_action"
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title))
        return docs