Новый раг

This commit is contained in:
2026-03-01 14:21:33 +03:00
parent 2728c07ba9
commit 1ef0b4d68c
95 changed files with 3145 additions and 927 deletions

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
import ast
from dataclasses import dataclass
@dataclass(slots=True)
class CodeChunk:
title: str
text: str
start_line: int
end_line: int
chunk_type: str
class CodeTextChunker:
def chunk(self, path: str, text: str) -> list[CodeChunk]:
try:
tree = ast.parse(text)
except SyntaxError:
return self._window_chunks(path, text)
chunks: list[CodeChunk] = []
lines = text.splitlines()
for node in tree.body:
if not isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
continue
start = int(getattr(node, "lineno", 1))
end = int(getattr(node, "end_lineno", start))
body = "\n".join(lines[start - 1 : end]).strip()
if not body:
continue
chunks.append(
CodeChunk(
title=f"{path}:{getattr(node, 'name', 'block')}",
text=body,
start_line=start,
end_line=end,
chunk_type="symbol_block",
)
)
return chunks or self._window_chunks(path, text)
def _window_chunks(self, path: str, text: str) -> list[CodeChunk]:
lines = text.splitlines()
chunks: list[CodeChunk] = []
size = 80
overlap = 15
start = 0
while start < len(lines):
end = min(len(lines), start + size)
body = "\n".join(lines[start:end]).strip()
if body:
chunks.append(CodeChunk(f"{path}:{start + 1}-{end}", body, start + 1, end, "window"))
if end >= len(lines):
break
start = max(0, end - overlap)
return chunks

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.code_text.chunker import CodeChunk
class CodeTextDocumentBuilder:
    """Wraps a ``CodeChunk`` in a ``RagDocument`` on the CODE_SOURCE_CHUNKS layer."""

    def build(self, source: RagSource, chunk: CodeChunk, *, chunk_index: int) -> RagDocument:
        """Create the document for one chunk.

        Args:
            source: Origin of the file; its ``path`` is also used to derive
                the ``module_or_unit`` metadata value.
            chunk: Chunk whose title, text, line span and type are copied.
            chunk_index: Position of the chunk, stored in the metadata.
        """
        # Slashes become dots and a trailing ".py" is dropped.
        dotted_path = source.path.replace("/", ".").removesuffix(".py")
        details = {
            "chunk_index": chunk_index,
            "chunk_type": chunk.chunk_type,
            "module_or_unit": dotted_path,
            "artifact_type": "CODE",
        }
        line_span = RagSpan(chunk.start_line, chunk.end_line)
        return RagDocument(
            layer=RagLayer.CODE_SOURCE_CHUNKS,
            lang="python",
            source=source,
            title=chunk.title,
            text=chunk.text,
            span=line_span,
            metadata=details,
        )

View File

@@ -0,0 +1,29 @@
from __future__ import annotations
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.edges.extractor import PyEdge
class EdgeDocumentBuilder:
    """Wraps a ``PyEdge`` in a ``RagDocument`` on the CODE_DEPENDENCY_GRAPH layer."""

    def build(self, source: RagSource, edge: PyEdge) -> RagDocument:
        """Create the document for one edge.

        The text reads "<src_qname> <edge_type> <target>", where the target is
        the first non-empty value among ``dst_ref`` and ``dst_symbol_id``, or
        the literal "unknown" when both are empty.
        """
        target = next((value for value in (edge.dst_ref, edge.dst_symbol_id) if value), "unknown")
        details = {
            "edge_id": edge.edge_id,
            "edge_type": edge.edge_type,
            "src_symbol_id": edge.src_symbol_id,
            "src_qname": edge.src_qname,
            "dst_symbol_id": edge.dst_symbol_id,
            "dst_ref": edge.dst_ref,
            "resolution": edge.resolution,
            "lang_payload": edge.metadata,
            "artifact_type": "CODE",
        }
        # Single evidence link pointing back at the edge's own line range.
        evidence = EvidenceLink(
            type=EvidenceType.EDGE,
            target_id=edge.edge_id,
            path=source.path,
            start_line=edge.start_line,
            end_line=edge.end_line,
        )
        return RagDocument(
            layer=RagLayer.CODE_DEPENDENCY_GRAPH,
            lang="python",
            source=source,
            title=f"{edge.src_qname}:{edge.edge_type}",
            text=f"{edge.src_qname} {edge.edge_type} {target}",
            span=RagSpan(edge.start_line, edge.end_line),
            metadata=details,
            links=[evidence],
        )

View File

@@ -0,0 +1,114 @@
from __future__ import annotations
import ast
from dataclasses import dataclass, field
from hashlib import sha256
@dataclass(slots=True)
class PyEdge:
edge_id: str
edge_type: str
src_symbol_id: str
src_qname: str
dst_symbol_id: str | None
dst_ref: str | None
path: str
start_line: int
end_line: int
resolution: str = "partial"
metadata: dict = field(default_factory=dict)
class EdgeExtractor:
def extract(self, path: str, ast_tree: ast.AST | None, symbols: list) -> list[PyEdge]:
if ast_tree is None:
return []
qname_map = {symbol.qname: symbol.symbol_id for symbol in symbols}
visitor = _EdgeVisitor(path, qname_map)
visitor.visit(ast_tree)
return visitor.edges
class _EdgeVisitor(ast.NodeVisitor):
def __init__(self, path: str, qname_map: dict[str, str]) -> None:
self._path = path
self._qname_map = qname_map
self._scope: list[str] = []
self.edges: list[PyEdge] = []
def visit_ClassDef(self, node: ast.ClassDef) -> None:
current = self._enter(node.name)
for base in node.bases:
self._add_edge("inherits", current, self._name(base), base)
self.generic_visit(node)
self._scope.pop()
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
self._visit_function(node)
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
self._visit_function(node)
def visit_Import(self, node: ast.Import) -> None:
current = self._current_qname()
if not current:
return
for item in node.names:
self._add_edge("imports", current, item.name, node)
def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
current = self._current_qname()
if not current:
return
module = node.module or ""
for item in node.names:
self._add_edge("imports", current, f"{module}.{item.name}".strip("."), node)
def _visit_function(self, node) -> None:
current = self._enter(node.name)
for inner in ast.walk(node):
if isinstance(inner, ast.Call):
self._add_edge("calls", current, self._name(inner.func), inner, {"callsite_kind": "function_call"})
self.generic_visit(node)
self._scope.pop()
def _enter(self, name: str) -> str:
self._scope.append(name)
return self._current_qname() or name
def _current_qname(self) -> str | None:
if not self._scope:
return None
return ".".join(self._scope)
def _add_edge(self, edge_type: str, src_qname: str, dst_ref: str, node, extra: dict | None = None) -> None:
if not dst_ref:
return
src_symbol_id = self._qname_map.get(src_qname, sha256(src_qname.encode("utf-8")).hexdigest())
dst_symbol_id = self._qname_map.get(dst_ref)
edge_id = sha256(f"{self._path}|{src_qname}|{edge_type}|{dst_ref}|{getattr(node, 'lineno', 1)}".encode("utf-8")).hexdigest()
self.edges.append(
PyEdge(
edge_id=edge_id,
edge_type=edge_type,
src_symbol_id=src_symbol_id,
src_qname=src_qname,
dst_symbol_id=dst_symbol_id,
dst_ref=dst_ref,
path=self._path,
start_line=int(getattr(node, "lineno", 1)),
end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
resolution="resolved" if dst_symbol_id else "partial",
metadata=extra or {},
)
)
def _name(self, node) -> str:
if isinstance(node, ast.Name):
return node.id
if isinstance(node, ast.Attribute):
return f"{self._name(node.value)}.{node.attr}"
if isinstance(node, ast.Call):
return self._name(node.func)
return ""

View File

@@ -0,0 +1,26 @@
from __future__ import annotations
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class EntrypointDocumentBuilder:
    """Wraps an ``Entrypoint`` in a ``RagDocument`` on the CODE_ENTRYPOINTS layer."""

    def build(self, source: RagSource, entrypoint: Entrypoint) -> RagDocument:
        """Create the document for one entrypoint.

        The title is the route/command string; the text is
        "<framework> <entry_type> <route_or_command>".
        """
        first_line = entrypoint.start_line
        last_line = entrypoint.end_line
        details = {
            "entry_id": entrypoint.entry_id,
            "entry_type": entrypoint.entry_type,
            "framework": entrypoint.framework,
            "route_or_command": entrypoint.route_or_command,
            "handler_symbol_id": entrypoint.handler_symbol_id,
            "lang_payload": entrypoint.metadata,
            "artifact_type": "CODE",
        }
        # Single evidence link covering the same lines as the document span.
        evidence = EvidenceLink(
            type=EvidenceType.CODE_SPAN,
            target_id=entrypoint.entry_id,
            path=source.path,
            start_line=first_line,
            end_line=last_line,
        )
        return RagDocument(
            layer=RagLayer.CODE_ENTRYPOINTS,
            lang="python",
            source=source,
            title=entrypoint.route_or_command,
            text=f"{entrypoint.framework} {entrypoint.entry_type} {entrypoint.route_or_command}",
            span=RagSpan(first_line, last_line),
            metadata=details,
            links=[evidence],
        )

View File

@@ -0,0 +1,34 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class FastApiEntrypointDetector:
    """Detects HTTP entrypoints from decorator strings.

    A decorator matches when its callee name ends in one of ``_METHODS``
    (e.g. ``router.get``) or contains ``.route``. Matching is case-insensitive.
    """

    _METHODS = {"get", "post", "put", "patch", "delete"}

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one ``Entrypoint`` per matching decorator.

        Args:
            path: File path stored on each entrypoint and mixed into its id.
            symbols: Objects exposing ``decorators`` (list of strings or
                ``None``), ``symbol_id``, ``start_line`` and ``end_line``.
        """
        items: list[Entrypoint] = []
        for symbol in symbols:
            for decorator in symbol.decorators or []:
                callee, arguments = self._split_call(decorator)
                lowered = callee.lower()
                # Inspect only the callee: dots or parentheses inside the
                # arguments (e.g. "/v1.0/items") must not affect matching.
                tail = lowered.rsplit(".", 1)[-1]
                is_method = tail in self._METHODS
                if not is_method and ".route" not in lowered:
                    continue
                route = decorator if arguments is None else arguments
                items.append(
                    Entrypoint(
                        entry_id=sha256(f"{path}|fastapi|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(),
                        entry_type="http",
                        framework="fastapi",
                        route_or_command=route,
                        handler_symbol_id=symbol.symbol_id,
                        path=path,
                        start_line=symbol.start_line,
                        end_line=symbol.end_line,
                        metadata={"methods": [tail.upper()] if is_method else []},
                    )
                )
        return items

    @staticmethod
    def _split_call(decorator: str) -> tuple[str, str | None]:
        """Split ``name(args)`` into ``(name, args)``; ``args`` is ``None`` without parentheses."""
        callee, paren, rest = decorator.partition("(")
        if not paren:
            return decorator, None
        # Drop only the parenthesis that closes the call itself.
        if rest.endswith(")"):
            rest = rest[:-1]
        return callee, rest

View File

@@ -0,0 +1,28 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class FlaskEntrypointDetector:
    """Detects HTTP entrypoints from decorators whose text contains ``.route``."""

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one ``Entrypoint`` per decorator containing ``.route``.

        The check is case-insensitive, and the full decorator string is stored
        as ``route_or_command``. Symbols with no decorators are skipped.
        """
        # Pair every symbol with each of its route-like decorators, in order.
        matches = (
            (owner, text)
            for owner in symbols
            for text in (owner.decorators or [])
            if ".route" in text.lower()
        )
        return [
            Entrypoint(
                entry_id=sha256(f"{path}|flask|{owner.symbol_id}|{text}".encode("utf-8")).hexdigest(),
                entry_type="http",
                framework="flask",
                route_or_command=text,
                handler_symbol_id=owner.symbol_id,
                path=path,
                start_line=owner.start_line,
                end_line=owner.end_line,
            )
            for owner, text in matches
        ]

View File

@@ -0,0 +1,27 @@
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass(slots=True)
class Entrypoint:
entry_id: str
entry_type: str
framework: str
route_or_command: str
handler_symbol_id: str
path: str
start_line: int
end_line: int
metadata: dict = field(default_factory=dict)
class EntrypointDetectorRegistry:
    """Runs a fixed list of entrypoint detectors and merges their results."""

    def __init__(self, detectors: list) -> None:
        """Store the detectors; each must offer ``detect(path=..., symbols=...)``."""
        self._detectors = detectors

    def detect_all(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return every detector's results, concatenated in detector order."""
        return [
            found
            for detector in self._detectors
            for found in detector.detect(path=path, symbols=symbols)
        ]

View File

@@ -0,0 +1,29 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
class TyperClickEntrypointDetector:
    """Detects CLI entrypoints from ``.command`` / ``.callback`` decorators."""

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one ``Entrypoint`` per matching decorator.

        Matching is case-insensitive. The framework is "typer" when the
        decorator text contains "typer" and "click" otherwise.
        """
        markers = (".command", ".callback")
        found: list[Entrypoint] = []
        for owner in symbols:
            for text in owner.decorators or []:
                folded = text.lower()
                if not any(marker in folded for marker in markers):
                    continue
                if "typer" in folded:
                    framework = "typer"
                else:
                    framework = "click"
                digest = sha256(f"{path}|{framework}|{owner.symbol_id}|{text}".encode("utf-8")).hexdigest()
                found.append(
                    Entrypoint(
                        entry_id=digest,
                        entry_type="cli",
                        framework=framework,
                        route_or_command=text,
                        handler_symbol_id=owner.symbol_id,
                        path=path,
                        start_line=owner.start_line,
                        end_line=owner.end_line,
                    )
                )
        return found

View File

@@ -0,0 +1,13 @@
from __future__ import annotations
from pathlib import PurePosixPath
class PythonFileFilter:
    """Decides whether a repository path should be indexed as Python code."""

    # Directory (or file) names that exclude a path wherever they appear in it.
    _EXCLUDE_PARTS = {"venv", ".venv", "__pycache__", "node_modules", ".git", "dist", "build"}

    def should_index(self, path: str) -> bool:
        """Return True for ``.py`` paths (any case) with no excluded component.

        The path is interpreted with POSIX separators.
        """
        candidate = PurePosixPath(path)
        has_py_suffix = candidate.suffix.lower() == ".py"
        return has_py_suffix and self._EXCLUDE_PARTS.isdisjoint(candidate.parts)

View File

@@ -0,0 +1,52 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagSource
from app.modules.rag.indexing.code.code_text.chunker import CodeTextChunker
from app.modules.rag.indexing.code.code_text.document_builder import CodeTextDocumentBuilder
from app.modules.rag.indexing.code.edges.document_builder import EdgeDocumentBuilder
from app.modules.rag.indexing.code.edges.extractor import EdgeExtractor
from app.modules.rag.indexing.code.entrypoints.document_builder import EntrypointDocumentBuilder
from app.modules.rag.indexing.code.entrypoints.fastapi_detector import FastApiEntrypointDetector
from app.modules.rag.indexing.code.entrypoints.flask_detector import FlaskEntrypointDetector
from app.modules.rag.indexing.code.entrypoints.registry import EntrypointDetectorRegistry
from app.modules.rag.indexing.code.entrypoints.typer_click_detector import TyperClickEntrypointDetector
from app.modules.rag.indexing.code.file_filter import PythonFileFilter
from app.modules.rag.indexing.code.symbols.ast_parser import PythonAstParser
from app.modules.rag.indexing.code.symbols.document_builder import SymbolDocumentBuilder
from app.modules.rag.indexing.code.symbols.extractor import SymbolExtractor
class CodeIndexingPipeline:
    """Builds every code-related ``RagDocument`` for one Python file.

    Documents are emitted in a fixed order: text chunks, symbols, edges,
    entrypoints.
    """

    def __init__(self) -> None:
        """Instantiate the filter, extractors, detectors and document builders."""
        self._filter = PythonFileFilter()
        # Raw source text chunks.
        self._chunker = CodeTextChunker()
        self._code_builder = CodeTextDocumentBuilder()
        # Symbols extracted from the parsed module.
        self._parser = PythonAstParser()
        self._symbols = SymbolExtractor()
        self._symbol_builder = SymbolDocumentBuilder()
        # Edges between symbols.
        self._edges = EdgeExtractor()
        self._edge_builder = EdgeDocumentBuilder()
        # Entrypoints, detected from symbol decorators.
        detectors = [FastApiEntrypointDetector(), FlaskEntrypointDetector(), TyperClickEntrypointDetector()]
        self._entrypoints = EntrypointDetectorRegistry(detectors)
        self._entrypoint_builder = EntrypointDocumentBuilder()

    def supports(self, path: str) -> bool:
        """Return whether the file filter accepts ``path``."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Return all documents derived from one file.

        Args:
            repo_id: Repository identifier stored on the document source.
            commit_sha: Commit identifier stored on the document source.
            path: File path inside the repository.
            content: Full text of the file.
        """
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        docs: list[RagDocument] = [
            self._code_builder.build(source, piece, chunk_index=position)
            for position, piece in enumerate(self._chunker.chunk(path, content))
        ]
        # The tree may be None; the extractors then return empty lists.
        tree = self._parser.parse_module(content)
        symbols = self._symbols.extract(path, content, tree)
        docs.extend(self._symbol_builder.build(source, symbol) for symbol in symbols)
        relations = self._edges.extract(path, tree, symbols)
        docs.extend(self._edge_builder.build(source, relation) for relation in relations)
        detected = self._entrypoints.detect_all(path=path, symbols=symbols)
        docs.extend(self._entrypoint_builder.build(source, entry) for entry in detected)
        return docs

View File

@@ -0,0 +1,11 @@
from __future__ import annotations
import ast
class PythonAstParser:
    """Parses Python source text, reporting failure as ``None``."""

    def parse_module(self, text: str) -> ast.AST | None:
        """Return the parsed module, or ``None`` when ``text`` cannot be parsed.

        Besides ``SyntaxError``, ``ast.parse`` raises ``ValueError`` for source
        containing null bytes on Python versions before 3.12 and may raise
        ``RecursionError`` on extremely deep nesting. All three mean "this file
        has no usable tree" and must not abort the caller.
        """
        try:
            return ast.parse(text)
        except (SyntaxError, ValueError, RecursionError):
            return None

View File

@@ -0,0 +1,32 @@
from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.symbols.extractor import PySymbol
class SymbolDocumentBuilder:
    """Wraps a ``PySymbol`` in a ``RagDocument`` on the CODE_SYMBOL_CATALOG layer."""

    def build(self, source: RagSource, symbol: PySymbol) -> RagDocument:
        """Create the document for one symbol.

        The text has up to three lines: "<kind> <qname>", the signature and
        the stripped docstring. Empty parts are left out.
        """
        parts = [f"{symbol.kind} {symbol.qname}", symbol.signature]
        if symbol.docstring:
            parts.append(symbol.docstring.strip())
        details = {
            "symbol_id": symbol.symbol_id,
            "qname": symbol.qname,
            "kind": symbol.kind,
            "signature": symbol.signature,
            "decorators_or_annotations": symbol.decorators,
            "docstring_or_javadoc": symbol.docstring,
            "parent_symbol_id": symbol.parent_symbol_id,
            # Slashes become dots and a trailing ".py" is dropped.
            "package_or_module": source.path.replace("/", ".").removesuffix(".py"),
            # Any decorator at all marks the symbol as an entry candidate.
            "is_entry_candidate": bool(symbol.decorators),
            "lang_payload": symbol.lang_payload,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_SYMBOL_CATALOG,
            lang="python",
            source=source,
            title=symbol.qname,
            text="\n".join(filter(None, parts)),
            span=RagSpan(symbol.start_line, symbol.end_line),
            metadata=details,
        )

View File

@@ -0,0 +1,130 @@
from __future__ import annotations
import ast
from dataclasses import dataclass, field
from hashlib import sha256
@dataclass(slots=True)
class PySymbol:
symbol_id: str
qname: str
kind: str
path: str
start_line: int
end_line: int
signature: str
decorators: list[str] = field(default_factory=list)
docstring: str | None = None
parent_symbol_id: str | None = None
lang_payload: dict = field(default_factory=dict)
class SymbolExtractor:
def extract(self, path: str, text: str, ast_tree: ast.AST | None) -> list[PySymbol]:
if ast_tree is None:
return []
collector = _SymbolVisitor(path)
collector.visit(ast_tree)
return collector.symbols
class _SymbolVisitor(ast.NodeVisitor):
def __init__(self, path: str) -> None:
self._path = path
self._stack: list[tuple[str, str]] = []
self.symbols: list[PySymbol] = []
def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
if self._stack:
return
module = node.module or ""
for item in node.names:
local_name = item.asname or item.name
imported_name = f"{module}.{item.name}".strip(".")
self.symbols.append(
PySymbol(
symbol_id=sha256(f"{self._path}|{local_name}|import_alias".encode("utf-8")).hexdigest(),
qname=local_name,
kind="const",
path=self._path,
start_line=int(getattr(node, "lineno", 1)),
end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
signature=f"{local_name} = {imported_name}",
lang_payload={"imported_from": imported_name, "import_alias": True},
)
)
self.generic_visit(node)
def visit_Import(self, node: ast.Import) -> None:
if self._stack:
return
for item in node.names:
local_name = item.asname or item.name
self.symbols.append(
PySymbol(
symbol_id=sha256(f"{self._path}|{local_name}|import".encode("utf-8")).hexdigest(),
qname=local_name,
kind="const",
path=self._path,
start_line=int(getattr(node, "lineno", 1)),
end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
signature=f"import {item.name}",
lang_payload={"imported_from": item.name, "import_alias": bool(item.asname)},
)
)
self.generic_visit(node)
def visit_ClassDef(self, node: ast.ClassDef) -> None:
self._add_symbol(node, "class", {"bases": [self._expr_name(base) for base in node.bases]})
self.generic_visit(node)
self._stack.pop()
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
self._add_function(node, is_async=False)
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
self._add_function(node, is_async=True)
def _add_function(self, node, *, is_async: bool) -> None:
kind = "method" if self._stack and self._stack[-1][0] == "class" else "function"
self._add_symbol(node, kind, {"async": is_async})
self.generic_visit(node)
self._stack.pop()
def _add_symbol(self, node, kind: str, lang_payload: dict) -> None:
names = [name for _, name in self._stack] + [node.name]
qname = ".".join(names)
symbol_id = sha256(f"{self._path}|{qname}|{kind}".encode("utf-8")).hexdigest()
signature = self._signature(node)
symbol = PySymbol(
symbol_id=symbol_id,
qname=qname,
kind=kind,
path=self._path,
start_line=int(getattr(node, "lineno", 1)),
end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
signature=signature,
decorators=[self._expr_name(item) for item in getattr(node, "decorator_list", [])],
docstring=ast.get_docstring(node),
parent_symbol_id=self._stack[-1][1] if self._stack else None,
lang_payload=lang_payload,
)
self.symbols.append(symbol)
self._stack.append((kind, qname))
def _signature(self, node) -> str:
if isinstance(node, ast.ClassDef):
bases = ", ".join(self._expr_name(base) for base in node.bases)
return f"{node.name}({bases})" if bases else node.name
args = [arg.arg for arg in getattr(node.args, "args", [])]
return f"{node.name}({', '.join(args)})"
def _expr_name(self, node) -> str:
if isinstance(node, ast.Name):
return node.id
if isinstance(node, ast.Attribute):
return f"{self._expr_name(node.value)}.{node.attr}"
if isinstance(node, ast.Call):
return self._expr_name(node.func)
return ast.dump(node, include_attributes=False)