Новый раг
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
57
app/modules/rag/indexing/code/code_text/chunker.py
Normal file
57
app/modules/rag/indexing/code/code_text/chunker.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class CodeChunk:
|
||||
title: str
|
||||
text: str
|
||||
start_line: int
|
||||
end_line: int
|
||||
chunk_type: str
|
||||
|
||||
|
||||
class CodeTextChunker:
|
||||
def chunk(self, path: str, text: str) -> list[CodeChunk]:
|
||||
try:
|
||||
tree = ast.parse(text)
|
||||
except SyntaxError:
|
||||
return self._window_chunks(path, text)
|
||||
chunks: list[CodeChunk] = []
|
||||
lines = text.splitlines()
|
||||
for node in tree.body:
|
||||
if not isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
|
||||
continue
|
||||
start = int(getattr(node, "lineno", 1))
|
||||
end = int(getattr(node, "end_lineno", start))
|
||||
body = "\n".join(lines[start - 1 : end]).strip()
|
||||
if not body:
|
||||
continue
|
||||
chunks.append(
|
||||
CodeChunk(
|
||||
title=f"{path}:{getattr(node, 'name', 'block')}",
|
||||
text=body,
|
||||
start_line=start,
|
||||
end_line=end,
|
||||
chunk_type="symbol_block",
|
||||
)
|
||||
)
|
||||
return chunks or self._window_chunks(path, text)
|
||||
|
||||
def _window_chunks(self, path: str, text: str) -> list[CodeChunk]:
|
||||
lines = text.splitlines()
|
||||
chunks: list[CodeChunk] = []
|
||||
size = 80
|
||||
overlap = 15
|
||||
start = 0
|
||||
while start < len(lines):
|
||||
end = min(len(lines), start + size)
|
||||
body = "\n".join(lines[start:end]).strip()
|
||||
if body:
|
||||
chunks.append(CodeChunk(f"{path}:{start + 1}-{end}", body, start + 1, end, "window"))
|
||||
if end >= len(lines):
|
||||
break
|
||||
start = max(0, end - overlap)
|
||||
return chunks
|
||||
22
app/modules/rag/indexing/code/code_text/document_builder.py
Normal file
22
app/modules/rag/indexing/code/code_text/document_builder.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
|
||||
from app.modules.rag.indexing.code.code_text.chunker import CodeChunk
|
||||
|
||||
|
||||
class CodeTextDocumentBuilder:
    """Build ``RagDocument`` objects for the ``CODE_SOURCE_CHUNKS`` layer."""

    def build(self, source: RagSource, chunk: CodeChunk, *, chunk_index: int) -> RagDocument:
        """Wrap one code chunk of ``source`` in a ``RagDocument``.

        Args:
            source: Origin of the chunk; its ``path`` feeds ``module_or_unit``.
            chunk: The chunk whose title, text and line span are copied over.
            chunk_index: Position of the chunk, stored in the metadata.
        """
        # Dotted name derived from the path: "/" becomes "." and a trailing
        # ".py" is dropped.
        module_name = source.path.replace("/", ".").removesuffix(".py")
        details = {
            "chunk_index": chunk_index,
            "chunk_type": chunk.chunk_type,
            "module_or_unit": module_name,
            "artifact_type": "CODE",
        }
        line_span = RagSpan(chunk.start_line, chunk.end_line)
        return RagDocument(
            layer=RagLayer.CODE_SOURCE_CHUNKS,
            lang="python",
            source=source,
            title=chunk.title,
            text=chunk.text,
            span=line_span,
            metadata=details,
        )
|
||||
Binary file not shown.
Binary file not shown.
29
app/modules/rag/indexing/code/edges/document_builder.py
Normal file
29
app/modules/rag/indexing/code/edges/document_builder.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
|
||||
from app.modules.rag.indexing.code.edges.extractor import PyEdge
|
||||
|
||||
|
||||
class EdgeDocumentBuilder:
    """Build ``RagDocument`` objects for the ``CODE_DEPENDENCY_GRAPH`` layer."""

    def build(self, source: RagSource, edge: PyEdge) -> RagDocument:
        """Describe ``edge`` as a document with one evidence link to its span."""
        # Prefer the textual reference, then the resolved id, then a placeholder.
        target = edge.dst_ref or edge.dst_symbol_id or "unknown"
        first_line = edge.start_line
        last_line = edge.end_line
        evidence = EvidenceLink(
            type=EvidenceType.EDGE,
            target_id=edge.edge_id,
            path=source.path,
            start_line=first_line,
            end_line=last_line,
        )
        details = {
            "edge_id": edge.edge_id,
            "edge_type": edge.edge_type,
            "src_symbol_id": edge.src_symbol_id,
            "src_qname": edge.src_qname,
            "dst_symbol_id": edge.dst_symbol_id,
            "dst_ref": edge.dst_ref,
            "resolution": edge.resolution,
            "lang_payload": edge.metadata,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_DEPENDENCY_GRAPH,
            lang="python",
            source=source,
            title=f"{edge.src_qname}:{edge.edge_type}",
            text=f"{edge.src_qname} {edge.edge_type} {target}",
            span=RagSpan(first_line, last_line),
            metadata=details,
            links=[evidence],
        )
|
||||
114
app/modules/rag/indexing/code/edges/extractor.py
Normal file
114
app/modules/rag/indexing/code/edges/extractor.py
Normal file
@@ -0,0 +1,114 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
from dataclasses import dataclass, field
|
||||
from hashlib import sha256
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class PyEdge:
|
||||
edge_id: str
|
||||
edge_type: str
|
||||
src_symbol_id: str
|
||||
src_qname: str
|
||||
dst_symbol_id: str | None
|
||||
dst_ref: str | None
|
||||
path: str
|
||||
start_line: int
|
||||
end_line: int
|
||||
resolution: str = "partial"
|
||||
metadata: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
class EdgeExtractor:
|
||||
def extract(self, path: str, ast_tree: ast.AST | None, symbols: list) -> list[PyEdge]:
|
||||
if ast_tree is None:
|
||||
return []
|
||||
qname_map = {symbol.qname: symbol.symbol_id for symbol in symbols}
|
||||
visitor = _EdgeVisitor(path, qname_map)
|
||||
visitor.visit(ast_tree)
|
||||
return visitor.edges
|
||||
|
||||
|
||||
class _EdgeVisitor(ast.NodeVisitor):
|
||||
def __init__(self, path: str, qname_map: dict[str, str]) -> None:
|
||||
self._path = path
|
||||
self._qname_map = qname_map
|
||||
self._scope: list[str] = []
|
||||
self.edges: list[PyEdge] = []
|
||||
|
||||
def visit_ClassDef(self, node: ast.ClassDef) -> None:
|
||||
current = self._enter(node.name)
|
||||
for base in node.bases:
|
||||
self._add_edge("inherits", current, self._name(base), base)
|
||||
self.generic_visit(node)
|
||||
self._scope.pop()
|
||||
|
||||
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
|
||||
self._visit_function(node)
|
||||
|
||||
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
|
||||
self._visit_function(node)
|
||||
|
||||
def visit_Import(self, node: ast.Import) -> None:
|
||||
current = self._current_qname()
|
||||
if not current:
|
||||
return
|
||||
for item in node.names:
|
||||
self._add_edge("imports", current, item.name, node)
|
||||
|
||||
def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
|
||||
current = self._current_qname()
|
||||
if not current:
|
||||
return
|
||||
module = node.module or ""
|
||||
for item in node.names:
|
||||
self._add_edge("imports", current, f"{module}.{item.name}".strip("."), node)
|
||||
|
||||
def _visit_function(self, node) -> None:
|
||||
current = self._enter(node.name)
|
||||
for inner in ast.walk(node):
|
||||
if isinstance(inner, ast.Call):
|
||||
self._add_edge("calls", current, self._name(inner.func), inner, {"callsite_kind": "function_call"})
|
||||
self.generic_visit(node)
|
||||
self._scope.pop()
|
||||
|
||||
def _enter(self, name: str) -> str:
|
||||
self._scope.append(name)
|
||||
return self._current_qname() or name
|
||||
|
||||
def _current_qname(self) -> str | None:
|
||||
if not self._scope:
|
||||
return None
|
||||
return ".".join(self._scope)
|
||||
|
||||
def _add_edge(self, edge_type: str, src_qname: str, dst_ref: str, node, extra: dict | None = None) -> None:
|
||||
if not dst_ref:
|
||||
return
|
||||
src_symbol_id = self._qname_map.get(src_qname, sha256(src_qname.encode("utf-8")).hexdigest())
|
||||
dst_symbol_id = self._qname_map.get(dst_ref)
|
||||
edge_id = sha256(f"{self._path}|{src_qname}|{edge_type}|{dst_ref}|{getattr(node, 'lineno', 1)}".encode("utf-8")).hexdigest()
|
||||
self.edges.append(
|
||||
PyEdge(
|
||||
edge_id=edge_id,
|
||||
edge_type=edge_type,
|
||||
src_symbol_id=src_symbol_id,
|
||||
src_qname=src_qname,
|
||||
dst_symbol_id=dst_symbol_id,
|
||||
dst_ref=dst_ref,
|
||||
path=self._path,
|
||||
start_line=int(getattr(node, "lineno", 1)),
|
||||
end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
|
||||
resolution="resolved" if dst_symbol_id else "partial",
|
||||
metadata=extra or {},
|
||||
)
|
||||
)
|
||||
|
||||
def _name(self, node) -> str:
|
||||
if isinstance(node, ast.Name):
|
||||
return node.id
|
||||
if isinstance(node, ast.Attribute):
|
||||
return f"{self._name(node.value)}.{node.attr}"
|
||||
if isinstance(node, ast.Call):
|
||||
return self._name(node.func)
|
||||
return ""
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,26 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
|
||||
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
|
||||
|
||||
|
||||
class EntrypointDocumentBuilder:
    """Build ``RagDocument`` objects for the ``CODE_ENTRYPOINTS`` layer."""

    def build(self, source: RagSource, entrypoint: Entrypoint) -> RagDocument:
        """Describe ``entrypoint`` as a document linked to its code span."""
        first_line = entrypoint.start_line
        last_line = entrypoint.end_line
        command = entrypoint.route_or_command
        evidence = EvidenceLink(
            type=EvidenceType.CODE_SPAN,
            target_id=entrypoint.entry_id,
            path=source.path,
            start_line=first_line,
            end_line=last_line,
        )
        details = {
            "entry_id": entrypoint.entry_id,
            "entry_type": entrypoint.entry_type,
            "framework": entrypoint.framework,
            "route_or_command": command,
            "handler_symbol_id": entrypoint.handler_symbol_id,
            "lang_payload": entrypoint.metadata,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_ENTRYPOINTS,
            lang="python",
            source=source,
            title=command,
            text=f"{entrypoint.framework} {entrypoint.entry_type} {command}",
            span=RagSpan(first_line, last_line),
            metadata=details,
            links=[evidence],
        )
|
||||
@@ -0,0 +1,34 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from hashlib import sha256
|
||||
|
||||
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
|
||||
|
||||
|
||||
class FastApiEntrypointDetector:
    """Detect HTTP entry points from decorator names on symbols.

    A decorator matches when the last dotted component of its callee is one
    of ``_METHODS`` or when the callee contains ``.route``.
    """

    _METHODS = {"get", "post", "put", "patch", "delete"}

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one ``Entrypoint`` per matching decorator.

        Args:
            path: File path stored on each entry point and mixed into its id.
            symbols: Objects exposing ``decorators``, ``symbol_id``,
                ``start_line`` and ``end_line``.

        Returns:
            Entry points in symbol/decorator order; empty when nothing matches.
        """
        items: list[Entrypoint] = []
        for symbol in symbols:
            for decorator in symbol.decorators or []:
                # Match on the callee only: an argument list such as
                # ``("/items")`` must neither hide the method name nor
                # contribute a spurious ".route" hit.
                callee, paren, arguments = decorator.partition("(")
                name = callee.strip().lower()
                tail = name.rsplit(".", 1)[-1]
                if tail not in self._METHODS and ".route" not in name:
                    continue
                if paren:
                    # Keep the full argument text, minus the closing paren.
                    route = arguments[:-1] if arguments.endswith(")") else arguments
                else:
                    route = decorator
                items.append(
                    Entrypoint(
                        entry_id=sha256(f"{path}|fastapi|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(),
                        entry_type="http",
                        framework="fastapi",
                        route_or_command=route,
                        handler_symbol_id=symbol.symbol_id,
                        path=path,
                        start_line=symbol.start_line,
                        end_line=symbol.end_line,
                        metadata={"methods": [tail.upper()] if tail in self._METHODS else []},
                    )
                )
        return items
|
||||
28
app/modules/rag/indexing/code/entrypoints/flask_detector.py
Normal file
28
app/modules/rag/indexing/code/entrypoints/flask_detector.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from hashlib import sha256
|
||||
|
||||
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
|
||||
|
||||
|
||||
class FlaskEntrypointDetector:
    """Detect HTTP entry points from decorators whose name contains ``.route``."""

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one ``Entrypoint`` per ``.route`` decorator found on ``symbols``."""
        found: list[Entrypoint] = []
        for symbol in symbols:
            # The match is case-insensitive; the original spelling is kept.
            route_decorators = [
                candidate for candidate in (symbol.decorators or []) if ".route" in candidate.lower()
            ]
            for decorator in route_decorators:
                digest = sha256(f"{path}|flask|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest()
                found.append(
                    Entrypoint(
                        entry_id=digest,
                        entry_type="http",
                        framework="flask",
                        route_or_command=decorator,
                        handler_symbol_id=symbol.symbol_id,
                        path=path,
                        start_line=symbol.start_line,
                        end_line=symbol.end_line,
                    )
                )
        return found
|
||||
27
app/modules/rag/indexing/code/entrypoints/registry.py
Normal file
27
app/modules/rag/indexing/code/entrypoints/registry.py
Normal file
@@ -0,0 +1,27 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class Entrypoint:
|
||||
entry_id: str
|
||||
entry_type: str
|
||||
framework: str
|
||||
route_or_command: str
|
||||
handler_symbol_id: str
|
||||
path: str
|
||||
start_line: int
|
||||
end_line: int
|
||||
metadata: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
class EntrypointDetectorRegistry:
    """Run a fixed list of detectors and merge their results."""

    def __init__(self, detectors: list) -> None:
        """Store ``detectors``; each must provide ``detect(path=..., symbols=...)``."""
        self._detectors = detectors

    def detect_all(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return every detector's results, concatenated in detector order."""
        return [
            entry
            for detector in self._detectors
            for entry in detector.detect(path=path, symbols=symbols)
        ]
|
||||
@@ -0,0 +1,29 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from hashlib import sha256
|
||||
|
||||
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
|
||||
|
||||
|
||||
class TyperClickEntrypointDetector:
    """Detect CLI entry points from ``.command`` / ``.callback`` decorators."""

    def detect(self, *, path: str, symbols: list) -> list[Entrypoint]:
        """Return one ``Entrypoint`` per matching decorator on ``symbols``."""
        found: list[Entrypoint] = []
        for symbol in symbols:
            for decorator in symbol.decorators or []:
                folded = decorator.lower()
                if ".command" in folded or ".callback" in folded:
                    # Anything that does not mention "typer" is reported as click.
                    if "typer" in folded:
                        framework = "typer"
                    else:
                        framework = "click"
                    digest = sha256(f"{path}|{framework}|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest()
                    found.append(
                        Entrypoint(
                            entry_id=digest,
                            entry_type="cli",
                            framework=framework,
                            route_or_command=decorator,
                            handler_symbol_id=symbol.symbol_id,
                            path=path,
                            start_line=symbol.start_line,
                            end_line=symbol.end_line,
                        )
                    )
        return found
|
||||
13
app/modules/rag/indexing/code/file_filter.py
Normal file
13
app/modules/rag/indexing/code/file_filter.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
|
||||
class PythonFileFilter:
    """Decide whether a repository path should be indexed as Python code."""

    _EXCLUDE_PARTS = {"venv", ".venv", "__pycache__", "node_modules", ".git", "dist", "build"}

    def should_index(self, path: str) -> bool:
        """Return ``True`` for ``.py`` paths with no excluded path component.

        The suffix check is case-insensitive; excluded names are compared
        against whole components of the POSIX-style path.
        """
        candidate = PurePosixPath(path)
        is_python = candidate.suffix.lower() == ".py"
        if is_python:
            return self._EXCLUDE_PARTS.isdisjoint(candidate.parts)
        return False
|
||||
52
app/modules/rag/indexing/code/pipeline.py
Normal file
52
app/modules/rag/indexing/code/pipeline.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.contracts import RagDocument, RagSource
|
||||
from app.modules.rag.indexing.code.code_text.chunker import CodeTextChunker
|
||||
from app.modules.rag.indexing.code.code_text.document_builder import CodeTextDocumentBuilder
|
||||
from app.modules.rag.indexing.code.edges.document_builder import EdgeDocumentBuilder
|
||||
from app.modules.rag.indexing.code.edges.extractor import EdgeExtractor
|
||||
from app.modules.rag.indexing.code.entrypoints.document_builder import EntrypointDocumentBuilder
|
||||
from app.modules.rag.indexing.code.entrypoints.fastapi_detector import FastApiEntrypointDetector
|
||||
from app.modules.rag.indexing.code.entrypoints.flask_detector import FlaskEntrypointDetector
|
||||
from app.modules.rag.indexing.code.entrypoints.registry import EntrypointDetectorRegistry
|
||||
from app.modules.rag.indexing.code.entrypoints.typer_click_detector import TyperClickEntrypointDetector
|
||||
from app.modules.rag.indexing.code.file_filter import PythonFileFilter
|
||||
from app.modules.rag.indexing.code.symbols.ast_parser import PythonAstParser
|
||||
from app.modules.rag.indexing.code.symbols.document_builder import SymbolDocumentBuilder
|
||||
from app.modules.rag.indexing.code.symbols.extractor import SymbolExtractor
|
||||
|
||||
|
||||
class CodeIndexingPipeline:
    """Produce all code-layer documents for a single Python file."""

    def __init__(self) -> None:
        """Create the filter, extractors, detectors and document builders."""
        self._filter = PythonFileFilter()
        self._chunker = CodeTextChunker()
        self._code_builder = CodeTextDocumentBuilder()
        self._parser = PythonAstParser()
        self._symbols = SymbolExtractor()
        self._symbol_builder = SymbolDocumentBuilder()
        self._edges = EdgeExtractor()
        self._edge_builder = EdgeDocumentBuilder()
        detectors = [FastApiEntrypointDetector(), FlaskEntrypointDetector(), TyperClickEntrypointDetector()]
        self._entrypoints = EntrypointDetectorRegistry(detectors)
        self._entrypoint_builder = EntrypointDocumentBuilder()

    def supports(self, path: str) -> bool:
        """Return whether the file filter accepts ``path``."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Return the documents for ``content``.

        The result lists code-chunk documents first, then symbol, edge and
        entry-point documents, in that order.
        """
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)

        # Source-text chunks.
        docs: list[RagDocument] = [
            self._code_builder.build(source, chunk, chunk_index=position)
            for position, chunk in enumerate(self._chunker.chunk(path, content))
        ]

        # Symbols, then the edges and entry points derived from them.
        tree = self._parser.parse_module(content)
        symbols = self._symbols.extract(path, content, tree)
        docs.extend(self._symbol_builder.build(source, symbol) for symbol in symbols)
        docs.extend(self._edge_builder.build(source, edge) for edge in self._edges.extract(path, tree, symbols))
        docs.extend(
            self._entrypoint_builder.build(source, entrypoint)
            for entrypoint in self._entrypoints.detect_all(path=path, symbols=symbols)
        )
        return docs
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
11
app/modules/rag/indexing/code/symbols/ast_parser.py
Normal file
11
app/modules/rag/indexing/code/symbols/ast_parser.py
Normal file
@@ -0,0 +1,11 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
|
||||
|
||||
class PythonAstParser:
    """Parse Python source text, tolerating input that cannot be parsed."""

    def parse_module(self, text: str) -> ast.AST | None:
        """Return the AST of ``text``, or ``None`` when it cannot be parsed.

        Args:
            text: Source text of one module.

        Returns:
            The parsed module, or ``None`` on a syntax error or on text that
            contains NUL bytes.
        """
        try:
            return ast.parse(text)
        except (SyntaxError, ValueError):
            # Interpreters before 3.12 report NUL bytes as ValueError rather
            # than SyntaxError; both mean the file cannot be analysed.
            return None
|
||||
32
app/modules/rag/indexing/code/symbols/document_builder.py
Normal file
32
app/modules/rag/indexing/code/symbols/document_builder.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
|
||||
from app.modules.rag.indexing.code.symbols.extractor import PySymbol
|
||||
|
||||
|
||||
class SymbolDocumentBuilder:
    """Build ``RagDocument`` objects for the ``CODE_SYMBOL_CATALOG`` layer."""

    def build(self, source: RagSource, symbol: PySymbol) -> RagDocument:
        """Describe ``symbol`` as a document.

        The text is the ``kind qname`` header, the signature and the stripped
        docstring, one per line, with empty parts left out.
        """
        docstring = symbol.docstring.strip() if symbol.docstring else ""
        sections = (f"{symbol.kind} {symbol.qname}", symbol.signature, docstring)
        # Dotted name derived from the path: "/" becomes "." and a trailing
        # ".py" is dropped.
        module_name = source.path.replace("/", ".").removesuffix(".py")
        details = {
            "symbol_id": symbol.symbol_id,
            "qname": symbol.qname,
            "kind": symbol.kind,
            "signature": symbol.signature,
            "decorators_or_annotations": symbol.decorators,
            "docstring_or_javadoc": symbol.docstring,
            "parent_symbol_id": symbol.parent_symbol_id,
            "package_or_module": module_name,
            "is_entry_candidate": bool(symbol.decorators),
            "lang_payload": symbol.lang_payload,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_SYMBOL_CATALOG,
            lang="python",
            source=source,
            title=symbol.qname,
            text="\n".join(filter(None, sections)),
            span=RagSpan(symbol.start_line, symbol.end_line),
            metadata=details,
        )
|
||||
130
app/modules/rag/indexing/code/symbols/extractor.py
Normal file
130
app/modules/rag/indexing/code/symbols/extractor.py
Normal file
@@ -0,0 +1,130 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
from dataclasses import dataclass, field
|
||||
from hashlib import sha256
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class PySymbol:
|
||||
symbol_id: str
|
||||
qname: str
|
||||
kind: str
|
||||
path: str
|
||||
start_line: int
|
||||
end_line: int
|
||||
signature: str
|
||||
decorators: list[str] = field(default_factory=list)
|
||||
docstring: str | None = None
|
||||
parent_symbol_id: str | None = None
|
||||
lang_payload: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
class SymbolExtractor:
|
||||
def extract(self, path: str, text: str, ast_tree: ast.AST | None) -> list[PySymbol]:
|
||||
if ast_tree is None:
|
||||
return []
|
||||
collector = _SymbolVisitor(path)
|
||||
collector.visit(ast_tree)
|
||||
return collector.symbols
|
||||
|
||||
|
||||
class _SymbolVisitor(ast.NodeVisitor):
|
||||
def __init__(self, path: str) -> None:
|
||||
self._path = path
|
||||
self._stack: list[tuple[str, str]] = []
|
||||
self.symbols: list[PySymbol] = []
|
||||
|
||||
def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
|
||||
if self._stack:
|
||||
return
|
||||
module = node.module or ""
|
||||
for item in node.names:
|
||||
local_name = item.asname or item.name
|
||||
imported_name = f"{module}.{item.name}".strip(".")
|
||||
self.symbols.append(
|
||||
PySymbol(
|
||||
symbol_id=sha256(f"{self._path}|{local_name}|import_alias".encode("utf-8")).hexdigest(),
|
||||
qname=local_name,
|
||||
kind="const",
|
||||
path=self._path,
|
||||
start_line=int(getattr(node, "lineno", 1)),
|
||||
end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
|
||||
signature=f"{local_name} = {imported_name}",
|
||||
lang_payload={"imported_from": imported_name, "import_alias": True},
|
||||
)
|
||||
)
|
||||
self.generic_visit(node)
|
||||
|
||||
def visit_Import(self, node: ast.Import) -> None:
|
||||
if self._stack:
|
||||
return
|
||||
for item in node.names:
|
||||
local_name = item.asname or item.name
|
||||
self.symbols.append(
|
||||
PySymbol(
|
||||
symbol_id=sha256(f"{self._path}|{local_name}|import".encode("utf-8")).hexdigest(),
|
||||
qname=local_name,
|
||||
kind="const",
|
||||
path=self._path,
|
||||
start_line=int(getattr(node, "lineno", 1)),
|
||||
end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
|
||||
signature=f"import {item.name}",
|
||||
lang_payload={"imported_from": item.name, "import_alias": bool(item.asname)},
|
||||
)
|
||||
)
|
||||
self.generic_visit(node)
|
||||
|
||||
def visit_ClassDef(self, node: ast.ClassDef) -> None:
|
||||
self._add_symbol(node, "class", {"bases": [self._expr_name(base) for base in node.bases]})
|
||||
self.generic_visit(node)
|
||||
self._stack.pop()
|
||||
|
||||
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
|
||||
self._add_function(node, is_async=False)
|
||||
|
||||
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
|
||||
self._add_function(node, is_async=True)
|
||||
|
||||
def _add_function(self, node, *, is_async: bool) -> None:
|
||||
kind = "method" if self._stack and self._stack[-1][0] == "class" else "function"
|
||||
self._add_symbol(node, kind, {"async": is_async})
|
||||
self.generic_visit(node)
|
||||
self._stack.pop()
|
||||
|
||||
def _add_symbol(self, node, kind: str, lang_payload: dict) -> None:
|
||||
names = [name for _, name in self._stack] + [node.name]
|
||||
qname = ".".join(names)
|
||||
symbol_id = sha256(f"{self._path}|{qname}|{kind}".encode("utf-8")).hexdigest()
|
||||
signature = self._signature(node)
|
||||
symbol = PySymbol(
|
||||
symbol_id=symbol_id,
|
||||
qname=qname,
|
||||
kind=kind,
|
||||
path=self._path,
|
||||
start_line=int(getattr(node, "lineno", 1)),
|
||||
end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))),
|
||||
signature=signature,
|
||||
decorators=[self._expr_name(item) for item in getattr(node, "decorator_list", [])],
|
||||
docstring=ast.get_docstring(node),
|
||||
parent_symbol_id=self._stack[-1][1] if self._stack else None,
|
||||
lang_payload=lang_payload,
|
||||
)
|
||||
self.symbols.append(symbol)
|
||||
self._stack.append((kind, qname))
|
||||
|
||||
def _signature(self, node) -> str:
|
||||
if isinstance(node, ast.ClassDef):
|
||||
bases = ", ".join(self._expr_name(base) for base in node.bases)
|
||||
return f"{node.name}({bases})" if bases else node.name
|
||||
args = [arg.arg for arg in getattr(node.args, "args", [])]
|
||||
return f"{node.name}({', '.join(args)})"
|
||||
|
||||
def _expr_name(self, node) -> str:
|
||||
if isinstance(node, ast.Name):
|
||||
return node.id
|
||||
if isinstance(node, ast.Attribute):
|
||||
return f"{self._expr_name(node.value)}.{node.attr}"
|
||||
if isinstance(node, ast.Call):
|
||||
return self._expr_name(node.func)
|
||||
return ast.dump(node, include_attributes=False)
|
||||
Reference in New Issue
Block a user