53 lines
2.9 KiB
Python
53 lines
2.9 KiB
Python
from __future__ import annotations
|
|
|
|
from app.modules.rag.contracts import RagDocument, RagSource
|
|
from app.modules.rag.indexing.code.code_text.chunker import CodeTextChunker
|
|
from app.modules.rag.indexing.code.code_text.document_builder import CodeTextDocumentBuilder
|
|
from app.modules.rag.indexing.code.edges.document_builder import EdgeDocumentBuilder
|
|
from app.modules.rag.indexing.code.edges.extractor import EdgeExtractor
|
|
from app.modules.rag.indexing.code.entrypoints.document_builder import EntrypointDocumentBuilder
|
|
from app.modules.rag.indexing.code.entrypoints.fastapi_detector import FastApiEntrypointDetector
|
|
from app.modules.rag.indexing.code.entrypoints.flask_detector import FlaskEntrypointDetector
|
|
from app.modules.rag.indexing.code.entrypoints.registry import EntrypointDetectorRegistry
|
|
from app.modules.rag.indexing.code.entrypoints.typer_click_detector import TyperClickEntrypointDetector
|
|
from app.modules.rag.indexing.code.file_filter import PythonFileFilter
|
|
from app.modules.rag.indexing.code.symbols.ast_parser import PythonAstParser
|
|
from app.modules.rag.indexing.code.symbols.document_builder import SymbolDocumentBuilder
|
|
from app.modules.rag.indexing.code.symbols.extractor import SymbolExtractor
|
|
|
|
|
|
class CodeIndexingPipeline:
    """Turn the content of one source file into ``RagDocument`` objects.

    The pipeline wires together a file filter, a code-text chunker, an AST
    parser, symbol and edge extractors, an entrypoint detector registry and
    one document builder per kind of document it emits.
    """

    def __init__(self) -> None:
        """Create every collaborator using its no-argument constructor."""
        # File selection and plain code-text chunks.
        self._filter = PythonFileFilter()
        self._chunker = CodeTextChunker()
        self._code_builder = CodeTextDocumentBuilder()

        # Module parsing and symbol documents.
        self._parser = PythonAstParser()
        self._symbols = SymbolExtractor()
        self._symbol_builder = SymbolDocumentBuilder()

        # Edge documents.
        self._edges = EdgeExtractor()
        self._edge_builder = EdgeDocumentBuilder()

        # Entrypoint detection; the registry receives the detectors in this order.
        detectors = [
            FastApiEntrypointDetector(),
            FlaskEntrypointDetector(),
            TyperClickEntrypointDetector(),
        ]
        self._entrypoints = EntrypointDetectorRegistry(detectors)
        self._entrypoint_builder = EntrypointDocumentBuilder()

    def supports(self, path: str) -> bool:
        """Return the file filter's verdict on whether *path* should be indexed."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Build every document the pipeline produces for a single file.

        Args:
            repo_id: Repository identifier stored on the document source.
            commit_sha: Commit identifier stored on the document source; may be ``None``.
            path: Path of the file, stored on the source and passed to the
                chunker, the extractors and the entrypoint detectors.
            content: Text of the file; it is chunked and parsed.

        Returns:
            The documents in emission order: code-text chunks first, then
            symbols, then edges, then entrypoints.
        """
        origin = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)

        # Code-text documents are built before the module is parsed.
        pieces = self._chunker.chunk(path, content)
        documents: list[RagDocument] = [
            self._code_builder.build(origin, piece, chunk_index=position)
            for position, piece in enumerate(pieces)
        ]

        module_tree = self._parser.parse_module(content)

        # The extracted symbols feed the symbol documents, the edge extractor
        # and the entrypoint detectors.
        found_symbols = self._symbols.extract(path, content, module_tree)
        documents.extend(self._symbol_builder.build(origin, item) for item in found_symbols)

        found_edges = self._edges.extract(path, module_tree, found_symbols)
        documents.extend(self._edge_builder.build(origin, link) for link in found_edges)

        documents.extend(
            self._entrypoint_builder.build(origin, entry)
            for entry in self._entrypoints.detect_all(path=path, symbols=found_symbols)
        )
        return documents
|