from __future__ import annotations

from app.modules.rag.contracts import RagDocument, RagSource
from app.modules.rag.indexing.code.code_text.chunker import CodeTextChunker
from app.modules.rag.indexing.code.code_text.document_builder import CodeTextDocumentBuilder
from app.modules.rag.indexing.code.edges.document_builder import EdgeDocumentBuilder
from app.modules.rag.indexing.code.edges.extractor import EdgeExtractor
from app.modules.rag.indexing.code.entrypoints.document_builder import EntrypointDocumentBuilder
from app.modules.rag.indexing.code.entrypoints.fastapi_detector import FastApiEntrypointDetector
from app.modules.rag.indexing.code.entrypoints.flask_detector import FlaskEntrypointDetector
from app.modules.rag.indexing.code.entrypoints.registry import EntrypointDetectorRegistry
from app.modules.rag.indexing.code.entrypoints.typer_click_detector import TyperClickEntrypointDetector
from app.modules.rag.indexing.code.file_filter import PythonFileFilter
from app.modules.rag.indexing.code.symbols.ast_parser import PythonAstParser
from app.modules.rag.indexing.code.symbols.document_builder import SymbolDocumentBuilder
from app.modules.rag.indexing.code.symbols.extractor import SymbolExtractor


class CodeIndexingPipeline:
    """Turn one source file into a flat list of ``RagDocument`` objects.

    For a single file the pipeline emits, in this order: code-text chunk
    documents, symbol documents, edge documents and entrypoint documents.
    Every collaborator (filter, chunker, parser, extractors, builders and
    the entrypoint detector registry) is created in ``__init__``.
    """

    def __init__(self) -> None:
        """Instantiate the collaborators used by the pipeline stages."""
        # File selection and raw code-text stage.
        self._filter = PythonFileFilter()
        self._chunker = CodeTextChunker()
        self._code_builder = CodeTextDocumentBuilder()

        # Parsing and symbol stage.
        self._parser = PythonAstParser()
        self._symbols = SymbolExtractor()
        self._symbol_builder = SymbolDocumentBuilder()

        # Edge stage.
        self._edges = EdgeExtractor()
        self._edge_builder = EdgeDocumentBuilder()

        # Entrypoint stage: detectors are handed to the registry in this order.
        detectors = [
            FastApiEntrypointDetector(),
            FlaskEntrypointDetector(),
            TyperClickEntrypointDetector(),
        ]
        self._entrypoints = EntrypointDetectorRegistry(detectors)
        self._entrypoint_builder = EntrypointDocumentBuilder()

    def supports(self, path: str) -> bool:
        """Return the file filter's ``should_index`` verdict for *path*."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Build every document the pipeline produces for a single file.

        Args:
            repo_id: Repository identifier stored on the ``RagSource``.
            commit_sha: Commit identifier stored on the ``RagSource``; may be ``None``.
            path: File path, passed to the chunker, extractors and detectors.
            content: Full text of the file.

        Returns:
            Documents in stage order: code-text chunks, symbols, edges,
            entrypoints.
        """
        # NOTE(review): nothing here catches exceptions, so an error raised by
        # ``parse_module`` (or any other stage) propagates to the caller --
        # confirm that is what callers expect for unparsable files.
        origin = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        documents: list[RagDocument] = []

        # Stage 1: one document per code-text chunk, numbered from zero.
        documents.extend(
            self._code_builder.build(origin, piece, chunk_index=position)
            for position, piece in enumerate(self._chunker.chunk(path, content))
        )

        # Stage 2: parse once; the tree is shared by symbol and edge extraction.
        module_tree = self._parser.parse_module(content)
        found_symbols = self._symbols.extract(path, content, module_tree)
        documents.extend(self._symbol_builder.build(origin, item) for item in found_symbols)

        # Stage 3: edges are extracted from the same tree plus the symbols.
        found_edges = self._edges.extract(path, module_tree, found_symbols)
        documents.extend(self._edge_builder.build(origin, link) for link in found_edges)

        # Stage 4: entrypoints are detected from the extracted symbols.
        detected = self._entrypoints.detect_all(path=path, symbols=found_symbols)
        documents.extend(self._entrypoint_builder.build(origin, hit) for hit in detected)

        return documents