from __future__ import annotations import asyncio import hashlib import logging import os from collections.abc import Awaitable, Callable from inspect import isawaitable from app.modules.rag.contracts import RagDocument from app.modules.rag.indexing.code.pipeline import CodeIndexingPipeline from app.modules.rag.indexing.common.report import IndexReport from app.modules.rag.indexing.docs.pipeline import DocsIndexingPipeline from app.modules.rag.persistence.repository import RagRepository from app.modules.rag_session.embedding.gigachat_embedder import GigaChatEmbedder LOGGER = logging.getLogger(__name__) class RagService: def __init__( self, embedder: GigaChatEmbedder, repository: RagRepository, chunker=None, ) -> None: self._embedder = embedder self._repo = repository self._docs = DocsIndexingPipeline() self._code = CodeIndexingPipeline() async def index_snapshot( self, rag_session_id: str, files: list[dict], progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, ) -> tuple[int, int, int, int]: report = await self._index_files(rag_session_id, files, progress_cb=progress_cb) self._repo.replace_documents(rag_session_id, report.documents_list) return report.as_tuple() async def index_changes( self, rag_session_id: str, changed_files: list[dict], progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, ) -> tuple[int, int, int, int]: delete_paths: list[str] = [] upserts: list[dict] = [] for item in changed_files: if str(item.get("op")) == "delete": delete_paths.append(str(item.get("path", ""))) else: upserts.append(item) report = await self._index_files(rag_session_id, upserts, progress_cb=progress_cb) self._repo.apply_document_changes(rag_session_id, delete_paths, report.documents_list) return report.as_tuple() async def _index_files( self, rag_session_id: str, files: list[dict], progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, ) -> "_PipelineReport": total_files = len(files) report = _PipelineReport() repo_id = self._resolve_repo_id(rag_session_id) for index, file in enumerate(files, start=1): path = str(file.get("path", "")) try: blob_sha = self._blob_sha(file) cached = await asyncio.to_thread(self._repo.get_cached_documents, repo_id, blob_sha) pipelines = self._resolve_pipeline_names(path) if cached: report.documents_list.extend(self._with_file_metadata(cached, file, repo_id, blob_sha)) report.cache_hit_files += 1 LOGGER.warning( "rag ingest file: rag_session_id=%s path=%s processing=cache pipeline=%s", rag_session_id, path, ",".join(pipelines), ) else: built = self._build_documents(repo_id, path, file) embedded = await asyncio.to_thread(self._embed_documents, built, file, repo_id, blob_sha) report.documents_list.extend(embedded) await asyncio.to_thread(self._repo.cache_documents, repo_id, path, blob_sha, embedded) report.cache_miss_files += 1 LOGGER.warning( "rag ingest file: rag_session_id=%s path=%s processing=embed pipeline=%s", rag_session_id, path, ",".join(pipelines), ) report.indexed_files += 1 except Exception as exc: report.failed_files += 1 report.warnings.append(f"{path}: {exc}") await self._notify_progress(progress_cb, index, total_files, path) report.documents = len(report.documents_list) return report def _build_documents(self, repo_id: str, path: str, file: dict) -> list[RagDocument]: content = str(file.get("content") or "") commit_sha = file.get("commit_sha") docs: list[RagDocument] = [] if self._docs.supports(path): docs.extend(self._docs.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content)) if self._code.supports(path): docs.extend(self._code.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content)) if not docs: docs.extend(self._docs.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content)) return docs def _resolve_pipeline_names(self, path: str) -> list[str]: names: list[str] = [] if self._docs.supports(path): names.append("DOCS") if self._code.supports(path): names.append("CODE") if not names: names.append("DOCS") return names def _embed_documents(self, docs: list[RagDocument], file: dict, repo_id: str, blob_sha: str) -> list[RagDocument]: if not docs: return [] batch_size = max(1, int(os.getenv("RAG_EMBED_BATCH_SIZE", "16"))) metadata = self._document_metadata(file, repo_id, blob_sha) for doc in docs: doc.metadata.update(metadata) for start in range(0, len(docs), batch_size): batch = docs[start : start + batch_size] vectors = self._embedder.embed([doc.text for doc in batch]) for doc, vector in zip(batch, vectors): doc.embedding = vector return docs def _with_file_metadata(self, docs: list[RagDocument], file: dict, repo_id: str, blob_sha: str) -> list[RagDocument]: metadata = self._document_metadata(file, repo_id, blob_sha) for doc in docs: doc.metadata.update(metadata) doc.source.repo_id = repo_id doc.source.path = str(file.get("path", doc.source.path)) return docs def _document_metadata(self, file: dict, repo_id: str, blob_sha: str) -> dict: return { "blob_sha": blob_sha, "repo_id": repo_id, "artifact_type": file.get("artifact_type"), "section": file.get("section"), "doc_id": file.get("doc_id"), "doc_version": file.get("doc_version"), "owner": file.get("owner"), "system_component": file.get("system_component"), "last_modified": file.get("last_modified"), "staleness_score": file.get("staleness_score"), } def _resolve_repo_id(self, rag_session_id: str) -> str: session = self._repo.get_session(rag_session_id) if not session: return rag_session_id return str(session.get("project_id") or rag_session_id) def _blob_sha(self, file: dict) -> str: raw = str(file.get("content_hash") or "").strip() if raw: return raw content = str(file.get("content") or "") return hashlib.sha256(content.encode("utf-8")).hexdigest() async def _notify_progress( self, progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None, current_file_index: int, total_files: int, current_file_name: str, ) -> None: if not progress_cb: return result = progress_cb(current_file_index, total_files, current_file_name) if isawaitable(result): await result class _PipelineReport(IndexReport): def __init__(self) -> None: super().__init__() self.documents_list: list[RagDocument] = []