from __future__ import annotations import re import yaml from app.modules.rag.contracts import RagDocument, RagSource from app.modules.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker from app.modules.rag.indexing.docs.classifier import DocsClassifier from app.modules.rag.indexing.docs.document_builder import DocsDocumentBuilder from app.modules.rag.indexing.docs.file_filter import DocsFileFilter class DocsIndexingPipeline: def __init__(self) -> None: self._filter = DocsFileFilter() self._classifier = DocsClassifier() self._chunker = MarkdownDocChunker() self._builder = DocsDocumentBuilder() def supports(self, path: str) -> bool: return self._filter.should_index(path) def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]: source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path) frontmatter, body = self._split_frontmatter(content) doc_kind = self._classifier.classify(path) sections = self._chunker.chunk(body) summary_text = self._summary_from_sections(sections) docs: list[RagDocument] = [] module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind) if module_doc is not None: docs.append(module_doc) for section in sections: docs.append(self._builder.build_section(source, section, frontmatter, doc_kind)) if str(frontmatter.get("type") or "").strip() == "policy": for section in sections[:1]: policy = self._builder.build_policy(source, frontmatter, section, doc_kind) if policy is not None: docs.append(policy) docs.extend(self._extract_facts(source, frontmatter, sections)) return docs def _split_frontmatter(self, content: str) -> tuple[dict, str]: if not content.startswith("---\n"): return {}, content _, raw, body = content.split("---", 2) payload = yaml.safe_load(raw) or {} return payload if isinstance(payload, dict) else {}, body.strip() def _summary_from_sections(self, sections) -> str: text = "\n\n".join(section.content for section in sections[:2]).strip() return text[:4000] def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]: subject_id = str(frontmatter.get("id") or source.path) docs: list[RagDocument] = [] links = frontmatter.get("links") or {} for predicate, values in links.items(): for value in values or []: docs.append( self._builder.build_fact( source, subject_id=subject_id, predicate=predicate, obj=str(value), object_ref=str(value), anchor="frontmatter.links", ) ) for section in sections: docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content)) docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content)) return docs def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]: lines = [line.strip() for line in content.splitlines() if line.strip()] if len(lines) < 3 or "|" not in lines[0]: return [] headers = [part.strip().lower() for part in lines[0].strip("|").split("|")] if not all(headers): return [] docs: list[RagDocument] = [] for row in lines[2:]: if "|" not in row: continue values = [part.strip() for part in row.strip("|").split("|")] if len(values) != len(headers): continue payload = dict(zip(headers, values)) docs.extend(self._facts_from_row(source, subject_id, title, payload)) return docs def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]: docs: list[RagDocument] = [] name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code") if "request" in title.lower() or "response" in title.lower(): if name: docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_field", obj=name, object_ref=None, anchor=title)) if payload.get("required"): docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_required", obj=f"{name}:{payload['required']}", object_ref=None, anchor=title)) if payload.get("type"): docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_type", obj=f"{name}:{payload['type']}", object_ref=None, anchor=title)) if payload.get("validation"): docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_validation", obj=f"{name}:{payload['validation']}", object_ref=None, anchor=title)) if "error" in title.lower(): if payload.get("status"): docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_status", obj=payload["status"], object_ref=None, anchor=title)) if payload.get("error") or payload.get("code"): error_value = payload.get("error") or payload.get("code") docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_error", obj=error_value, object_ref=None, anchor=title)) if payload.get("client action"): docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="client_action", obj=payload["client action"], object_ref=None, anchor=title)) if "constraint" in title.lower() and name: docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_constraint", obj=name, object_ref=None, anchor=title)) return docs def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]: docs: list[RagDocument] = [] for line in content.splitlines(): item = line.strip() if not item.startswith(("-", "*", "1.", "2.", "3.")): continue normalized = re.sub(r"^[-*0-9. ]+", "", item).strip() lowered = normalized.lower() if lowered.startswith("metric:"): predicate = "emits_metric" elif lowered.startswith("event:"): predicate = "emits_analytics_event" elif lowered.startswith("log:"): predicate = "logs_event" else: predicate = "validates_rule" if "rule" in title.lower() else "client_action" docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title)) return docs