Files
agent/app/modules/rag/indexing/docs/pipeline.py
2026-03-01 14:21:33 +03:00

136 lines
7.1 KiB
Python

from __future__ import annotations
import re
import yaml
from app.modules.rag.contracts import RagDocument, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker
from app.modules.rag.indexing.docs.classifier import DocsClassifier
from app.modules.rag.indexing.docs.document_builder import DocsDocumentBuilder
from app.modules.rag.indexing.docs.file_filter import DocsFileFilter
class DocsIndexingPipeline:
    """Index a markdown documentation file into RAG documents.

    For each supported file the pipeline emits, in order: an optional
    module-catalog document, one document per markdown section, optional
    policy documents (frontmatter ``type: policy`` only, first section
    only), and fact documents extracted from frontmatter links, tables,
    and bullet lists.
    """

    # Bullet ("-", "*") or any "N." numbered list marker. The previous
    # implementation only recognized items numbered 1.-3.
    _LIST_MARKER = re.compile(r"[-*]|\d+\.")

    def __init__(self) -> None:
        self._filter = DocsFileFilter()
        self._classifier = DocsClassifier()
        self._chunker = MarkdownDocChunker()
        self._builder = DocsDocumentBuilder()

    def supports(self, path: str) -> bool:
        """Return True when this pipeline should index ``path``."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Index one documentation file and return all produced documents.

        Args:
            repo_id: Repository identifier for the source reference.
            commit_sha: Commit the content was read at, if known.
            path: Repository-relative file path.
            content: Raw markdown text (optionally with YAML frontmatter).
        """
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        frontmatter, body = self._split_frontmatter(content)
        doc_kind = self._classifier.classify(path)
        sections = self._chunker.chunk(body)
        summary_text = self._summary_from_sections(sections)
        docs: list[RagDocument] = []
        module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind)
        if module_doc is not None:
            docs.append(module_doc)
        for section in sections:
            docs.append(self._builder.build_section(source, section, frontmatter, doc_kind))
        if str(frontmatter.get("type") or "").strip() == "policy":
            # Only the first section feeds the policy document.
            for section in sections[:1]:
                policy = self._builder.build_policy(source, frontmatter, section, doc_kind)
                if policy is not None:
                    docs.append(policy)
        docs.extend(self._extract_facts(source, frontmatter, sections))
        return docs

    def _split_frontmatter(self, content: str) -> tuple[dict, str]:
        """Split a leading ``---`` YAML frontmatter block from the body.

        Never raises on malformed input: an unterminated frontmatter block
        or invalid YAML degrades to an empty mapping instead of crashing
        indexing of the whole file (the old code raised ValueError on a
        missing closing ``---`` and propagated yaml.YAMLError).
        """
        if not content.startswith("---\n"):
            return {}, content
        parts = content.split("---", 2)
        if len(parts) < 3:
            # Opening delimiter without a closing one: treat as plain body.
            return {}, content
        _, raw, body = parts
        try:
            payload = yaml.safe_load(raw) or {}
        except yaml.YAMLError:
            payload = {}
        return payload if isinstance(payload, dict) else {}, body.strip()

    def _summary_from_sections(self, sections) -> str:
        """Build a short summary from the first two sections, capped at 4000 chars."""
        text = "\n\n".join(section.content for section in sections[:2]).strip()
        return text[:4000]

    def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]:
        """Extract fact documents from frontmatter links and section content."""
        subject_id = str(frontmatter.get("id") or source.path)
        docs: list[RagDocument] = []
        links = frontmatter.get("links") or {}
        if not isinstance(links, dict):
            # Malformed frontmatter: "links" must be a mapping; ignore otherwise
            # (the old code raised AttributeError on a list/scalar).
            links = {}
        for predicate, values in links.items():
            if isinstance(values, str):
                # Accept a single scalar where a list is expected; iterating a
                # bare string would otherwise emit one fact per character.
                values = [values]
            for value in values or []:
                docs.append(
                    self._builder.build_fact(
                        source,
                        subject_id=subject_id,
                        predicate=predicate,
                        obj=str(value),
                        object_ref=str(value),
                        anchor="frontmatter.links",
                    )
                )
        for section in sections:
            docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content))
            docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content))
        return docs

    def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Parse a markdown table into fact documents.

        Expects a header row, a separator row, then data rows. Rows whose
        cell count does not match the header are skipped.
        """
        lines = [line.strip() for line in content.splitlines() if line.strip()]
        if len(lines) < 3 or "|" not in lines[0]:
            return []
        headers = [part.strip().lower() for part in lines[0].strip("|").split("|")]
        if not all(headers):
            return []
        docs: list[RagDocument] = []
        for row in lines[2:]:  # lines[1] is the |---| separator row
            if "|" not in row:
                continue
            values = [part.strip() for part in row.strip("|").split("|")]
            if len(values) != len(headers):
                continue
            payload = dict(zip(headers, values))
            docs.extend(self._facts_from_row(source, subject_id, title, payload))
        return docs

    def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]:
        """Map one table row to fact documents, keyed on the section title."""
        docs: list[RagDocument] = []
        lowered_title = title.lower()
        name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code")
        if "request" in lowered_title or "response" in lowered_title:
            if name:
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_field", obj=name, object_ref=None, anchor=title))
            if payload.get("required"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_required", obj=f"{name}:{payload['required']}", object_ref=None, anchor=title))
            if payload.get("type"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_type", obj=f"{name}:{payload['type']}", object_ref=None, anchor=title))
            if payload.get("validation"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_validation", obj=f"{name}:{payload['validation']}", object_ref=None, anchor=title))
        if "error" in lowered_title:
            if payload.get("status"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_status", obj=payload["status"], object_ref=None, anchor=title))
            error_value = payload.get("error") or payload.get("code")
            if error_value:
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_error", obj=error_value, object_ref=None, anchor=title))
            if payload.get("client action"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="client_action", obj=payload["client action"], object_ref=None, anchor=title))
        if "constraint" in lowered_title and name:
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_constraint", obj=name, object_ref=None, anchor=title))
        return docs

    def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Turn bullet / numbered list items into fact documents.

        ``metric:`` / ``event:`` / ``log:`` prefixes select a dedicated
        predicate; everything else becomes ``validates_rule`` when the
        section title mentions rules, otherwise ``client_action``.
        """
        docs: list[RagDocument] = []
        fallback = "validates_rule" if "rule" in title.lower() else "client_action"
        for line in content.splitlines():
            item = line.strip()
            if not self._LIST_MARKER.match(item):
                continue
            normalized = re.sub(r"^[-*0-9. ]+", "", item).strip()
            if not normalized:
                # Marker-only lines (e.g. "- " or a "---" rule) carry no fact.
                continue
            lowered = normalized.lower()
            if lowered.startswith("metric:"):
                predicate = "emits_metric"
            elif lowered.startswith("event:"):
                predicate = "emits_analytics_event"
            elif lowered.startswith("log:"):
                predicate = "logs_event"
            else:
                predicate = fallback
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title))
        return docs