Files
agent/app/modules/rag/indexing/docs/pipeline.py
2026-03-01 14:21:33 +03:00

136 lines
7.1 KiB
Python

from __future__ import annotations
import re
import yaml
from app.modules.rag.contracts import RagDocument, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker
from app.modules.rag.indexing.docs.classifier import DocsClassifier
from app.modules.rag.indexing.docs.document_builder import DocsDocumentBuilder
from app.modules.rag.indexing.docs.file_filter import DocsFileFilter
class DocsIndexingPipeline:
    """Index a markdown documentation file into RAG documents.

    For each supported file the pipeline emits, in order: an optional
    module-catalog document, one document per markdown section, optional
    policy documents (frontmatter ``type: policy`` only, first section
    only), and fact documents extracted from frontmatter links, tables,
    and bullet lists.
    """

    # Bullet ("-", "*") or any "N." numbered list marker. The previous
    # implementation only recognized items numbered 1.-3.
    _LIST_MARKER = re.compile(r"[-*]|\d+\.")

    def __init__(self) -> None:
        self._filter = DocsFileFilter()
        self._classifier = DocsClassifier()
        self._chunker = MarkdownDocChunker()
        self._builder = DocsDocumentBuilder()

    def supports(self, path: str) -> bool:
        """Return True when this pipeline should index ``path``."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Index one documentation file and return all produced documents.

        Args:
            repo_id: Repository identifier for the source reference.
            commit_sha: Commit the content was read at, if known.
            path: Repository-relative file path.
            content: Raw markdown text (optionally with YAML frontmatter).
        """
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        frontmatter, body = self._split_frontmatter(content)
        doc_kind = self._classifier.classify(path)
        sections = self._chunker.chunk(body)
        summary_text = self._summary_from_sections(sections)
        docs: list[RagDocument] = []
        module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind)
        if module_doc is not None:
            docs.append(module_doc)
        for section in sections:
            docs.append(self._builder.build_section(source, section, frontmatter, doc_kind))
        if str(frontmatter.get("type") or "").strip() == "policy":
            # Only the first section feeds the policy document.
            for section in sections[:1]:
                policy = self._builder.build_policy(source, frontmatter, section, doc_kind)
                if policy is not None:
                    docs.append(policy)
        docs.extend(self._extract_facts(source, frontmatter, sections))
        return docs

    def _split_frontmatter(self, content: str) -> tuple[dict, str]:
        """Split a leading ``---`` YAML frontmatter block from the body.

        Never raises on malformed input: an unterminated frontmatter block
        or invalid YAML degrades to an empty mapping instead of crashing
        indexing of the whole file (the old code raised ValueError on a
        missing closing ``---`` and propagated yaml.YAMLError).
        """
        if not content.startswith("---\n"):
            return {}, content
        parts = content.split("---", 2)
        if len(parts) < 3:
            # Opening delimiter without a closing one: treat as plain body.
            return {}, content
        _, raw, body = parts
        try:
            payload = yaml.safe_load(raw) or {}
        except yaml.YAMLError:
            payload = {}
        return payload if isinstance(payload, dict) else {}, body.strip()

    def _summary_from_sections(self, sections) -> str:
        """Build a short summary from the first two sections, capped at 4000 chars."""
        text = "\n\n".join(section.content for section in sections[:2]).strip()
        return text[:4000]

    def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]:
        """Extract fact documents from frontmatter links and section content."""
        subject_id = str(frontmatter.get("id") or source.path)
        docs: list[RagDocument] = []
        links = frontmatter.get("links") or {}
        if not isinstance(links, dict):
            # Malformed frontmatter: "links" must be a mapping; ignore otherwise
            # (the old code raised AttributeError on a list/scalar).
            links = {}
        for predicate, values in links.items():
            if isinstance(values, str):
                # Accept a single scalar where a list is expected; iterating a
                # bare string would otherwise emit one fact per character.
                values = [values]
            for value in values or []:
                docs.append(
                    self._builder.build_fact(
                        source,
                        subject_id=subject_id,
                        predicate=predicate,
                        obj=str(value),
                        object_ref=str(value),
                        anchor="frontmatter.links",
                    )
                )
        for section in sections:
            docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content))
            docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content))
        return docs

    def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Parse a markdown table into fact documents.

        Expects a header row, a separator row, then data rows. Rows whose
        cell count does not match the header are skipped.
        """
        lines = [line.strip() for line in content.splitlines() if line.strip()]
        if len(lines) < 3 or "|" not in lines[0]:
            return []
        headers = [part.strip().lower() for part in lines[0].strip("|").split("|")]
        if not all(headers):
            return []
        docs: list[RagDocument] = []
        for row in lines[2:]:  # lines[1] is the |---| separator row
            if "|" not in row:
                continue
            values = [part.strip() for part in row.strip("|").split("|")]
            if len(values) != len(headers):
                continue
            payload = dict(zip(headers, values))
            docs.extend(self._facts_from_row(source, subject_id, title, payload))
        return docs

    def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]:
        """Map one table row to fact documents, keyed on the section title."""
        docs: list[RagDocument] = []
        lowered_title = title.lower()
        name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code")
        if "request" in lowered_title or "response" in lowered_title:
            if name:
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_field", obj=name, object_ref=None, anchor=title))
            if payload.get("required"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_required", obj=f"{name}:{payload['required']}", object_ref=None, anchor=title))
            if payload.get("type"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_type", obj=f"{name}:{payload['type']}", object_ref=None, anchor=title))
            if payload.get("validation"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_validation", obj=f"{name}:{payload['validation']}", object_ref=None, anchor=title))
        if "error" in lowered_title:
            if payload.get("status"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_status", obj=payload["status"], object_ref=None, anchor=title))
            error_value = payload.get("error") or payload.get("code")
            if error_value:
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_error", obj=error_value, object_ref=None, anchor=title))
            if payload.get("client action"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="client_action", obj=payload["client action"], object_ref=None, anchor=title))
        if "constraint" in lowered_title and name:
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_constraint", obj=name, object_ref=None, anchor=title))
        return docs

    def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Turn bullet / numbered list items into fact documents.

        ``metric:`` / ``event:`` / ``log:`` prefixes select a dedicated
        predicate; everything else becomes ``validates_rule`` when the
        section title mentions rules, otherwise ``client_action``.
        """
        docs: list[RagDocument] = []
        fallback = "validates_rule" if "rule" in title.lower() else "client_action"
        for line in content.splitlines():
            item = line.strip()
            if not self._LIST_MARKER.match(item):
                continue
            normalized = re.sub(r"^[-*0-9. ]+", "", item).strip()
            if not normalized:
                # Marker-only lines (e.g. "- " or a "---" rule) carry no fact.
                continue
            lowered = normalized.lower()
            if lowered.startswith("metric:"):
                predicate = "emits_metric"
            elif lowered.startswith("event:"):
                predicate = "emits_analytics_event"
            elif lowered.startswith("log:"):
                predicate = "logs_event"
            else:
                predicate = fallback
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title))
        return docs