136 lines
7.1 KiB
Python
136 lines
7.1 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
import yaml
|
|
|
|
from app.modules.rag.contracts import RagDocument, RagSource
|
|
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker
|
|
from app.modules.rag.indexing.docs.classifier import DocsClassifier
|
|
from app.modules.rag.indexing.docs.document_builder import DocsDocumentBuilder
|
|
from app.modules.rag.indexing.docs.file_filter import DocsFileFilter
|
|
|
|
|
|
class DocsIndexingPipeline:
    """Turn one documentation file into a list of ``RagDocument`` records.

    The pipeline composes a file filter, a path classifier, a markdown chunker
    and a document builder. For each file, :meth:`index_file` emits, in order:
    an optional module-catalog document, one document per section, an optional
    policy document (only when the frontmatter ``type`` is ``"policy"``), and
    fact documents derived from frontmatter links, tables and list items.
    """

    # Upper bound, in characters, of the summary handed to the module catalog.
    _SUMMARY_MAX_CHARS = 4000

    # A markdown list item: "- x", "* x", "+ x", "1. x" or "12) x". The marker
    # must be followed by whitespace so that horizontal rules ("---") and
    # emphasis ("**bold**") are not mistaken for list items, and only the
    # marker is removed so item text that starts with a digit stays intact.
    _LIST_ITEM_RE = re.compile(r"^(?:[-*+]|\d+[.)])\s+(.*)$")

    def __init__(self) -> None:
        """Create the collaborators used by the pipeline."""
        self._filter = DocsFileFilter()
        self._classifier = DocsClassifier()
        self._chunker = MarkdownDocChunker()
        self._builder = DocsDocumentBuilder()

    def supports(self, path: str) -> bool:
        """Return whether the file filter accepts *path* for indexing."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Build every document derived from a single file.

        Args:
            repo_id: Repository identifier stored on the ``RagSource``.
            commit_sha: Commit identifier stored on the ``RagSource``; may be ``None``.
            path: File path; used for the source and for classification.
            content: Raw file text, optionally starting with YAML frontmatter.

        Returns:
            Documents in emission order: module catalog (if the builder
            returns one), sections, policy (if applicable), then facts.
        """
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        frontmatter, body = self._split_frontmatter(content)
        doc_kind = self._classifier.classify(path)
        sections = self._chunker.chunk(body)
        summary_text = self._summary_from_sections(sections)

        docs: list[RagDocument] = []
        module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind)
        if module_doc is not None:
            docs.append(module_doc)

        docs.extend(
            self._builder.build_section(source, section, frontmatter, doc_kind)
            for section in sections
        )

        if str(frontmatter.get("type") or "").strip() == "policy":
            # Only the first section is offered to the policy builder.
            for section in sections[:1]:
                policy = self._builder.build_policy(source, frontmatter, section, doc_kind)
                if policy is not None:
                    docs.append(policy)

        docs.extend(self._extract_facts(source, frontmatter, sections))
        return docs

    def _split_frontmatter(self, content: str) -> tuple[dict, str]:
        """Separate leading YAML frontmatter from the document body.

        Frontmatter is recognised only when the very first line is ``---`` and
        a later line consisting solely of ``---`` closes it. The delimiter is
        matched per line, so a ``---`` substring inside a YAML value does not
        end the block. When the closing delimiter is missing, the input is
        treated as having no frontmatter instead of raising.

        Returns:
            ``(frontmatter, body)``. ``frontmatter`` is ``{}`` when absent,
            empty, or not a mapping. ``body`` is stripped when frontmatter was
            found and returned unchanged otherwise.

        Raises:
            yaml.YAMLError: Propagated from ``yaml.safe_load`` when the
                frontmatter block is not valid YAML.
        """
        if not content.startswith(("---\n", "---\r\n")):
            return {}, content

        lines = content.splitlines(keepends=True)
        closing = next(
            (index for index, line in enumerate(lines[1:], start=1) if line.rstrip() == "---"),
            None,
        )
        if closing is None:
            # Unterminated block: nothing can be parsed reliably.
            return {}, content

        payload = yaml.safe_load("".join(lines[1:closing])) or {}
        body = "".join(lines[closing + 1:]).strip()
        return (payload if isinstance(payload, dict) else {}), body

    def _summary_from_sections(self, sections) -> str:
        """Join the content of the first two sections into a bounded summary."""
        text = "\n\n".join(section.content for section in sections[:2]).strip()
        return text[: self._SUMMARY_MAX_CHARS]

    @staticmethod
    def _as_values(values) -> list:
        """Normalise one frontmatter link entry to a list of values.

        Falsy entries yield ``[]``. Strings and non-iterable scalars are
        wrapped so they count as a single value rather than being iterated
        character by character (or raising); other iterables are listed as-is.
        """
        if not values:
            return []
        if isinstance(values, (str, bytes)):
            return [values]
        try:
            return list(values)
        except TypeError:
            return [values]

    def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]:
        """Collect fact documents from frontmatter links and section content.

        The fact subject is the frontmatter ``id`` when present, otherwise the
        source path. Each ``links`` entry maps a predicate to one value or a
        collection of values; a ``links`` value that is not a mapping is
        ignored. Every section then contributes table and list facts.
        """
        subject_id = str(frontmatter.get("id") or source.path)
        docs: list[RagDocument] = []

        links = frontmatter.get("links")
        if isinstance(links, dict):
            for predicate, values in links.items():
                for value in self._as_values(values):
                    docs.append(
                        self._builder.build_fact(
                            source,
                            subject_id=subject_id,
                            predicate=predicate,
                            obj=str(value),
                            object_ref=str(value),
                            anchor="frontmatter.links",
                        )
                    )

        for section in sections:
            docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content))
            docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content))
        return docs

    def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Extract facts from a pipe table at the top of a section.

        NOTE(review): a table is only recognised when the first non-blank line
        of *content* is its header row; tables preceded by prose are skipped.
        The second non-blank line is assumed to be the separator row and is
        not inspected.
        """
        lines = [line.strip() for line in content.splitlines() if line.strip()]
        if len(lines) < 3 or "|" not in lines[0]:
            return []

        headers = [part.strip().lower() for part in lines[0].strip("|").split("|")]
        if not all(headers):
            return []

        docs: list[RagDocument] = []
        for row in lines[2:]:
            if "|" not in row:
                continue
            values = [part.strip() for part in row.strip("|").split("|")]
            if len(values) != len(headers):
                # Ragged rows cannot be mapped onto the header reliably.
                continue
            docs.extend(self._facts_from_row(source, subject_id, title, dict(zip(headers, values))))
        return docs

    def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]:
        """Translate one table row into facts, keyed off the section title.

        *payload* maps lower-cased header names to cell text. Titles that
        contain "request"/"response" yield field facts, titles that contain
        "error" yield status/error/client-action facts, and titles that
        contain "constraint" yield a constraint fact. Field facts are emitted
        only when the row has a name (``field``, ``name``, ``column`` or
        ``code``), so objects such as ``"None:yes"`` are never produced.
        """
        lowered_title = title.lower()
        name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code")
        docs: list[RagDocument] = []

        def fact(predicate: str, obj) -> RagDocument:
            return self._builder.build_fact(
                source, subject_id=subject_id, predicate=predicate, obj=obj, object_ref=None, anchor=title
            )

        if name and ("request" in lowered_title or "response" in lowered_title):
            docs.append(fact("has_field", name))
            for column, predicate in (
                ("required", "field_required"),
                ("type", "field_type"),
                ("validation", "field_validation"),
            ):
                if payload.get(column):
                    docs.append(fact(predicate, f"{name}:{payload[column]}"))

        if "error" in lowered_title:
            if payload.get("status"):
                docs.append(fact("returns_status", payload["status"]))
            error_value = payload.get("error") or payload.get("code")
            if error_value:
                docs.append(fact("returns_error", error_value))
            if payload.get("client action"):
                docs.append(fact("client_action", payload["client action"]))

        if "constraint" in lowered_title and name:
            docs.append(fact("has_constraint", name))
        return docs

    def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Extract one fact per markdown list item in *content*.

        Items prefixed with ``metric:``, ``event:`` or ``log:`` (case
        insensitive) map to dedicated predicates. Any other item becomes
        ``validates_rule`` when the section title contains "rule", otherwise
        ``client_action``. Lines that are not list items, and items with no
        text beyond marker characters, are skipped.
        """
        fallback = "validates_rule" if "rule" in title.lower() else "client_action"
        docs: list[RagDocument] = []
        for line in content.splitlines():
            match = self._LIST_ITEM_RE.match(line.strip())
            if match is None:
                continue
            normalized = match.group(1).strip()
            if not normalized.strip("-*_ "):
                # Empty item or a spaced horizontal rule such as "* * *".
                continue

            lowered = normalized.lower()
            if lowered.startswith("metric:"):
                predicate = "emits_metric"
            elif lowered.startswith("event:"):
                predicate = "emits_analytics_event"
            elif lowered.startswith("log:"):
                predicate = "logs_event"
            else:
                predicate = fallback
            docs.append(
                self._builder.build_fact(
                    source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title
                )
            )
        return docs
|