from __future__ import annotations

from hashlib import sha256

from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import SectionChunk

# Maximum number of characters kept from summary / policy text.
_MAX_TEXT_CHARS = 4000

# Relation names copied from the frontmatter ``links`` mapping into top-level
# metadata keys of a module-catalog document.
_LINK_RELATIONS = (
    "calls_api",
    "called_by",
    "uses_logic",
    "used_by",
    "reads_db",
    "writes_db",
    "integrates_with",
    "emits_events",
    "consumes_events",
)


class DocsDocumentBuilder:
    """Build ``RagDocument`` records for the docs layers.

    Each ``build_*`` method assembles a metadata dict from the parsed
    frontmatter (and, where given, a ``SectionChunk``) and wraps it in a
    ``RagDocument`` tagged with the matching ``RagLayer`` member.
    """

    def build_module_catalog(
        self,
        source: RagSource,
        frontmatter: dict,
        summary_text: str,
        doc_kind: str,
    ) -> RagDocument | None:
        """Build the module-catalog document for one source file.

        Args:
            source: Source descriptor; ``source.path`` is stored in metadata.
            frontmatter: Parsed frontmatter of the document.
            summary_text: Summary text; only the first ``_MAX_TEXT_CHARS``
                characters are kept.
            doc_kind: Stored verbatim under the ``doc_kind`` metadata key.

        Returns:
            A ``RagDocument`` in ``RagLayer.DOCS_MODULE_CATALOG``, or ``None``
            when ``id``, ``type`` or ``domain`` is missing or blank.
        """
        module_id = str(frontmatter.get("id") or "").strip()
        module_type = str(frontmatter.get("type") or "").strip()
        domain = str(frontmatter.get("domain") or "").strip()
        if not (module_id and module_type and domain):
            return None

        # ``links`` is expected to be a mapping; a missing value or any other
        # shape (list, string, ...) is treated as "no links" instead of
        # raising on ``.get``.
        raw_links = frontmatter.get("links")
        links = raw_links if isinstance(raw_links, dict) else {}
        summary = summary_text[:_MAX_TEXT_CHARS]

        metadata = {
            "module_id": module_id,
            "type": module_type,
            "domain": domain,
            "status": frontmatter.get("status"),
            "version": frontmatter.get("version"),
            "tags": frontmatter.get("tags") or [],
            "owners": frontmatter.get("owners") or [],
            "links": links,
            "source_path": source.path,
            "summary_text": summary,
            "doc_kind": doc_kind,
        }
        # Flatten every known relation into its own key. ``or []`` also covers
        # an explicit null in the frontmatter, matching ``tags`` / ``owners``.
        for relation in _LINK_RELATIONS:
            metadata[relation] = links.get(relation) or []

        return RagDocument(
            layer=RagLayer.DOCS_MODULE_CATALOG,
            source=source,
            title=module_id,
            # Fall back to the id so the document text is never empty.
            text=summary or module_id,
            metadata=metadata,
        )

    def build_section(
        self,
        source: RagSource,
        chunk: SectionChunk,
        frontmatter: dict,
        doc_kind: str,
    ) -> RagDocument:
        """Build the section-index document for a single markdown chunk.

        Args:
            source: Source descriptor; ``source.path`` is the fallback module
                id and is stored in metadata.
            chunk: Section chunk supplying ``section_path``,
                ``section_title``, ``order`` and ``content``.
            frontmatter: Parsed frontmatter of the document.
            doc_kind: Stored verbatim under the ``doc_kind`` metadata key.

        Returns:
            A ``RagDocument`` in ``RagLayer.DOCS_SECTION_INDEX`` whose text is
            the chunk content.
        """
        # Strip the id the same way ``build_module_catalog`` does so both
        # layers agree on ``module_id``; a blank id falls back to the path.
        module_id = str(frontmatter.get("id") or "").strip() or str(source.path)
        metadata = {
            "module_id": module_id,
            "type": frontmatter.get("type"),
            "domain": frontmatter.get("domain"),
            "tags": frontmatter.get("tags") or [],
            "section_path": chunk.section_path,
            "section_title": chunk.section_title,
            "order": chunk.order,
            "doc_kind": doc_kind,
            "source_path": source.path,
            "artifact_type": "DOCS",
        }
        return RagDocument(
            layer=RagLayer.DOCS_SECTION_INDEX,
            source=source,
            title=f"{module_id}:{chunk.section_title}",
            text=chunk.content,
            metadata=metadata,
        )

    def build_policy(
        self,
        source: RagSource,
        frontmatter: dict,
        chunk: SectionChunk,
        doc_kind: str,
    ) -> RagDocument | None:
        """Build the policy-index document for a single markdown chunk.

        Args:
            source: Source descriptor; ``source.path`` is stored in metadata.
            frontmatter: Parsed frontmatter of the document.
            chunk: Section chunk; its content (truncated to
                ``_MAX_TEXT_CHARS``) becomes both the ``rules`` metadata
                value and the document text.
            doc_kind: Stored verbatim under the ``doc_kind`` metadata key.

        Returns:
            A ``RagDocument`` in ``RagLayer.DOCS_POLICY_INDEX``, or ``None``
            when the frontmatter has no non-blank ``id``.
        """
        policy_id = str(frontmatter.get("id") or "").strip()
        if not policy_id:
            return None

        rules = chunk.content[:_MAX_TEXT_CHARS]
        metadata = {
            "policy_id": policy_id,
            # ``type`` is used when ``applies_to`` is absent or empty.
            "applies_to": frontmatter.get("applies_to") or frontmatter.get("type"),
            "rules": rules,
            "default_behaviors": frontmatter.get("default_behaviors") or [],
            "doc_kind": doc_kind,
            "section_path": chunk.section_path,
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_POLICY_INDEX,
            source=source,
            title=policy_id,
            text=rules,
            metadata=metadata,
        )

    def build_fact(
        self,
        source: RagSource,
        *,
        subject_id: str,
        predicate: str,
        obj: str,
        object_ref: str | None,
        anchor: str,
        tags: list[str] | None = None,
    ) -> RagDocument:
        """Build the fact-index document for one subject/predicate/object triple.

        Args:
            source: Source descriptor; ``source.path`` is part of the fact id
                and is stored in metadata and in the evidence link.
            subject_id: Subject of the fact.
            predicate: Predicate of the fact.
            obj: Object of the fact.
            object_ref: Optional reference stored under ``object_ref``.
            anchor: Anchor string; part of the fact id and used as the
                evidence link note.
            tags: Optional tags; ``None`` is stored as an empty list.

        Returns:
            A ``RagDocument`` in ``RagLayer.DOCS_FACT_INDEX`` carrying one
            ``EvidenceLink`` of type ``EvidenceType.DOC_FACT`` that targets
            the fact id.
        """
        # The id is the SHA-256 hex digest of the pipe-joined triple, source
        # path and anchor, so identical inputs always produce the same id.
        identity = f"{subject_id}|{predicate}|{obj}|{source.path}|{anchor}"
        fact_id = sha256(identity.encode("utf-8")).hexdigest()
        metadata = {
            "fact_id": fact_id,
            "subject_id": subject_id,
            "predicate": predicate,
            "object": obj,
            "object_ref": object_ref,
            "anchor": anchor,
            "tags": tags or [],
            "source_path": source.path,
        }
        evidence = EvidenceLink(
            type=EvidenceType.DOC_FACT,
            target_id=fact_id,
            path=source.path,
            note=anchor,
        )
        return RagDocument(
            layer=RagLayer.DOCS_FACT_INDEX,
            source=source,
            title=f"{subject_id}:{predicate}",
            text=f"{subject_id} {predicate} {obj}".strip(),
            metadata=metadata,
            links=[evidence],
        )