116 lines
4.3 KiB
Python
116 lines
4.3 KiB
Python
from __future__ import annotations
|
|
|
|
from hashlib import sha256
|
|
|
|
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource
|
|
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import SectionChunk
|
|
|
|
|
|
class DocsDocumentBuilder:
    """Assemble ``RagDocument`` records for the docs layers of the RAG index.

    Each ``build_*`` method targets one ``RagLayer`` (module catalog, section
    index, policy index, fact index) and packs the relevant frontmatter and
    chunk fields into the document's ``metadata`` dict. The builder keeps no
    instance state.
    """

    # Character cap applied to catalog summaries and policy rule text.
    _MAX_TEXT_CHARS = 4000

    # Relation names lifted from frontmatter ``links`` into top-level metadata keys.
    _LINK_RELATIONS = (
        "calls_api", "called_by", "uses_logic", "used_by", "reads_db", "writes_db",
        "integrates_with", "emits_events", "consumes_events",
    )

    def build_module_catalog(
        self,
        source: RagSource,
        frontmatter: dict,
        summary_text: str,
        doc_kind: str,
    ) -> RagDocument | None:
        """Build the module-catalog document for one documented module.

        Args:
            source: Origin of the document; ``source.path`` is stored in metadata.
            frontmatter: Parsed frontmatter mapping of the document.
            summary_text: Module summary; truncated to ``_MAX_TEXT_CHARS`` characters.
            doc_kind: Document kind label, stored verbatim in metadata.

        Returns:
            A ``RagDocument`` on the ``DOCS_MODULE_CATALOG`` layer, or ``None``
            when ``id``, ``type`` or ``domain`` is missing or blank.
        """
        module_id = str(frontmatter.get("id") or "").strip()
        module_type = str(frontmatter.get("type") or "").strip()
        domain = str(frontmatter.get("domain") or "").strip()
        if not (module_id and module_type and domain):
            return None

        # A ``links`` value that is not a mapping (e.g. a list) has no ``.get``;
        # treat it as "no links" rather than failing with AttributeError.
        links = frontmatter.get("links")
        if not isinstance(links, dict):
            links = {}

        summary = summary_text[: self._MAX_TEXT_CHARS]
        metadata = {
            "module_id": module_id,
            "type": module_type,
            "domain": domain,
            "status": frontmatter.get("status"),
            "version": frontmatter.get("version"),
            "tags": frontmatter.get("tags") or [],
            "owners": frontmatter.get("owners") or [],
            "links": links,
            "source_path": source.path,
            "summary_text": summary,
            "doc_kind": doc_kind,
        }
        # ``or []`` also covers relations declared with a null/empty value, so
        # every relation key holds a list-like value, matching ``tags``/``owners``.
        for relation in self._LINK_RELATIONS:
            metadata[relation] = links.get(relation) or []

        return RagDocument(
            layer=RagLayer.DOCS_MODULE_CATALOG,
            source=source,
            title=module_id,
            # Fall back to the id so the document text is never empty.
            text=summary or module_id,
            metadata=metadata,
        )

    def build_section(
        self,
        source: RagSource,
        chunk: SectionChunk,
        frontmatter: dict,
        doc_kind: str,
    ) -> RagDocument:
        """Build the section-index document for one markdown section chunk.

        Args:
            source: Origin of the document; ``source.path`` is stored in
                metadata and is the fallback module id.
            chunk: Section chunk supplying ``section_path``, ``section_title``,
                ``order`` and ``content``.
            frontmatter: Parsed frontmatter mapping of the document.
            doc_kind: Document kind label, stored verbatim in metadata.

        Returns:
            A ``RagDocument`` on the ``DOCS_SECTION_INDEX`` layer.
        """
        # Strip the id exactly as the catalog/policy builders do so the same
        # module gets the same ``module_id`` on every layer; a blank id falls
        # back to the source path.
        module_id = str(frontmatter.get("id") or "").strip() or str(source.path)
        metadata = {
            "module_id": module_id,
            "type": frontmatter.get("type"),
            "domain": frontmatter.get("domain"),
            "tags": frontmatter.get("tags") or [],
            "section_path": chunk.section_path,
            "section_title": chunk.section_title,
            "order": chunk.order,
            "doc_kind": doc_kind,
            "source_path": source.path,
            "artifact_type": "DOCS",
        }
        return RagDocument(
            layer=RagLayer.DOCS_SECTION_INDEX,
            source=source,
            title=f"{module_id}:{chunk.section_title}",
            text=chunk.content,
            metadata=metadata,
        )

    def build_policy(
        self,
        source: RagSource,
        frontmatter: dict,
        chunk: SectionChunk,
        doc_kind: str,
    ) -> RagDocument | None:
        """Build the policy-index document for one policy section chunk.

        Args:
            source: Origin of the document; ``source.path`` is stored in metadata.
            frontmatter: Parsed frontmatter mapping of the document.
            chunk: Section chunk whose ``content`` (truncated to
                ``_MAX_TEXT_CHARS`` characters) becomes the rules text.
            doc_kind: Document kind label, stored verbatim in metadata.

        Returns:
            A ``RagDocument`` on the ``DOCS_POLICY_INDEX`` layer, or ``None``
            when the frontmatter ``id`` is missing or blank.
        """
        policy_id = str(frontmatter.get("id") or "").strip()
        if not policy_id:
            return None

        rules = chunk.content[: self._MAX_TEXT_CHARS]
        metadata = {
            "policy_id": policy_id,
            # ``type`` is the fallback when ``applies_to`` is absent or empty.
            "applies_to": frontmatter.get("applies_to") or frontmatter.get("type"),
            "rules": rules,
            "default_behaviors": frontmatter.get("default_behaviors") or [],
            "doc_kind": doc_kind,
            "section_path": chunk.section_path,
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_POLICY_INDEX,
            source=source,
            title=policy_id,
            text=rules,
            metadata=metadata,
        )

    def build_fact(
        self,
        source: RagSource,
        *,
        subject_id: str,
        predicate: str,
        obj: str,
        object_ref: str | None,
        anchor: str,
        tags: list[str] | None = None,
    ) -> RagDocument:
        """Build the fact-index document for one subject/predicate/object fact.

        Args:
            source: Origin of the fact; ``source.path`` is part of the fact id
                and is stored in metadata.
            subject_id: Identifier of the fact's subject.
            predicate: Relation name of the fact.
            obj: Object of the fact as text.
            object_ref: Optional reference for the object, stored verbatim.
            anchor: Location label; part of the fact id and the evidence note.
            tags: Optional tags; ``None`` is stored as an empty list.

        Returns:
            A ``RagDocument`` on the ``DOCS_FACT_INDEX`` layer carrying a single
            ``DOC_FACT`` evidence link that targets the fact id.
        """
        # The id is a SHA-256 over the pipe-joined identity fields, so equal
        # inputs always map to the same id. The format must stay as is to keep
        # previously generated ids valid.
        identity = f"{subject_id}|{predicate}|{obj}|{source.path}|{anchor}"
        fact_id = sha256(identity.encode("utf-8")).hexdigest()
        metadata = {
            "fact_id": fact_id,
            "subject_id": subject_id,
            "predicate": predicate,
            "object": obj,
            "object_ref": object_ref,
            "anchor": anchor,
            "tags": tags or [],
            "source_path": source.path,
        }
        evidence = EvidenceLink(
            type=EvidenceType.DOC_FACT,
            target_id=fact_id,
            path=source.path,
            note=anchor,
        )
        return RagDocument(
            layer=RagLayer.DOCS_FACT_INDEX,
            source=source,
            title=f"{subject_id}:{predicate}",
            text=f"{subject_id} {predicate} {obj}".strip(),
            metadata=metadata,
            links=[evidence],
        )
|