from __future__ import annotations

from dataclasses import asdict, dataclass, field
from hashlib import sha256

from app.modules.rag.contracts.evidence import EvidenceLink


@dataclass(slots=True)
class RagSource:
    """Origin of a document: repository id, optional commit SHA, and path."""

    repo_id: str
    commit_sha: str | None
    path: str

    def to_dict(self) -> dict:
        """Return the fields of this source as a plain dict."""
        return asdict(self)


@dataclass(slots=True)
class RagSpan:
    """Optional start/end line pair attached to a document."""

    start_line: int | None = None
    end_line: int | None = None

    def to_dict(self) -> dict:
        """Return the fields of this span as a plain dict."""
        return asdict(self)


@dataclass(slots=True)
class RagDocument:
    """A document with its source, text, metadata, links and optional span.

    ``doc_id`` may be supplied by the caller; otherwise it is derived lazily
    by :meth:`ensure_doc_id` and cached on the instance.
    """

    layer: str
    source: RagSource
    title: str
    text: str
    metadata: dict = field(default_factory=dict)
    links: list[EvidenceLink] = field(default_factory=list)
    span: RagSpan | None = None
    doc_id: str | None = None
    lang: str | None = None
    embedding: list[float] | None = None

    def ensure_doc_id(self) -> str:
        """Return ``doc_id``, computing and storing it first if it is unset.

        The id is the SHA-256 hex digest of the ``|``-joined identity fields:
        layer, lang, repo id, commit SHA, path, the metadata ``symbol_id``
        (falling back to ``module_id``), title, and a ``:start:end`` span
        suffix when a span is present.

        Returns:
            The existing or newly computed document id.
        """
        if self.doc_id:
            return self.doc_id

        span_key = ""
        if self.span is not None:
            span_key = f":{self.span.start_line}:{self.span.end_line}"

        # A metadata id may be missing, None, or a non-string value; str.join
        # accepts only strings, so fall back to "" and coerce explicitly.
        # String ids hash exactly as they did before this coercion was added.
        entity_id = (
            self.metadata.get("symbol_id")
            or self.metadata.get("module_id")
            or ""
        )

        raw = "|".join(
            [
                self.layer,
                self.lang or "",
                self.source.repo_id,
                self.source.commit_sha or "",
                self.source.path,
                str(entity_id),
                self.title,
                span_key,
            ]
        )
        self.doc_id = sha256(raw.encode("utf-8")).hexdigest()
        return self.doc_id

    def to_record(self) -> dict:
        """Flatten the document into a single dict.

        Calling this also assigns ``doc_id`` if it was not set yet. The
        metadata is shallow-copied, each link is converted through its
        ``to_dict()`` method, and a missing or empty embedding becomes ``[]``.

        Returns:
            A dict with the keys ``doc_id``, ``layer``, ``lang``, ``repo_id``,
            ``commit_sha``, ``path``, ``title``, ``text``, ``metadata``,
            ``links``, ``span_start``, ``span_end`` and ``embedding``.
        """
        span = self.span
        has_span = span is not None
        return {
            "doc_id": self.ensure_doc_id(),
            "layer": self.layer,
            "lang": self.lang,
            "repo_id": self.source.repo_id,
            "commit_sha": self.source.commit_sha,
            "path": self.source.path,
            "title": self.title,
            "text": self.text,
            "metadata": dict(self.metadata),
            "links": [link.to_dict() for link in self.links],
            "span_start": span.start_line if has_span else None,
            "span_end": span.end_line if has_span else None,
            "embedding": self.embedding or [],
        }