Files
agent/app/modules/rag/contracts/documents.py
2026-03-01 14:21:33 +03:00

78 lines
2.1 KiB
Python

from __future__ import annotations
from dataclasses import asdict, dataclass, field
from hashlib import sha256
from app.modules.rag.contracts.evidence import EvidenceLink
@dataclass(slots=True)
class RagSource:
repo_id: str
commit_sha: str | None
path: str
def to_dict(self) -> dict:
return asdict(self)
@dataclass(slots=True)
class RagSpan:
start_line: int | None = None
end_line: int | None = None
def to_dict(self) -> dict:
return asdict(self)
@dataclass(slots=True)
class RagDocument:
layer: str
source: RagSource
title: str
text: str
metadata: dict = field(default_factory=dict)
links: list[EvidenceLink] = field(default_factory=list)
span: RagSpan | None = None
doc_id: str | None = None
lang: str | None = None
embedding: list[float] | None = None
def ensure_doc_id(self) -> str:
if self.doc_id:
return self.doc_id
span_key = ""
if self.span is not None:
span_key = f":{self.span.start_line}:{self.span.end_line}"
raw = "|".join(
[
self.layer,
self.lang or "",
self.source.repo_id,
self.source.commit_sha or "",
self.source.path,
self.metadata.get("symbol_id", "") or self.metadata.get("module_id", ""),
self.title,
span_key,
]
)
self.doc_id = sha256(raw.encode("utf-8")).hexdigest()
return self.doc_id
def to_record(self) -> dict:
return {
"doc_id": self.ensure_doc_id(),
"layer": self.layer,
"lang": self.lang,
"repo_id": self.source.repo_id,
"commit_sha": self.source.commit_sha,
"path": self.source.path,
"title": self.title,
"text": self.text,
"metadata": dict(self.metadata),
"links": [link.to_dict() for link in self.links],
"span_start": self.span.start_line if self.span else None,
"span_end": self.span.end_line if self.span else None,
"embedding": self.embedding or [],
}