78 lines
2.1 KiB
Python
78 lines
2.1 KiB
Python
from __future__ import annotations
|
|
|
|
from dataclasses import asdict, dataclass, field
|
|
from hashlib import sha256
|
|
|
|
from app.modules.rag.contracts.evidence import EvidenceLink
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class RagSource:
|
|
repo_id: str
|
|
commit_sha: str | None
|
|
path: str
|
|
|
|
def to_dict(self) -> dict:
|
|
return asdict(self)
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class RagSpan:
|
|
start_line: int | None = None
|
|
end_line: int | None = None
|
|
|
|
def to_dict(self) -> dict:
|
|
return asdict(self)
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class RagDocument:
|
|
layer: str
|
|
source: RagSource
|
|
title: str
|
|
text: str
|
|
metadata: dict = field(default_factory=dict)
|
|
links: list[EvidenceLink] = field(default_factory=list)
|
|
span: RagSpan | None = None
|
|
doc_id: str | None = None
|
|
lang: str | None = None
|
|
embedding: list[float] | None = None
|
|
|
|
def ensure_doc_id(self) -> str:
|
|
if self.doc_id:
|
|
return self.doc_id
|
|
span_key = ""
|
|
if self.span is not None:
|
|
span_key = f":{self.span.start_line}:{self.span.end_line}"
|
|
raw = "|".join(
|
|
[
|
|
self.layer,
|
|
self.lang or "",
|
|
self.source.repo_id,
|
|
self.source.commit_sha or "",
|
|
self.source.path,
|
|
self.metadata.get("symbol_id", "") or self.metadata.get("module_id", ""),
|
|
self.title,
|
|
span_key,
|
|
]
|
|
)
|
|
self.doc_id = sha256(raw.encode("utf-8")).hexdigest()
|
|
return self.doc_id
|
|
|
|
def to_record(self) -> dict:
|
|
return {
|
|
"doc_id": self.ensure_doc_id(),
|
|
"layer": self.layer,
|
|
"lang": self.lang,
|
|
"repo_id": self.source.repo_id,
|
|
"commit_sha": self.source.commit_sha,
|
|
"path": self.source.path,
|
|
"title": self.title,
|
|
"text": self.text,
|
|
"metadata": dict(self.metadata),
|
|
"links": [link.to_dict() for link in self.links],
|
|
"span_start": self.span.start_line if self.span else None,
|
|
"span_end": self.span.end_line if self.span else None,
|
|
"embedding": self.embedding or [],
|
|
}
|