Новый раг

This commit is contained in:
2026-03-01 14:21:33 +03:00
parent 2728c07ba9
commit 1ef0b4d68c
95 changed files with 3145 additions and 927 deletions

View File

@@ -0,0 +1,17 @@
from app.modules.rag.contracts.documents import RagDocument, RagSource, RagSpan
from app.modules.rag.contracts.enums import DocKind, EvidenceType, RagLayer, RetrievalMode
from app.modules.rag.contracts.evidence import EvidenceLink
from app.modules.rag.contracts.retrieval import RetrievalItem, RetrievalQuery
# Names exported by this module; every entry is imported above from one of
# the app.modules.rag.contracts submodules (documents, enums, evidence,
# retrieval).
__all__ = [
"DocKind",
"EvidenceLink",
"EvidenceType",
"RagDocument",
"RagLayer",
"RagSource",
"RagSpan",
"RetrievalItem",
"RetrievalMode",
"RetrievalQuery",
]

View File

@@ -0,0 +1,77 @@
from __future__ import annotations
from dataclasses import asdict, dataclass, field
from hashlib import sha256
from app.modules.rag.contracts.evidence import EvidenceLink
@dataclass(slots=True)
class RagSource:
repo_id: str
commit_sha: str | None
path: str
def to_dict(self) -> dict:
return asdict(self)
@dataclass(slots=True)
class RagSpan:
start_line: int | None = None
end_line: int | None = None
def to_dict(self) -> dict:
return asdict(self)
@dataclass(slots=True)
class RagDocument:
layer: str
source: RagSource
title: str
text: str
metadata: dict = field(default_factory=dict)
links: list[EvidenceLink] = field(default_factory=list)
span: RagSpan | None = None
doc_id: str | None = None
lang: str | None = None
embedding: list[float] | None = None
def ensure_doc_id(self) -> str:
if self.doc_id:
return self.doc_id
span_key = ""
if self.span is not None:
span_key = f":{self.span.start_line}:{self.span.end_line}"
raw = "|".join(
[
self.layer,
self.lang or "",
self.source.repo_id,
self.source.commit_sha or "",
self.source.path,
self.metadata.get("symbol_id", "") or self.metadata.get("module_id", ""),
self.title,
span_key,
]
)
self.doc_id = sha256(raw.encode("utf-8")).hexdigest()
return self.doc_id
def to_record(self) -> dict:
return {
"doc_id": self.ensure_doc_id(),
"layer": self.layer,
"lang": self.lang,
"repo_id": self.source.repo_id,
"commit_sha": self.source.commit_sha,
"path": self.source.path,
"title": self.title,
"text": self.text,
"metadata": dict(self.metadata),
"links": [link.to_dict() for link in self.links],
"span_start": self.span.start_line if self.span else None,
"span_end": self.span.end_line if self.span else None,
"embedding": self.embedding or [],
}

View File

@@ -0,0 +1,35 @@
from __future__ import annotations
class RagLayer:
    """String constants naming the RAG layers (plain class attributes)."""

    # Layers whose constant names start with DOCS_ (values prefixed D1..D4).
    DOCS_MODULE_CATALOG = "D1_MODULE_CATALOG"
    DOCS_FACT_INDEX = "D2_FACT_INDEX"
    DOCS_SECTION_INDEX = "D3_SECTION_INDEX"
    DOCS_POLICY_INDEX = "D4_POLICY_INDEX"

    # Layers whose constant names start with CODE_ (values prefixed C0..C6).
    CODE_SOURCE_CHUNKS = "C0_SOURCE_CHUNKS"
    CODE_SYMBOL_CATALOG = "C1_SYMBOL_CATALOG"
    CODE_DEPENDENCY_GRAPH = "C2_DEPENDENCY_GRAPH"
    CODE_ENTRYPOINTS = "C3_ENTRYPOINTS"
    CODE_PUBLIC_API = "C4_PUBLIC_API"
    CODE_BEHAVIOR_SUMMARIES = "C5_BEHAVIOR_SUMMARIES"
    CODE_RUNTIME_TRACES = "C6_RUNTIME_TRACES"
class RetrievalMode:
    """String constants for the two retrieval modes: docs and code."""

    DOCS = "docs"
    CODE = "code"
class DocKind:
    """String constants for document kinds: spec, runbook, readme, misc."""

    SPEC = "spec"
    RUNBOOK = "runbook"
    README = "readme"
    MISC = "misc"
class EvidenceType:
    """String constants for evidence types (plain class attributes)."""

    # Code-side evidence types.
    CODE_SPAN = "code_span"
    SYMBOL = "symbol"
    EDGE = "edge"

    # Documentation-side evidence types.
    DOC_SECTION = "doc_section"
    DOC_FACT = "doc_fact"

View File

@@ -0,0 +1,16 @@
from __future__ import annotations
from dataclasses import asdict, dataclass
@dataclass(slots=True)
class EvidenceLink:
type: str
target_id: str
path: str | None = None
start_line: int | None = None
end_line: int | None = None
note: str | None = None
def to_dict(self) -> dict:
return asdict(self)

View File

@@ -0,0 +1,23 @@
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass(slots=True)
class RetrievalQuery:
text: str
mode: str
limit: int = 5
layers: list[str] = field(default_factory=list)
path_prefixes: list[str] = field(default_factory=list)
doc_kind: str | None = None
@dataclass(slots=True)
class RetrievalItem:
content: str
path: str
layer: str
title: str
score: float | None = None
metadata: dict | None = None