Новый RAG

This commit is contained in:
2026-03-01 14:21:33 +03:00
parent 2728c07ba9
commit 1ef0b4d68c
95 changed files with 3145 additions and 927 deletions

View File

@@ -0,0 +1,77 @@
from __future__ import annotations
from dataclasses import dataclass
from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker
@dataclass(slots=True)
class SectionChunk:
section_path: str
section_title: str
content: str
order: int
class MarkdownDocChunker:
def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
self._fallback = text_chunker or DocTextChunker()
def chunk(self, text: str) -> list[SectionChunk]:
lines = text.splitlines()
sections: list[SectionChunk] = []
stack: list[tuple[int, str]] = []
current_title = "Document"
current_lines: list[str] = []
order = 0
for line in lines:
heading = self._heading(line)
if heading is None:
current_lines.append(line)
continue
self._flush_section(sections, stack, current_title, current_lines, order)
order += 1
level, title = heading
stack = [item for item in stack if item[0] < level]
stack.append((level, title))
current_title = title
current_lines = []
self._flush_section(sections, stack, current_title, current_lines, order)
if sections:
return sections
chunks = self._fallback.split(text)
return [
SectionChunk(section_path="Document", section_title="Document", content=chunk, order=index)
for index, chunk in enumerate(chunks)
]
def _flush_section(
self,
sections: list[SectionChunk],
stack: list[tuple[int, str]],
current_title: str,
current_lines: list[str],
order: int,
) -> None:
content = "\n".join(current_lines).strip()
if not content:
return
titles = [title for _, title in stack] or [current_title]
sections.append(
SectionChunk(
section_path=" > ".join(titles),
section_title=titles[-1],
content=content,
order=order,
)
)
def _heading(self, line: str) -> tuple[int, str] | None:
stripped = line.strip()
if not stripped.startswith("#"):
return None
level = len(stripped) - len(stripped.lstrip("#"))
title = stripped[level:].strip()
if not title:
return None
return level, title

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
class DocTextChunker:
    """Sliding-window character chunker with overlap between adjacent chunks."""

    def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None:
        """Configure chunk size and overlap.

        Raises:
            ValueError: if max_chars is not positive, or overlap_chars is
                negative or >= max_chars (which would make `split` loop
                forever because the window could never advance).
        """
        if max_chars <= 0:
            raise ValueError("max_chars must be positive")
        if not 0 <= overlap_chars < max_chars:
            raise ValueError("overlap_chars must be in [0, max_chars)")
        self._max_chars = max_chars
        self._overlap_chars = overlap_chars

    def split(self, text: str) -> list[str]:
        """Split *text* into stripped chunks of at most max_chars characters.

        Adjacent chunks share overlap_chars characters of context; empty
        chunks (all-whitespace windows) are dropped from the result.
        """
        cleaned = text.strip()
        if not cleaned:
            return []
        chunks: list[str] = []
        total = len(cleaned)
        start = 0
        while start < total:
            end = min(total, start + self._max_chars)
            chunks.append(cleaned[start:end].strip())
            if end >= total:
                break
            # Step back by the overlap so adjacent chunks share context.
            # end == start + max_chars here, so end > overlap_chars always.
            start = end - self._overlap_chars
        return [chunk for chunk in chunks if chunk]

View File

@@ -0,0 +1,18 @@
from __future__ import annotations
from pathlib import PurePosixPath
from app.modules.rag.contracts import DocKind
class DocsClassifier:
    """Derive a coarse document kind from a repository file path."""

    def classify(self, path: str) -> str:
        """Map *path* to a DocKind value using file-name and path markers."""
        file_name = PurePosixPath(path).name.upper()
        path_lower = path.lower()
        # Checked in priority order: runbooks win over READMEs and specs.
        if "runbook" in path_lower or file_name.startswith("RUNBOOK"):
            return DocKind.RUNBOOK
        if file_name.startswith("README"):
            return DocKind.README
        if any(marker in path_lower for marker in ("spec", "architecture")):
            return DocKind.SPEC
        return DocKind.MISC

View File

@@ -0,0 +1,115 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import SectionChunk
class DocsDocumentBuilder:
    """Construct RagDocument payloads for each docs RAG layer."""

    # Link predicates copied verbatim from frontmatter `links` into
    # module-catalog metadata (missing predicates default to []).
    _LINK_PREDICATES = (
        "calls_api", "called_by", "uses_logic", "used_by", "reads_db", "writes_db",
        "integrates_with", "emits_events", "consumes_events",
    )

    def build_module_catalog(self, source: RagSource, frontmatter: dict, summary_text: str, doc_kind: str) -> RagDocument | None:
        """Build the module-catalog document for a doc file.

        Returns None unless the frontmatter declares non-empty id, type
        and domain fields.
        """
        module_id = str(frontmatter.get("id") or "").strip()
        module_type = str(frontmatter.get("type") or "").strip()
        domain = str(frontmatter.get("domain") or "").strip()
        if not module_id or not module_type or not domain:
            return None
        links = frontmatter.get("links")
        if not isinstance(links, dict):
            # Malformed frontmatter (scalar or list `links`) would otherwise
            # crash on links.get() below; degrade to an empty mapping.
            links = {}
        metadata = {
            "module_id": module_id,
            "type": module_type,
            "domain": domain,
            "status": frontmatter.get("status"),
            "version": frontmatter.get("version"),
            "tags": frontmatter.get("tags") or [],
            "owners": frontmatter.get("owners") or [],
            "links": links,
            "source_path": source.path,
            "summary_text": summary_text[:4000],
            "doc_kind": doc_kind,
        }
        metadata.update({name: links.get(name, []) for name in self._LINK_PREDICATES})
        return RagDocument(
            layer=RagLayer.DOCS_MODULE_CATALOG,
            source=source,
            title=module_id,
            text=summary_text[:4000] or module_id,
            metadata=metadata,
        )

    def build_section(self, source: RagSource, chunk: SectionChunk, frontmatter: dict, doc_kind: str) -> RagDocument:
        """Build one section-index document for a markdown section chunk."""
        # Fall back to the file path as the subject when frontmatter has no id.
        module_id = str(frontmatter.get("id") or source.path)
        metadata = {
            "module_id": module_id,
            "type": frontmatter.get("type"),
            "domain": frontmatter.get("domain"),
            "tags": frontmatter.get("tags") or [],
            "section_path": chunk.section_path,
            "section_title": chunk.section_title,
            "order": chunk.order,
            "doc_kind": doc_kind,
            "source_path": source.path,
            "artifact_type": "DOCS",
        }
        return RagDocument(
            layer=RagLayer.DOCS_SECTION_INDEX,
            source=source,
            title=f"{module_id}:{chunk.section_title}",
            text=chunk.content,
            metadata=metadata,
        )

    def build_policy(self, source: RagSource, frontmatter: dict, chunk: SectionChunk, doc_kind: str) -> RagDocument | None:
        """Build a policy-index document; returns None without a frontmatter id."""
        policy_id = str(frontmatter.get("id") or "").strip()
        applies_to = frontmatter.get("applies_to") or frontmatter.get("type")
        if not policy_id:
            return None
        metadata = {
            "policy_id": policy_id,
            "applies_to": applies_to,
            "rules": chunk.content[:4000],
            "default_behaviors": frontmatter.get("default_behaviors") or [],
            "doc_kind": doc_kind,
            "section_path": chunk.section_path,
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_POLICY_INDEX,
            source=source,
            title=policy_id,
            text=chunk.content[:4000],
            metadata=metadata,
        )

    def build_fact(
        self,
        source: RagSource,
        *,
        subject_id: str,
        predicate: str,
        obj: str,
        object_ref: str | None,
        anchor: str,
        tags: list[str] | None = None,
    ) -> RagDocument:
        """Build a fact-index document for one (subject, predicate, object) triple.

        The fact id is a content hash over the triple plus its location, so
        re-indexing the same file yields stable ids.
        """
        fact_id = sha256(f"{subject_id}|{predicate}|{obj}|{source.path}|{anchor}".encode("utf-8")).hexdigest()
        metadata = {
            "fact_id": fact_id,
            "subject_id": subject_id,
            "predicate": predicate,
            "object": obj,
            "object_ref": object_ref,
            "anchor": anchor,
            "tags": tags or [],
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_FACT_INDEX,
            source=source,
            title=f"{subject_id}:{predicate}",
            text=f"{subject_id} {predicate} {obj}".strip(),
            metadata=metadata,
            links=[EvidenceLink(type=EvidenceType.DOC_FACT, target_id=fact_id, path=source.path, note=anchor)],
        )

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
from pathlib import PurePosixPath
class DocsFileFilter:
    """Decide whether a repository path looks like human documentation."""

    _EXTENSIONS = {".md", ".rst", ".txt", ".adoc"}
    _NAMES = ("README", "CHANGELOG", "CONTRIBUTING", "ARCHITECTURE", "SPEC", "RUNBOOK")
    _EXCLUDE_PARTS = {"node_modules", ".git", "vendor", "dist", "build", "target", "__pycache__"}

    def should_index(self, path: str) -> bool:
        """Return True when *path* should go through the docs pipeline."""
        candidate = PurePosixPath(path)
        # Anything under a vendored/generated directory is skipped outright.
        for part in candidate.parts:
            if part in self._EXCLUDE_PARTS:
                return False
        if candidate.suffix.lower() in self._EXTENSIONS:
            return True
        # str.startswith accepts a tuple of prefixes, so one call covers
        # every well-known documentation file name.
        if candidate.name.upper().startswith(self._NAMES):
            return True
        # Last resort: the file lives under a documentation directory.
        lowered = "/".join(candidate.parts).lower()
        return any(marker in lowered for marker in ("docs/", "doc/", "documentation/"))

View File

@@ -0,0 +1,135 @@
from __future__ import annotations
import re
import yaml
from app.modules.rag.contracts import RagDocument, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker
from app.modules.rag.indexing.docs.classifier import DocsClassifier
from app.modules.rag.indexing.docs.document_builder import DocsDocumentBuilder
from app.modules.rag.indexing.docs.file_filter import DocsFileFilter
class DocsIndexingPipeline:
    """Turn one documentation file into RagDocument records.

    Orchestrates file filtering, YAML frontmatter parsing, markdown
    chunking, classification and fact extraction for the docs RAG layers.
    """

    def __init__(self) -> None:
        self._filter = DocsFileFilter()
        self._classifier = DocsClassifier()
        self._chunker = MarkdownDocChunker()
        self._builder = DocsDocumentBuilder()

    def supports(self, path: str) -> bool:
        """Return True when *path* looks like an indexable doc file."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Build every RAG document (catalog, sections, policy, facts) for one file."""
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        frontmatter, body = self._split_frontmatter(content)
        doc_kind = self._classifier.classify(path)
        sections = self._chunker.chunk(body)
        summary_text = self._summary_from_sections(sections)
        docs: list[RagDocument] = []
        module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind)
        if module_doc is not None:
            docs.append(module_doc)
        for section in sections:
            docs.append(self._builder.build_section(source, section, frontmatter, doc_kind))
        if str(frontmatter.get("type") or "").strip() == "policy":
            # Only the first section is indexed as the policy rules.
            for section in sections[:1]:
                policy = self._builder.build_policy(source, frontmatter, section, doc_kind)
                if policy is not None:
                    docs.append(policy)
        docs.extend(self._extract_facts(source, frontmatter, sections))
        return docs

    def _split_frontmatter(self, content: str) -> tuple[dict, str]:
        """Split YAML frontmatter from the body.

        Malformed input (opening '---' without a closing marker, invalid
        YAML, or a non-mapping payload) degrades to an empty frontmatter
        instead of raising.
        """
        if not content.startswith("---\n"):
            return {}, content
        parts = content.split("---", 2)
        if len(parts) < 3:
            # Opening marker without a closing one: treat as plain body.
            return {}, content
        _, raw, body = parts
        try:
            payload = yaml.safe_load(raw) or {}
        except yaml.YAMLError:
            return {}, content
        return payload if isinstance(payload, dict) else {}, body.strip()

    def _summary_from_sections(self, sections) -> str:
        """Join the first two sections into a bounded summary string."""
        text = "\n\n".join(section.content for section in sections[:2]).strip()
        return text[:4000]

    def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]:
        """Emit fact documents from frontmatter links plus section tables/lists."""
        subject_id = str(frontmatter.get("id") or source.path)
        docs: list[RagDocument] = []
        links = frontmatter.get("links")
        if not isinstance(links, dict):
            # `links` must be a predicate -> values mapping; ignore anything else.
            links = {}
        for predicate, values in links.items():
            if isinstance(values, str):
                # Tolerate a scalar link value; iterating the string directly
                # would emit one bogus fact per character.
                values = [values]
            for value in values or []:
                docs.append(
                    self._builder.build_fact(
                        source,
                        subject_id=subject_id,
                        predicate=predicate,
                        obj=str(value),
                        object_ref=str(value),
                        anchor="frontmatter.links",
                    )
                )
        for section in sections:
            docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content))
            docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content))
        return docs

    def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Extract facts from a markdown pipe table inside a section."""
        lines = [line.strip() for line in content.splitlines() if line.strip()]
        # Need at least header, separator and one data row.
        if len(lines) < 3 or "|" not in lines[0]:
            return []
        headers = [part.strip().lower() for part in lines[0].strip("|").split("|")]
        if not all(headers):
            return []
        docs: list[RagDocument] = []
        # lines[1] is assumed to be the |---|---| separator row and is skipped.
        for row in lines[2:]:
            if "|" not in row:
                continue
            values = [part.strip() for part in row.strip("|").split("|")]
            if len(values) != len(headers):
                continue
            payload = dict(zip(headers, values))
            docs.extend(self._facts_from_row(source, subject_id, title, payload))
        return docs

    def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]:
        """Turn one table row into facts, keyed on the section title's topic."""
        docs: list[RagDocument] = []
        name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code")
        if "request" in title.lower() or "response" in title.lower():
            if name:
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_field", obj=name, object_ref=None, anchor=title))
                if payload.get("required"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_required", obj=f"{name}:{payload['required']}", object_ref=None, anchor=title))
                if payload.get("type"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_type", obj=f"{name}:{payload['type']}", object_ref=None, anchor=title))
                if payload.get("validation"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_validation", obj=f"{name}:{payload['validation']}", object_ref=None, anchor=title))
        if "error" in title.lower():
            if payload.get("status"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_status", obj=payload["status"], object_ref=None, anchor=title))
            if payload.get("error") or payload.get("code"):
                error_value = payload.get("error") or payload.get("code")
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_error", obj=error_value, object_ref=None, anchor=title))
            if payload.get("client action"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="client_action", obj=payload["client action"], object_ref=None, anchor=title))
        if "constraint" in title.lower() and name:
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_constraint", obj=name, object_ref=None, anchor=title))
        return docs

    def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Extract facts from bullet / numbered list items in a section."""
        docs: list[RagDocument] = []
        for line in content.splitlines():
            item = line.strip()
            if not item.startswith(("-", "*", "1.", "2.", "3.")):
                continue
            normalized = re.sub(r"^[-*0-9. ]+", "", item).strip()
            if not normalized:
                # Horizontal rules ("---") and bare markers carry no fact.
                continue
            lowered = normalized.lower()
            if lowered.startswith("metric:"):
                predicate = "emits_metric"
            elif lowered.startswith("event:"):
                predicate = "emits_analytics_event"
            elif lowered.startswith("log:"):
                predicate = "logs_event"
            else:
                predicate = "validates_rule" if "rule" in title.lower() else "client_action"
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title))
        return docs