Новый RAG

This commit is contained in:
2026-03-01 14:21:33 +03:00
parent 2728c07ba9
commit 1ef0b4d68c
95 changed files with 3145 additions and 927 deletions

View File

@@ -0,0 +1,77 @@
from __future__ import annotations
from dataclasses import dataclass
from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker
@dataclass(slots=True)
class SectionChunk:
section_path: str
section_title: str
content: str
order: int
class MarkdownDocChunker:
def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
self._fallback = text_chunker or DocTextChunker()
def chunk(self, text: str) -> list[SectionChunk]:
lines = text.splitlines()
sections: list[SectionChunk] = []
stack: list[tuple[int, str]] = []
current_title = "Document"
current_lines: list[str] = []
order = 0
for line in lines:
heading = self._heading(line)
if heading is None:
current_lines.append(line)
continue
self._flush_section(sections, stack, current_title, current_lines, order)
order += 1
level, title = heading
stack = [item for item in stack if item[0] < level]
stack.append((level, title))
current_title = title
current_lines = []
self._flush_section(sections, stack, current_title, current_lines, order)
if sections:
return sections
chunks = self._fallback.split(text)
return [
SectionChunk(section_path="Document", section_title="Document", content=chunk, order=index)
for index, chunk in enumerate(chunks)
]
def _flush_section(
self,
sections: list[SectionChunk],
stack: list[tuple[int, str]],
current_title: str,
current_lines: list[str],
order: int,
) -> None:
content = "\n".join(current_lines).strip()
if not content:
return
titles = [title for _, title in stack] or [current_title]
sections.append(
SectionChunk(
section_path=" > ".join(titles),
section_title=titles[-1],
content=content,
order=order,
)
)
def _heading(self, line: str) -> tuple[int, str] | None:
stripped = line.strip()
if not stripped.startswith("#"):
return None
level = len(stripped) - len(stripped.lstrip("#"))
title = stripped[level:].strip()
if not title:
return None
return level, title

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
class DocTextChunker:
    """Sliding-window character chunker with overlap between adjacent chunks."""

    def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None:
        """Configure chunk size and overlap.

        Raises:
            ValueError: if max_chars is not positive, or overlap_chars is
                negative or >= max_chars (which would make `split` loop
                forever because the window could never advance).
        """
        if max_chars <= 0:
            raise ValueError("max_chars must be positive")
        if not 0 <= overlap_chars < max_chars:
            raise ValueError("overlap_chars must be in [0, max_chars)")
        self._max_chars = max_chars
        self._overlap_chars = overlap_chars

    def split(self, text: str) -> list[str]:
        """Split *text* into stripped chunks of at most max_chars characters.

        Adjacent chunks share overlap_chars characters of context; empty
        chunks (all-whitespace windows) are dropped from the result.
        """
        cleaned = text.strip()
        if not cleaned:
            return []
        chunks: list[str] = []
        total = len(cleaned)
        start = 0
        while start < total:
            end = min(total, start + self._max_chars)
            chunks.append(cleaned[start:end].strip())
            if end >= total:
                break
            # Step back by the overlap so adjacent chunks share context.
            # end == start + max_chars here, so end > overlap_chars always.
            start = end - self._overlap_chars
        return [chunk for chunk in chunks if chunk]

View File

@@ -0,0 +1,18 @@
from __future__ import annotations
from pathlib import PurePosixPath
from app.modules.rag.contracts import DocKind
class DocsClassifier:
    """Derive a coarse document kind from a repository file path."""

    def classify(self, path: str) -> str:
        """Map *path* to a DocKind value using file-name and path markers."""
        file_name = PurePosixPath(path).name.upper()
        path_lower = path.lower()
        # Checked in priority order: runbooks win over READMEs and specs.
        if "runbook" in path_lower or file_name.startswith("RUNBOOK"):
            return DocKind.RUNBOOK
        if file_name.startswith("README"):
            return DocKind.README
        if any(marker in path_lower for marker in ("spec", "architecture")):
            return DocKind.SPEC
        return DocKind.MISC

View File

@@ -0,0 +1,115 @@
from __future__ import annotations
from hashlib import sha256
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import SectionChunk
class DocsDocumentBuilder:
    """Construct RagDocument payloads for each docs RAG layer."""

    # Link predicates copied verbatim from frontmatter `links` into
    # module-catalog metadata (missing predicates default to []).
    _LINK_PREDICATES = (
        "calls_api", "called_by", "uses_logic", "used_by", "reads_db", "writes_db",
        "integrates_with", "emits_events", "consumes_events",
    )

    def build_module_catalog(self, source: RagSource, frontmatter: dict, summary_text: str, doc_kind: str) -> RagDocument | None:
        """Build the module-catalog document for a doc file.

        Returns None unless the frontmatter declares non-empty id, type
        and domain fields.
        """
        module_id = str(frontmatter.get("id") or "").strip()
        module_type = str(frontmatter.get("type") or "").strip()
        domain = str(frontmatter.get("domain") or "").strip()
        if not module_id or not module_type or not domain:
            return None
        links = frontmatter.get("links")
        if not isinstance(links, dict):
            # Malformed frontmatter (scalar or list `links`) would otherwise
            # crash on links.get() below; degrade to an empty mapping.
            links = {}
        metadata = {
            "module_id": module_id,
            "type": module_type,
            "domain": domain,
            "status": frontmatter.get("status"),
            "version": frontmatter.get("version"),
            "tags": frontmatter.get("tags") or [],
            "owners": frontmatter.get("owners") or [],
            "links": links,
            "source_path": source.path,
            "summary_text": summary_text[:4000],
            "doc_kind": doc_kind,
        }
        metadata.update({name: links.get(name, []) for name in self._LINK_PREDICATES})
        return RagDocument(
            layer=RagLayer.DOCS_MODULE_CATALOG,
            source=source,
            title=module_id,
            text=summary_text[:4000] or module_id,
            metadata=metadata,
        )

    def build_section(self, source: RagSource, chunk: SectionChunk, frontmatter: dict, doc_kind: str) -> RagDocument:
        """Build one section-index document for a markdown section chunk."""
        # Fall back to the file path as the subject when frontmatter has no id.
        module_id = str(frontmatter.get("id") or source.path)
        metadata = {
            "module_id": module_id,
            "type": frontmatter.get("type"),
            "domain": frontmatter.get("domain"),
            "tags": frontmatter.get("tags") or [],
            "section_path": chunk.section_path,
            "section_title": chunk.section_title,
            "order": chunk.order,
            "doc_kind": doc_kind,
            "source_path": source.path,
            "artifact_type": "DOCS",
        }
        return RagDocument(
            layer=RagLayer.DOCS_SECTION_INDEX,
            source=source,
            title=f"{module_id}:{chunk.section_title}",
            text=chunk.content,
            metadata=metadata,
        )

    def build_policy(self, source: RagSource, frontmatter: dict, chunk: SectionChunk, doc_kind: str) -> RagDocument | None:
        """Build a policy-index document; returns None without a frontmatter id."""
        policy_id = str(frontmatter.get("id") or "").strip()
        applies_to = frontmatter.get("applies_to") or frontmatter.get("type")
        if not policy_id:
            return None
        metadata = {
            "policy_id": policy_id,
            "applies_to": applies_to,
            "rules": chunk.content[:4000],
            "default_behaviors": frontmatter.get("default_behaviors") or [],
            "doc_kind": doc_kind,
            "section_path": chunk.section_path,
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_POLICY_INDEX,
            source=source,
            title=policy_id,
            text=chunk.content[:4000],
            metadata=metadata,
        )

    def build_fact(
        self,
        source: RagSource,
        *,
        subject_id: str,
        predicate: str,
        obj: str,
        object_ref: str | None,
        anchor: str,
        tags: list[str] | None = None,
    ) -> RagDocument:
        """Build a fact-index document for one (subject, predicate, object) triple.

        The fact id is a content hash over the triple plus its location, so
        re-indexing the same file yields stable ids.
        """
        fact_id = sha256(f"{subject_id}|{predicate}|{obj}|{source.path}|{anchor}".encode("utf-8")).hexdigest()
        metadata = {
            "fact_id": fact_id,
            "subject_id": subject_id,
            "predicate": predicate,
            "object": obj,
            "object_ref": object_ref,
            "anchor": anchor,
            "tags": tags or [],
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_FACT_INDEX,
            source=source,
            title=f"{subject_id}:{predicate}",
            text=f"{subject_id} {predicate} {obj}".strip(),
            metadata=metadata,
            links=[EvidenceLink(type=EvidenceType.DOC_FACT, target_id=fact_id, path=source.path, note=anchor)],
        )

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
from pathlib import PurePosixPath
class DocsFileFilter:
    """Decide whether a repository path looks like human documentation."""

    _EXTENSIONS = {".md", ".rst", ".txt", ".adoc"}
    _NAMES = ("README", "CHANGELOG", "CONTRIBUTING", "ARCHITECTURE", "SPEC", "RUNBOOK")
    _EXCLUDE_PARTS = {"node_modules", ".git", "vendor", "dist", "build", "target", "__pycache__"}

    def should_index(self, path: str) -> bool:
        """Return True when *path* should go through the docs pipeline."""
        candidate = PurePosixPath(path)
        # Anything under a vendored/generated directory is skipped outright.
        for part in candidate.parts:
            if part in self._EXCLUDE_PARTS:
                return False
        if candidate.suffix.lower() in self._EXTENSIONS:
            return True
        # str.startswith accepts a tuple of prefixes, so one call covers
        # every well-known documentation file name.
        if candidate.name.upper().startswith(self._NAMES):
            return True
        # Last resort: the file lives under a documentation directory.
        lowered = "/".join(candidate.parts).lower()
        return any(marker in lowered for marker in ("docs/", "doc/", "documentation/"))

View File

@@ -0,0 +1,135 @@
from __future__ import annotations
import re
import yaml
from app.modules.rag.contracts import RagDocument, RagSource
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker
from app.modules.rag.indexing.docs.classifier import DocsClassifier
from app.modules.rag.indexing.docs.document_builder import DocsDocumentBuilder
from app.modules.rag.indexing.docs.file_filter import DocsFileFilter
class DocsIndexingPipeline:
    """Turn one documentation file into RagDocument records.

    Orchestrates file filtering, YAML frontmatter parsing, markdown
    chunking, classification and fact extraction for the docs RAG layers.
    """

    def __init__(self) -> None:
        self._filter = DocsFileFilter()
        self._classifier = DocsClassifier()
        self._chunker = MarkdownDocChunker()
        self._builder = DocsDocumentBuilder()

    def supports(self, path: str) -> bool:
        """Return True when *path* looks like an indexable doc file."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Build every RAG document (catalog, sections, policy, facts) for one file."""
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        frontmatter, body = self._split_frontmatter(content)
        doc_kind = self._classifier.classify(path)
        sections = self._chunker.chunk(body)
        summary_text = self._summary_from_sections(sections)
        docs: list[RagDocument] = []
        module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind)
        if module_doc is not None:
            docs.append(module_doc)
        for section in sections:
            docs.append(self._builder.build_section(source, section, frontmatter, doc_kind))
        if str(frontmatter.get("type") or "").strip() == "policy":
            # Only the first section is indexed as the policy rules.
            for section in sections[:1]:
                policy = self._builder.build_policy(source, frontmatter, section, doc_kind)
                if policy is not None:
                    docs.append(policy)
        docs.extend(self._extract_facts(source, frontmatter, sections))
        return docs

    def _split_frontmatter(self, content: str) -> tuple[dict, str]:
        """Split YAML frontmatter from the body.

        Malformed input (opening '---' without a closing marker, invalid
        YAML, or a non-mapping payload) degrades to an empty frontmatter
        instead of raising.
        """
        if not content.startswith("---\n"):
            return {}, content
        parts = content.split("---", 2)
        if len(parts) < 3:
            # Opening marker without a closing one: treat as plain body.
            return {}, content
        _, raw, body = parts
        try:
            payload = yaml.safe_load(raw) or {}
        except yaml.YAMLError:
            return {}, content
        return payload if isinstance(payload, dict) else {}, body.strip()

    def _summary_from_sections(self, sections) -> str:
        """Join the first two sections into a bounded summary string."""
        text = "\n\n".join(section.content for section in sections[:2]).strip()
        return text[:4000]

    def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]:
        """Emit fact documents from frontmatter links plus section tables/lists."""
        subject_id = str(frontmatter.get("id") or source.path)
        docs: list[RagDocument] = []
        links = frontmatter.get("links")
        if not isinstance(links, dict):
            # `links` must be a predicate -> values mapping; ignore anything else.
            links = {}
        for predicate, values in links.items():
            if isinstance(values, str):
                # Tolerate a scalar link value; iterating the string directly
                # would emit one bogus fact per character.
                values = [values]
            for value in values or []:
                docs.append(
                    self._builder.build_fact(
                        source,
                        subject_id=subject_id,
                        predicate=predicate,
                        obj=str(value),
                        object_ref=str(value),
                        anchor="frontmatter.links",
                    )
                )
        for section in sections:
            docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content))
            docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content))
        return docs

    def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Extract facts from a markdown pipe table inside a section."""
        lines = [line.strip() for line in content.splitlines() if line.strip()]
        # Need at least header, separator and one data row.
        if len(lines) < 3 or "|" not in lines[0]:
            return []
        headers = [part.strip().lower() for part in lines[0].strip("|").split("|")]
        if not all(headers):
            return []
        docs: list[RagDocument] = []
        # lines[1] is assumed to be the |---|---| separator row and is skipped.
        for row in lines[2:]:
            if "|" not in row:
                continue
            values = [part.strip() for part in row.strip("|").split("|")]
            if len(values) != len(headers):
                continue
            payload = dict(zip(headers, values))
            docs.extend(self._facts_from_row(source, subject_id, title, payload))
        return docs

    def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]:
        """Turn one table row into facts, keyed on the section title's topic."""
        docs: list[RagDocument] = []
        name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code")
        if "request" in title.lower() or "response" in title.lower():
            if name:
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_field", obj=name, object_ref=None, anchor=title))
                if payload.get("required"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_required", obj=f"{name}:{payload['required']}", object_ref=None, anchor=title))
                if payload.get("type"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_type", obj=f"{name}:{payload['type']}", object_ref=None, anchor=title))
                if payload.get("validation"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_validation", obj=f"{name}:{payload['validation']}", object_ref=None, anchor=title))
        if "error" in title.lower():
            if payload.get("status"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_status", obj=payload["status"], object_ref=None, anchor=title))
            if payload.get("error") or payload.get("code"):
                error_value = payload.get("error") or payload.get("code")
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_error", obj=error_value, object_ref=None, anchor=title))
            if payload.get("client action"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="client_action", obj=payload["client action"], object_ref=None, anchor=title))
        if "constraint" in title.lower() and name:
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_constraint", obj=name, object_ref=None, anchor=title))
        return docs

    def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Extract facts from bullet / numbered list items in a section."""
        docs: list[RagDocument] = []
        for line in content.splitlines():
            item = line.strip()
            if not item.startswith(("-", "*", "1.", "2.", "3.")):
                continue
            normalized = re.sub(r"^[-*0-9. ]+", "", item).strip()
            if not normalized:
                # Horizontal rules ("---") and bare markers carry no fact.
                continue
            lowered = normalized.lower()
            if lowered.startswith("metric:"):
                predicate = "emits_metric"
            elif lowered.startswith("event:"):
                predicate = "emits_analytics_event"
            elif lowered.startswith("log:"):
                predicate = "logs_event"
            else:
                predicate = "validates_rule" if "rule" in title.lower() else "client_action"
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title))
        return docs