Новый раг

2026-03-01 14:21:33 +03:00
parent 2728c07ba9
commit 1ef0b4d68c
95 changed files with 3145 additions and 927 deletions
--- a/app/modules/rag/indexing/docs/chunkers/__pycache__/markdown_chunker.cpython-312.pyc
+++ b/app/modules/rag/indexing/docs/chunkers/__pycache__/markdown_chunker.cpython-312.pyc
--- a/app/modules/rag/indexing/docs/chunkers/__pycache__/text_chunker.cpython-312.pyc
+++ b/app/modules/rag/indexing/docs/chunkers/__pycache__/text_chunker.cpython-312.pyc
--- a/app/modules/rag/indexing/docs/chunkers/markdown_chunker.py
+++ b/app/modules/rag/indexing/docs/chunkers/markdown_chunker.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker
+
+
+@dataclass(slots=True)
+class SectionChunk:
+    """One chunk of a document together with its position in the heading tree.
+
+    Instances are produced by ``MarkdownDocChunker.chunk``.
+    """
+
+    # Titles of the enclosing headings joined with " > ", outermost first;
+    # "Document" when the chunk is not under any heading.
+    section_path: str
+    # Title of the innermost heading (last element of the path), or "Document".
+    section_title: str
+    # Chunk text with leading and trailing whitespace removed.
+    content: str
+    # Position of the chunk in the document; the chunker fills it with a
+    # heading counter, or with the enumeration index on the fallback path.
+    order: int
+
+
+class MarkdownDocChunker:
+    """Split Markdown text into one chunk per heading-delimited section.
+
+    Headings are recognised in ATX form only (one to six ``#`` characters
+    followed by whitespace and a title). Lines inside fenced code blocks are
+    never treated as headings, so ``# comment`` lines in shell or Python
+    samples stay inside the section that contains the sample.
+    """
+
+    # ATX headings use between one and six '#' characters.
+    _MAX_HEADING_LEVEL = 6
+
+    def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
+        """Store the plain-text splitter used when no section can be extracted.
+
+        Args:
+            text_chunker: Splitter exposing ``split(text)``; a default
+                ``DocTextChunker`` is created when omitted.
+        """
+        self._fallback = text_chunker or DocTextChunker()
+
+    def chunk(self, text: str) -> list[SectionChunk]:
+        """Return the sections of ``text`` in document order.
+
+        Each heading starts a new section whose ``section_path`` lists the
+        titles of all enclosing headings. Text that precedes the first heading
+        is emitted under the title ``"Document"``. Sections whose body is
+        empty are skipped. ``order`` is the number of headings seen before the
+        section was emitted, so values may contain gaps.
+
+        When no section with content is found, the whole text is handed to the
+        fallback splitter and every piece is returned as a ``"Document"``
+        chunk numbered by its index.
+
+        Args:
+            text: Markdown source to split.
+
+        Returns:
+            The extracted chunks; empty only if the fallback returns nothing.
+        """
+        sections: list[SectionChunk] = []
+        stack: list[tuple[int, str]] = []
+        current_title = "Document"
+        current_lines: list[str] = []
+        order = 0
+        # Marker of the fenced code block we are currently inside, if any.
+        open_fence: str | None = None
+        for line in text.splitlines():
+            marker = self._fence_marker(line)
+            if open_fence is not None:
+                # Inside a fence everything is body text, including '#' lines.
+                current_lines.append(line)
+                # A fence closes on a bare marker of the same character that
+                # is at least as long as the opening one.
+                if (
+                    marker is not None
+                    and marker[0] == open_fence[0]
+                    and len(marker) >= len(open_fence)
+                    and line.strip() == marker
+                ):
+                    open_fence = None
+                continue
+            if marker is not None:
+                open_fence = marker
+                current_lines.append(line)
+                continue
+            heading = self._heading(line)
+            if heading is None:
+                current_lines.append(line)
+                continue
+            self._flush_section(sections, stack, current_title, current_lines, order)
+            order += 1
+            level, title = heading
+            # Drop siblings and deeper headings so the stack holds ancestors only.
+            stack = [item for item in stack if item[0] < level]
+            stack.append((level, title))
+            current_title = title
+            current_lines = []
+        self._flush_section(sections, stack, current_title, current_lines, order)
+        if sections:
+            return sections
+        chunks = self._fallback.split(text)
+        return [
+            SectionChunk(section_path="Document", section_title="Document", content=chunk, order=index)
+            for index, chunk in enumerate(chunks)
+        ]
+
+    def _flush_section(
+        self,
+        sections: list[SectionChunk],
+        stack: list[tuple[int, str]],
+        current_title: str,
+        current_lines: list[str],
+        order: int,
+    ) -> None:
+        """Append the accumulated lines to ``sections`` as one chunk.
+
+        Nothing is appended when the joined lines are blank. The path is built
+        from the heading stack; ``current_title`` is used only while the stack
+        is empty, i.e. before the first heading.
+        """
+        content = "\n".join(current_lines).strip()
+        if not content:
+            return
+        titles = [title for _, title in stack] or [current_title]
+        sections.append(
+            SectionChunk(
+                section_path=" > ".join(titles),
+                section_title=titles[-1],
+                content=content,
+                order=order,
+            )
+        )
+
+    def _fence_marker(self, line: str) -> str | None:
+        """Return the leading run of backticks or tildes if it is a code fence.
+
+        A fence is a run of at least three identical ``` ` ``` or ``~``
+        characters at the start of the line (leading whitespace ignored).
+        Returns ``None`` for any other line.
+        """
+        stripped = line.lstrip()
+        for char in ("`", "~"):
+            if stripped.startswith(char * 3):
+                run_length = len(stripped) - len(stripped.lstrip(char))
+                return char * run_length
+        return None
+
+    def _heading(self, line: str) -> tuple[int, str] | None:
+        """Parse ``line`` as an ATX heading.
+
+        Returns:
+            ``(level, title)`` for a line made of one to six ``#`` characters,
+            whitespace and a non-empty title; ``None`` otherwise. Lines such
+            as ``#hashtag`` or ``#!/bin/sh`` are not headings because no
+            whitespace follows the ``#`` run.
+        """
+        stripped = line.strip()
+        if not stripped.startswith("#"):
+            return None
+        level = len(stripped) - len(stripped.lstrip("#"))
+        rest = stripped[level:]
+        # rest[:1] is "" for a bare '#' run, which also fails isspace().
+        if level > self._MAX_HEADING_LEVEL or not rest[:1].isspace():
+            return None
+        title = rest.strip()
+        if not title:
+            return None
+        return level, title
--- a/app/modules/rag/indexing/docs/chunkers/text_chunker.py
+++ b/app/modules/rag/indexing/docs/chunkers/text_chunker.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+
+class DocTextChunker:
+    """Split plain text into fixed-size character windows that may overlap."""
+
+    def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None:
+        """Configure the window size and the overlap between windows.
+
+        Args:
+            max_chars: Maximum number of characters taken per window; must be
+                at least 1.
+            overlap_chars: Number of characters at the end of one window that
+                are repeated at the start of the next.
+
+        Raises:
+            ValueError: If ``max_chars`` is smaller than 1, because such a
+                window can never consume any text.
+        """
+        if max_chars < 1:
+            raise ValueError(f"max_chars must be at least 1, got {max_chars}")
+        self._max_chars = max_chars
+        self._overlap_chars = overlap_chars
+
+    def split(self, text: str) -> list[str]:
+        """Return the non-empty, whitespace-stripped windows of ``text``.
+
+        The text is stripped first; blank input yields an empty list. Each
+        window starts ``overlap_chars`` characters before the end of the
+        previous one. If that would not move the window forward (overlap
+        equal to or larger than ``max_chars``), the overlap is dropped for
+        that step so the loop always terminates.
+
+        Args:
+            text: Text to split.
+
+        Returns:
+            The chunks in document order.
+        """
+        cleaned = text.strip()
+        if not cleaned:
+            return []
+        total = len(cleaned)
+        chunks: list[str] = []
+        start = 0
+        while start < total:
+            end = min(total, start + self._max_chars)
+            piece = cleaned[start:end].strip()
+            if piece:
+                chunks.append(piece)
+            if end >= total:
+                break
+            next_start = end - self._overlap_chars
+            # Guarantee forward progress: an overlap that reaches back to (or
+            # past) the current start would repeat the same window forever.
+            start = next_start if next_start > start else end
+        return chunks