Новый раг

This commit is contained in:
2026-03-01 14:21:33 +03:00
parent 2728c07ba9
commit 1ef0b4d68c
95 changed files with 3145 additions and 927 deletions

View File

@@ -0,0 +1,77 @@
from __future__ import annotations
from dataclasses import dataclass
from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker
@dataclass(slots=True)
class SectionChunk:
section_path: str
section_title: str
content: str
order: int
class MarkdownDocChunker:
    """Split Markdown text into sections delimited by ATX (``#``) headings.

    Each emitted ``SectionChunk`` carries the body text found under a heading
    and a ``section_path`` built from the enclosing heading titles joined with
    ``" > "``. Text that appears before the first heading is reported under
    the synthetic title ``"Document"``.
    """

    def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
        """Store the splitter used when no section with body text is found.

        Args:
            text_chunker: Object exposing ``split(text) -> list[str]``. A
                default ``DocTextChunker`` is created when omitted.
        """
        self._fallback = text_chunker or DocTextChunker()

    def chunk(self, text: str) -> list[SectionChunk]:
        """Return the sections of *text* in document order.

        Args:
            text: Raw Markdown source.

        Returns:
            One ``SectionChunk`` per heading that has non-blank body text.
            Headings without body text produce no chunk but still advance
            ``order``, so ``order`` values may have gaps. When no section has
            any body text, the whole input is passed to the fallback splitter
            and its pieces are returned under the title ``"Document"``.
        """
        sections: list[SectionChunk] = []
        # Stack of (level, title) for the headings enclosing the current one.
        stack: list[tuple[int, str]] = []
        current_title = "Document"
        current_lines: list[str] = []
        order = 0
        for line, heading in self._scan(text.splitlines()):
            if heading is None:
                current_lines.append(line)
                continue
            # A new heading closes whatever section was being collected.
            self._flush_section(sections, stack, current_title, current_lines, order)
            order += 1
            level, title = heading
            # Drop siblings and deeper headings; keep only true ancestors.
            stack = [item for item in stack if item[0] < level]
            stack.append((level, title))
            current_title = title
            current_lines = []
        self._flush_section(sections, stack, current_title, current_lines, order)
        if sections:
            return sections
        # NOTE(review): this branch is only reached when no section has body
        # text (empty input or headings only). A heading-less document with
        # content yields a single "Document" section above and is not
        # size-limited here -- confirm that is the intended behavior.
        chunks = self._fallback.split(text)
        return [
            SectionChunk(section_path="Document", section_title="Document", content=chunk, order=index)
            for index, chunk in enumerate(chunks)
        ]

    def _flush_section(
        self,
        sections: list[SectionChunk],
        stack: list[tuple[int, str]],
        current_title: str,
        current_lines: list[str],
        order: int,
    ) -> None:
        """Append the collected lines to *sections* as one chunk, if non-blank.

        Args:
            sections: Output list, mutated in place.
            stack: Heading trail of the section being closed.
            current_title: Title used when *stack* is empty (text that
                precedes the first heading).
            current_lines: Body lines collected for the section.
            order: Value stored in the chunk's ``order`` field.
        """
        content = "\n".join(current_lines).strip()
        if not content:
            return
        titles = [title for _, title in stack] or [current_title]
        sections.append(
            SectionChunk(
                section_path=" > ".join(titles),
                section_title=titles[-1],
                content=content,
                order=order,
            )
        )

    def _scan(self, lines: list[str]) -> list[tuple[str, tuple[int, str] | None]]:
        """Pair every line with its parsed heading, ignoring fenced code.

        Lines inside a ``` or ~~~ fenced code block (and the fence lines
        themselves) are always reported as non-headings, so that comments
        such as ``# install deps`` inside a code sample do not start a new
        section. An unclosed fence extends to the end of the input.
        """
        scanned: list[tuple[str, tuple[int, str] | None]] = []
        open_fence: str | None = None
        for line in lines:
            marker = self._fence_marker(line)
            if open_fence is not None:
                # A fence is closed by a line made only of the same fence
                # character, at least as long as the opening run.
                closes = (
                    marker is not None
                    and marker[0] == open_fence[0]
                    and len(marker) >= len(open_fence)
                    and line.strip() == marker
                )
                if closes:
                    open_fence = None
                scanned.append((line, None))
            elif marker is not None:
                open_fence = marker
                scanned.append((line, None))
            else:
                scanned.append((line, self._heading(line)))
        return scanned

    def _fence_marker(self, line: str) -> str | None:
        """Return the leading run of 3+ backticks or tildes, or ``None``."""
        stripped = line.strip()
        for char in ("`", "~"):
            run = len(stripped) - len(stripped.lstrip(char))
            if run < 3:
                continue
            # A backtick run followed by more backticks on the same line is
            # inline code (e.g. ```x```), not a fence opener.
            if char == "`" and "`" in stripped[run:]:
                return None
            return stripped[:run]
        return None

    def _heading(self, line: str) -> tuple[int, str] | None:
        """Parse *line* as an ATX heading.

        A heading is 1-6 ``#`` characters followed by whitespace and a
        non-empty title. Lines such as ``#hashtag`` or ``#include <x>`` are
        not headings. An optional closing run of ``#`` separated from the
        title by whitespace (``## Title ##``) is removed.

        Returns:
            ``(level, title)`` for a heading, otherwise ``None``.
        """
        stripped = line.strip()
        level = len(stripped) - len(stripped.lstrip("#"))
        if not 1 <= level <= 6:
            return None
        rest = stripped[level:]
        if rest and not rest[0].isspace():
            return None
        title = rest.strip()
        without_closer = title.rstrip("#")
        # Only trim trailing hashes when they form a separate closing run;
        # titles like "C#" keep their hash.
        if without_closer != title and without_closer and without_closer[-1].isspace():
            title = without_closer.rstrip()
        if not title:
            return None
        return level, title

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
class DocTextChunker:
    """Split text into fixed-size character windows that overlap."""

    def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None:
        """Configure the window size and the overlap between windows.

        Args:
            max_chars: Maximum number of characters per chunk; must be > 0.
            overlap_chars: Number of trailing characters of a chunk repeated
                at the start of the next one; must satisfy
                ``0 <= overlap_chars < max_chars``.

        Raises:
            ValueError: If the arguments are outside the ranges above. Such
                values would keep the window from advancing (an endless loop
                in ``split``) or make it skip characters.
        """
        if max_chars <= 0:
            raise ValueError("max_chars must be positive")
        if overlap_chars < 0:
            raise ValueError("overlap_chars must not be negative")
        if overlap_chars >= max_chars:
            raise ValueError("overlap_chars must be smaller than max_chars")
        self._max_chars = max_chars
        self._overlap_chars = overlap_chars

    def split(self, text: str) -> list[str]:
        """Return the overlapping chunks of *text*.

        Args:
            text: Input text; surrounding whitespace is removed first.

        Returns:
            Chunks in document order, each stripped of surrounding
            whitespace. Chunks that are empty after stripping are omitted,
            so blank input yields an empty list.
        """
        cleaned = text.strip()
        if not cleaned:
            return []
        total = len(cleaned)
        chunks: list[str] = []
        start = 0
        while start < total:
            end = min(total, start + self._max_chars)
            piece = cleaned[start:end].strip()
            if piece:
                chunks.append(piece)
            if end >= total:
                break
            # The constructor guarantees overlap < max_chars, so this always
            # moves the window forward.
            start = end - self._overlap_chars
        return chunks