Новый раг
This commit is contained in:
Binary file not shown.
Binary file not shown.
77
app/modules/rag/indexing/docs/chunkers/markdown_chunker.py
Normal file
77
app/modules/rag/indexing/docs/chunkers/markdown_chunker.py
Normal file
@@ -0,0 +1,77 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class SectionChunk:
|
||||
section_path: str
|
||||
section_title: str
|
||||
content: str
|
||||
order: int
|
||||
|
||||
|
||||
class MarkdownDocChunker:
    """Split a Markdown document into one chunk per heading-delimited section.

    Headings are recognised using the CommonMark ATX rules (1-6 ``#``
    characters followed by whitespace, at most three spaces of indentation).
    Lines inside fenced code blocks are never treated as headings, so ``#``
    comments in code samples stay inside the section that contains them.
    When the document yields no section at all, the text is handed to the
    fallback text chunker instead.
    """

    def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
        """Create the chunker.

        Args:
            text_chunker: Chunker used when no section can be extracted.
                A default ``DocTextChunker`` is created when omitted.
        """
        self._fallback = text_chunker or DocTextChunker()

    def chunk(self, text: str) -> list[SectionChunk]:
        """Return the sections of ``text`` in document order.

        Args:
            text: Raw Markdown source.

        Returns:
            One ``SectionChunk`` per heading that has non-blank body text
            (plus one for any text before the first heading, titled
            ``"Document"``). ``order`` is the number of headings seen before
            the section body, so values increase through the document but
            may have gaps where a heading had no body. If no section is
            produced, the fallback chunker's pieces are returned as
            ``"Document"`` chunks numbered from 0.
        """
        sections: list[SectionChunk] = []
        # Open headings from outermost to innermost as (level, title) pairs.
        stack: list[tuple[int, str]] = []
        current_title = "Document"
        current_lines: list[str] = []
        order = 0
        # (fence character, run length) of the code fence we are inside, if any.
        open_fence: tuple[str, int] | None = None

        for line in text.splitlines():
            fence = self._fence(line)

            if open_fence is not None:
                # Inside a fenced code block everything is body text; only a
                # matching bare fence of at least the opening length closes it.
                if (
                    fence is not None
                    and fence[0] == open_fence[0]
                    and fence[1] >= open_fence[1]
                    and not fence[2]
                ):
                    open_fence = None
                current_lines.append(line)
                continue

            # A backtick fence whose info string contains a backtick is an
            # inline code span, not the start of a code block.
            if fence is not None and not (fence[0] == "`" and "`" in fence[2]):
                open_fence = (fence[0], fence[1])

            heading = self._heading(line)
            if heading is None:
                current_lines.append(line)
                continue

            # A new heading closes the section collected so far.
            self._flush_section(sections, stack, current_title, current_lines, order)
            order += 1
            level, title = heading
            # Drop headings at the same or a deeper level: they are siblings
            # or descendants of a previous branch, not ancestors of this one.
            stack = [item for item in stack if item[0] < level]
            stack.append((level, title))
            current_title = title
            current_lines = []

        self._flush_section(sections, stack, current_title, current_lines, order)
        if sections:
            return sections

        # No section had any body text: fall back to size-based chunks.
        chunks = self._fallback.split(text)
        return [
            SectionChunk(section_path="Document", section_title="Document", content=chunk, order=index)
            for index, chunk in enumerate(chunks)
        ]

    def _flush_section(
        self,
        sections: list[SectionChunk],
        stack: list[tuple[int, str]],
        current_title: str,
        current_lines: list[str],
        order: int,
    ) -> None:
        """Append the collected lines to ``sections`` as one chunk.

        Nothing is appended when the joined lines are blank.

        Args:
            sections: Output list, mutated in place.
            stack: Open headings as (level, title), outermost first.
            current_title: Title used when ``stack`` is empty, i.e. for text
                that precedes the first heading.
            current_lines: Body lines gathered for the section.
            order: Value stored in the chunk's ``order`` field.
        """
        content = "\n".join(current_lines).strip()
        if not content:
            return
        titles = [title for _, title in stack] or [current_title]
        sections.append(
            SectionChunk(
                section_path=" > ".join(titles),
                section_title=titles[-1],
                content=content,
                order=order,
            )
        )

    def _heading(self, line: str) -> tuple[int, str] | None:
        """Parse ``line`` as an ATX heading.

        Args:
            line: A single line of the document.

        Returns:
            ``(level, title)`` for a heading with a non-empty title,
            otherwise ``None``. Lines indented four or more columns, runs of
            more than six ``#`` and ``#`` not followed by whitespace (for
            example ``#hashtag``) are not headings. An optional closing run
            of ``#`` preceded by whitespace is removed from the title.
        """
        if self._indent_width(line) > 3:
            # Four or more columns of indentation is never a heading.
            return None
        stripped = line.strip()
        if not stripped.startswith("#"):
            return None
        level = len(stripped) - len(stripped.lstrip("#"))
        if level > 6:
            return None
        rest = stripped[level:]
        if not rest[:1].isspace():
            # Covers both "#word" and a bare "#" with nothing after it.
            return None
        title = rest.strip()
        # Strip a closing sequence such as "## Title ##" but keep "C#" intact.
        without_closer = title.rstrip("#")
        if without_closer != title and (not without_closer or without_closer[-1].isspace()):
            title = without_closer.rstrip()
        if not title:
            return None
        return level, title

    def _indent_width(self, line: str) -> int:
        """Return the number of leading space columns, with tabs expanded to 4."""
        expanded = line.expandtabs(4)
        return len(expanded) - len(expanded.lstrip(" "))

    def _fence(self, line: str) -> tuple[str, int, str] | None:
        """Describe the code-fence run at the start of ``line``, if any.

        Returns:
            ``(fence_char, run_length, trailing_text)`` when the line, after
            at most three columns of indentation, begins with three or more
            backticks or tildes; otherwise ``None``.
        """
        if self._indent_width(line) > 3:
            return None
        stripped = line.strip()
        for marker in ("`", "~"):
            run = len(stripped) - len(stripped.lstrip(marker))
            if run >= 3:
                return marker, run, stripped[run:].strip()
        return None
|
||||
21
app/modules/rag/indexing/docs/chunkers/text_chunker.py
Normal file
21
app/modules/rag/indexing/docs/chunkers/text_chunker.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class DocTextChunker:
    """Split plain text into fixed-size character windows that overlap."""

    def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None:
        """Configure the window size and overlap.

        Args:
            max_chars: Maximum number of characters per chunk (before
                whitespace trimming). Must be positive.
            overlap_chars: Number of characters shared by consecutive
                chunks. Values outside ``[0, max_chars - 1]`` are clamped
                into that range so that every window starts strictly after
                the previous one.

        Raises:
            ValueError: If ``max_chars`` is not positive.
        """
        if max_chars <= 0:
            raise ValueError(f"max_chars must be positive, got {max_chars}")
        self._max_chars = max_chars
        # An overlap as large as the window would stop the window from
        # advancing; a negative one would skip text between windows.
        self._overlap_chars = min(max(overlap_chars, 0), max_chars - 1)

    def split(self, text: str) -> list[str]:
        """Return the overlapping chunks of ``text``.

        Args:
            text: Text to split; surrounding whitespace is ignored.

        Returns:
            Chunks in order, each stripped of surrounding whitespace.
            Chunks that are empty after stripping are dropped, and blank
            input yields an empty list.
        """
        cleaned = text.strip()
        if not cleaned:
            return []

        total = len(cleaned)
        # Distance between consecutive window starts; kept at least 1 so the
        # loop always terminates.
        stride = max(1, self._max_chars - self._overlap_chars)

        chunks: list[str] = []
        for start in range(0, total, stride):
            end = start + self._max_chars
            piece = cleaned[start:end].strip()
            if piece:
                chunks.append(piece)
            if end >= total:
                # This window already reached the end of the text.
                break
        return chunks
|
||||
Reference in New Issue
Block a user