Новый раг
This commit is contained in:
77
app/modules/rag/indexing/docs/chunkers/markdown_chunker.py
Normal file
77
app/modules/rag/indexing/docs/chunkers/markdown_chunker.py
Normal file
@@ -0,0 +1,77 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class SectionChunk:
|
||||
section_path: str
|
||||
section_title: str
|
||||
content: str
|
||||
order: int
|
||||
|
||||
|
||||
class MarkdownDocChunker:
    """Split a Markdown document into chunks that follow its heading outline.

    Every ATX heading (``#`` .. ``######``) starts a new section. The text
    between two headings becomes one ``SectionChunk`` whose ``section_path``
    lists the enclosing heading titles joined with ``" > "``. Text that
    precedes the first heading is filed under the title ``"Document"``.
    When no section with content is found, the whole text is handed to the
    fallback text chunker instead.
    """

    # CommonMark allows at most six ``#`` in an ATX heading.
    _MAX_HEADING_LEVEL = 6
    # A code fence is a run of at least three backticks or tildes.
    _FENCE_CHARS = "`~"
    _MIN_FENCE_LENGTH = 3

    def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
        """Create the chunker.

        Args:
            text_chunker: Chunker used when the text yields no Markdown
                section. A default ``DocTextChunker`` is created when the
                argument is omitted (or falsy).
        """
        self._fallback = text_chunker or DocTextChunker()

    def chunk(self, text: str) -> list[SectionChunk]:
        """Return the chunks of *text* in document order.

        Args:
            text: Markdown source of the document.

        Returns:
            One ``SectionChunk`` per section that has non-blank content. If
            there is none, one chunk per piece returned by the fallback
            chunker's ``split``, all titled ``"Document"``.
        """
        sections: list[SectionChunk] = []
        stack: list[tuple[int, str]] = []
        current_title = "Document"
        current_lines: list[str] = []
        order = 0
        # (fence character, fence length) while inside a fenced code block.
        open_fence: tuple[str, int] | None = None

        for line in text.splitlines():
            if open_fence is not None:
                # Inside a fence nothing is Markdown structure: a "# comment"
                # in a code sample must not start a new section.
                if self._closes_fence(line, open_fence):
                    open_fence = None
                current_lines.append(line)
                continue

            open_fence = self._opening_fence(line)
            heading = None if open_fence is not None else self._heading(line)
            if heading is None:
                current_lines.append(line)
                continue

            self._flush_section(sections, stack, current_title, current_lines, order)
            # The counter advances per heading, even when the flushed section
            # was empty, so ``order`` reflects the heading's position.
            order += 1
            level, title = heading
            # Drop siblings and deeper headings; keep only the ancestors.
            stack = [item for item in stack if item[0] < level]
            stack.append((level, title))
            current_title = title
            current_lines = []

        # An unterminated fence simply runs to the end of the document.
        self._flush_section(sections, stack, current_title, current_lines, order)
        if sections:
            return sections

        return [
            SectionChunk(section_path="Document", section_title="Document", content=piece, order=index)
            for index, piece in enumerate(self._fallback.split(text))
        ]

    def _flush_section(
        self,
        sections: list[SectionChunk],
        stack: list[tuple[int, str]],
        current_title: str,
        current_lines: list[str],
        order: int,
    ) -> None:
        """Append the accumulated lines to *sections* as one chunk.

        Nothing is appended when the joined lines are blank.

        Args:
            sections: Output list, extended in place.
            stack: ``(level, title)`` pairs of the headings enclosing the
                section, outermost first.
            current_title: Title used when *stack* is empty.
            current_lines: Lines collected for the section.
            order: Ordinal stored on the chunk.
        """
        content = "\n".join(current_lines).strip()
        if not content:
            return
        titles = [title for _, title in stack] or [current_title]
        sections.append(
            SectionChunk(
                section_path=" > ".join(titles),
                section_title=titles[-1],
                content=content,
                order=order,
            )
        )

    def _heading(self, line: str) -> tuple[int, str] | None:
        """Parse *line* as an ATX heading.

        Returns:
            ``(level, title)`` for a heading with a non-empty title,
            otherwise ``None``.
        """
        stripped = line.strip()
        level = len(stripped) - len(stripped.lstrip("#"))
        if not 1 <= level <= self._MAX_HEADING_LEVEL:
            return None
        rest = stripped[level:]
        # "#hashtag", "#!/bin/sh" and "#123" are ordinary text: the opening
        # run of "#" has to be followed by whitespace.
        if not rest or not rest[0].isspace():
            return None
        title = rest.strip()
        # Remove an optional closing sequence ("## Title ##"), but keep a
        # trailing "#" that is glued to a word ("# Learn C#").
        trimmed = title.rstrip("#")
        if trimmed != title and (not trimmed or trimmed[-1].isspace()):
            title = trimmed.rstrip()
        if not title:
            return None
        return level, title

    def _fence_run(self, line: str) -> tuple[str, int, str] | None:
        """Return ``(char, length, rest)`` if *line* starts with a fence run.

        ``rest`` is the text that follows the run, with the line's outer
        whitespace already removed. Returns ``None`` when the line does not
        start with at least three backticks or tildes.
        """
        stripped = line.strip()
        if not stripped or stripped[0] not in self._FENCE_CHARS:
            return None
        char = stripped[0]
        length = len(stripped) - len(stripped.lstrip(char))
        if length < self._MIN_FENCE_LENGTH:
            return None
        return char, length, stripped[length:]

    def _opening_fence(self, line: str) -> tuple[str, int] | None:
        """Return ``(char, length)`` if *line* opens a fenced code block."""
        run = self._fence_run(line)
        if run is None:
            return None
        char, length, info = run
        # A backtick in the info string means inline code such as
        # "```code```", not the start of a block.
        if char == "`" and "`" in info:
            return None
        return char, length

    def _closes_fence(self, line: str, fence: tuple[str, int]) -> bool:
        """Tell whether *line* closes the open *fence* ``(char, length)``.

        The closing run must use the same character, be at least as long as
        the opening run, and be followed by nothing but whitespace.
        """
        run = self._fence_run(line)
        if run is None:
            return False
        char, length, rest = run
        return char == fence[0] and length >= fence[1] and not rest
|
||||
Reference in New Issue
Block a user