78 lines
2.4 KiB
Python
78 lines
2.4 KiB
Python
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker
|
|
|
|
|
|
@dataclass(slots=True)
class SectionChunk:
    """A piece of document text tagged with the section it belongs to."""

    # Heading titles from the outermost to the innermost enclosing section,
    # joined with " > "; "Document" when the text is not under any heading.
    section_path: str
    # Title of the innermost section (the last component of section_path).
    section_title: str
    # Text of the chunk.
    content: str
    # Position value assigned by the chunker that produced this chunk.
    order: int
|
|
|
|
|
|
class MarkdownDocChunker:
    """Split Markdown text into one chunk per heading-delimited section.

    Each ATX heading (``#`` .. ``######``) starts a new section.  A section's
    path is built from the titles of the headings that enclose it.  Lines
    inside fenced code blocks (``` or ~~~) are always treated as content, so
    a ``# comment`` in a code sample never starts a section.  When the text
    yields no section at all, the injected text chunker splits it instead.
    """

    def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
        """Store the fallback chunker, creating a default one when omitted."""
        # Compare against None explicitly so an injected chunker that happens
        # to be falsy is not silently replaced by the default.
        self._fallback = text_chunker if text_chunker is not None else DocTextChunker()

    def chunk(self, text: str) -> list[SectionChunk]:
        """Return the sections of *text* in document order.

        ``order`` is 0 for text that precedes the first heading and ``k`` for
        the section opened by the k-th heading.  Sections without any body
        text are omitted, so the values may contain gaps.  If no section is
        produced, the result is the fallback chunker's output wrapped in
        "Document" chunks numbered from 0.
        """
        sections: list[SectionChunk] = []
        stack: list[tuple[int, str]] = []
        current_title = "Document"
        current_lines: list[str] = []
        order = 0
        open_fence: str | None = None  # fence run that opened the current code block

        for line in text.splitlines():
            if open_fence is not None:
                # Inside a fenced code block: everything is content until the
                # matching closing fence (or the end of the document).
                current_lines.append(line)
                if self._closes_fence(line, open_fence):
                    open_fence = None
                continue

            fence = self._fence_opener(line)
            if fence is not None:
                open_fence = fence
                current_lines.append(line)
                continue

            heading = self._heading(line)
            if heading is None:
                current_lines.append(line)
                continue

            # A heading closes the section collected so far.
            self._flush_section(sections, stack, current_title, current_lines, order)
            order += 1
            level, title = heading
            # Keep only strictly shallower headings as ancestors of the new one.
            stack = [item for item in stack if item[0] < level]
            stack.append((level, title))
            current_title = title
            current_lines = []

        self._flush_section(sections, stack, current_title, current_lines, order)
        if sections:
            return sections

        # No section with content was found: let the fallback split the text.
        return [
            SectionChunk(
                section_path="Document",
                section_title="Document",
                content=piece,
                order=index,
            )
            for index, piece in enumerate(self._fallback.split(text))
        ]

    def _flush_section(
        self,
        sections: list[SectionChunk],
        stack: list[tuple[int, str]],
        current_title: str,
        current_lines: list[str],
        order: int,
    ) -> None:
        """Append the collected lines to *sections* as one chunk.

        Nothing is appended when the collected text is empty after stripping.
        The section path comes from the heading *stack*; *current_title* is
        used only when the stack is empty (text before the first heading).
        """
        content = "\n".join(current_lines).strip()
        if not content:
            return
        titles = [title for _, title in stack] or [current_title]
        sections.append(
            SectionChunk(
                section_path=" > ".join(titles),
                section_title=titles[-1],
                content=content,
                order=order,
            )
        )

    def _heading(self, line: str) -> tuple[int, str] | None:
        """Parse *line* as an ATX heading and return ``(level, title)``.

        Returns ``None`` unless the line, ignoring surrounding whitespace,
        starts with one to six ``#`` characters followed by whitespace and a
        non-empty title.  An optional closing run of hashes (``## Title ##``)
        is removed from the title.
        """
        stripped = line.strip()
        if not stripped.startswith("#"):
            return None
        level = len(stripped) - len(stripped.lstrip("#"))
        if level > 6:
            return None
        rest = stripped[level:]
        # "#hashtag", "#!/bin/sh" and "#include" are not headings: the hashes
        # must be separated from the title by whitespace.
        if rest and not rest[0].isspace():
            return None
        title = rest.strip()
        # Drop a closing hash run, but only when whitespace separates it from
        # the title so that names such as "C#" stay intact.
        without_closer = title.rstrip("#")
        if without_closer != title and (not without_closer or without_closer[-1].isspace()):
            title = without_closer.rstrip()
        if not title:
            return None
        return level, title

    @staticmethod
    def _fence_opener(line: str) -> str | None:
        """Return the fence run if *line* opens a fenced code block, else None."""
        stripped = line.strip()
        for marker in ("`", "~"):
            run = len(stripped) - len(stripped.lstrip(marker))
            if run < 3:
                continue
            # A backtick fence may not have backticks in its info string;
            # such a line is inline code, not a fence.
            if marker == "`" and "`" in stripped[run:]:
                return None
            return stripped[:run]
        return None

    @staticmethod
    def _closes_fence(line: str, fence: str) -> bool:
        """Tell whether *line* closes the code block opened by *fence*."""
        stripped = line.strip()
        # A closing fence uses the same character, is at least as long as the
        # opening run, and carries nothing else on the line.
        return len(stripped) >= len(fence) and stripped == fence[0] * len(stripped)
|