from __future__ import annotations

from dataclasses import dataclass

from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker

# ATX headings use between one and six leading "#" characters.
_MAX_HEADING_LEVEL = 6
# A code fence is a run of at least three backticks or tildes.
_MIN_FENCE_LENGTH = 3
_FENCE_CHARS = ("`", "~")


@dataclass(slots=True)
class SectionChunk:
    """One chunk of a document together with its heading context."""

    # Titles of the enclosing headings joined with " > ", outermost first
    # ("Document" when the chunk is not under any heading).
    section_path: str
    # Last element of the heading path.
    section_title: str
    # Text of the chunk.
    content: str
    # Position marker assigned by the chunker (see MarkdownDocChunker.chunk).
    order: int


class MarkdownDocChunker:
    """Split Markdown text into sections delimited by ATX (``#``) headings."""

    def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
        """Store the chunker used when no section can be extracted.

        Args:
            text_chunker: Fallback chunker; a default ``DocTextChunker`` is
                created when ``None`` is passed.
        """
        # Compare against None explicitly so an injected chunker is never
        # discarded just because it happens to evaluate as falsy.
        self._fallback = DocTextChunker() if text_chunker is None else text_chunker

    def chunk(self, text: str) -> list[SectionChunk]:
        """Return the sections of *text*, one per heading that has content.

        Text before the first heading becomes a section titled "Document".
        Sections whose body is empty after stripping are omitted. Lines inside
        fenced code blocks are always treated as content, so ``#`` comments in
        code samples do not start new sections.

        ``order`` is the number of headings recognised before the section's
        body, so it may contain gaps when headings have no content. When no
        section is produced at all, the text is handed to the fallback
        chunker and its chunks are numbered from 0.
        """
        sections: list[SectionChunk] = []
        stack: list[tuple[int, str]] = []
        current_title = "Document"
        current_lines: list[str] = []
        order = 0
        # (fence character, fence length) of the currently open code fence.
        open_fence: tuple[str, int] | None = None

        for line in text.splitlines():
            fence = self._fence_run(line)

            if open_fence is not None:
                current_lines.append(line)
                if fence is not None:
                    marker, length, rest = fence
                    # A closing fence uses the same character, is at least as
                    # long as the opener and carries no trailing text.
                    if marker == open_fence[0] and length >= open_fence[1] and not rest:
                        open_fence = None
                continue

            if fence is not None:
                marker, length, rest = fence
                # A backtick fence whose info string contains a backtick is
                # inline code, not the start of a fenced block.
                if marker == "~" or "`" not in rest:
                    open_fence = (marker, length)
                    current_lines.append(line)
                    continue

            heading = self._heading(line)
            if heading is None:
                current_lines.append(line)
                continue

            self._flush_section(sections, stack, current_title, current_lines, order)
            order += 1
            level, title = heading
            # Drop headings at the same or a deeper level: they are siblings
            # or descendants of siblings, not ancestors of the new heading.
            stack = [item for item in stack if item[0] < level]
            stack.append((level, title))
            current_title = title
            current_lines = []

        self._flush_section(sections, stack, current_title, current_lines, order)
        if sections:
            return sections

        # No section had any content: delegate to the plain-text chunker.
        chunks = self._fallback.split(text)
        return [
            SectionChunk(
                section_path="Document",
                section_title="Document",
                content=chunk,
                order=index,
            )
            for index, chunk in enumerate(chunks)
        ]

    def _flush_section(
        self,
        sections: list[SectionChunk],
        stack: list[tuple[int, str]],
        current_title: str,
        current_lines: list[str],
        order: int,
    ) -> None:
        """Append the accumulated lines to *sections* as one ``SectionChunk``.

        Nothing is appended when the joined lines are empty after stripping.
        The path is built from the titles in *stack*; *current_title* is used
        only when the stack is empty (text before the first heading).
        """
        content = "\n".join(current_lines).strip()
        if not content:
            return
        titles = [title for _, title in stack] or [current_title]
        sections.append(
            SectionChunk(
                section_path=" > ".join(titles),
                section_title=titles[-1],
                content=content,
                order=order,
            )
        )

    def _heading(self, line: str) -> tuple[int, str] | None:
        """Parse *line* as an ATX heading.

        Returns:
            ``(level, title)`` for a line made of one to six ``#`` characters
            followed by whitespace and a non-empty title, otherwise ``None``.
            Leading whitespace of any width is tolerated, and an optional
            closing run of ``#`` is removed from the title.
        """
        stripped = line.strip()
        level = len(stripped) - len(stripped.lstrip("#"))
        if not 1 <= level <= _MAX_HEADING_LEVEL:
            return None

        rest = stripped[level:]
        # "#hashtag" or "#!/bin/sh" are not headings: the opening run must be
        # followed by a space or tab.
        if rest and rest[0] not in " \t":
            return None

        title = rest.strip()
        # Remove an optional closing sequence ("# Title ##"). It only counts
        # when separated from the title by whitespace, so "C#" stays intact.
        without_closing = title.rstrip("#")
        if without_closing != title and (
            not without_closing or without_closing[-1] in " \t"
        ):
            title = without_closing.rstrip()

        if not title:
            return None
        return level, title

    @staticmethod
    def _fence_run(line: str) -> tuple[str, int, str] | None:
        """Describe the code-fence marker that starts *line*, if any.

        Returns:
            ``(fence character, run length, trailing text)`` when the line,
            ignoring surrounding whitespace, begins with at least three
            backticks or three tildes; otherwise ``None``.
        """
        stripped = line.strip()
        marker = stripped[:1]
        if marker not in _FENCE_CHARS:
            return None
        length = len(stripped) - len(stripped.lstrip(marker))
        if length < _MIN_FENCE_LENGTH:
            return None
        return marker, length, stripped[length:]