# agent/app/modules/rag/indexing/docs/chunkers/markdown_chunker.py

from __future__ import annotations

from dataclasses import dataclass

from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker


@dataclass(slots=True)
class SectionChunk:
    section_path: str
    section_title: str
    content: str
    order: int


class MarkdownDocChunker:
    """Split Markdown text into one chunk per heading-delimited section.

    Every ATX heading (``#``-prefixed line) starts a new section. Each
    section records the chain of enclosing heading titles as its path.
    Lines inside fenced code blocks (``` or ~~~) are never treated as
    headings, so ``# comment`` lines in code samples stay in their section.

    When no section with non-blank content is found, the whole text is
    handed to the fallback text chunker instead.
    """

    def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
        """Create the chunker.

        Args:
            text_chunker: Chunker used when no Markdown section can be
                extracted. A default ``DocTextChunker`` is created when
                omitted.
        """
        self._fallback = text_chunker or DocTextChunker()

    def chunk(self, text: str) -> list[SectionChunk]:
        """Return the sections of *text* in document order.

        Args:
            text: Markdown source to split.

        Returns:
            One ``SectionChunk`` per section that has non-blank content.
            ``order`` is the number of headings seen before the section was
            emitted; it still advances for headings whose preceding section
            was empty, so the values may contain gaps. Content before the
            first heading gets the path and title ``"Document"``. If no
            section has content, the pieces returned by the fallback
            chunker's ``split`` are wrapped instead, all with path and title
            ``"Document"`` and ``order`` set to the piece index.
        """
        lines = text.splitlines()
        sections: list[SectionChunk] = []
        stack: list[tuple[int, str]] = []
        current_title = "Document"
        current_lines: list[str] = []
        order = 0
        for line, fenced in zip(lines, self._fenced_line_flags(lines)):
            # A "#" line inside a fenced code block is code, not a heading.
            heading = None if fenced else self._heading(line)
            if heading is None:
                current_lines.append(line)
                continue
            self._flush_section(sections, stack, current_title, current_lines, order)
            order += 1
            level, title = heading
            # Drop headings at the same or a deeper level: they are siblings
            # or descendants of a previous branch, not ancestors of this one.
            stack = [item for item in stack if item[0] < level]
            stack.append((level, title))
            current_title = title
            current_lines = []
        self._flush_section(sections, stack, current_title, current_lines, order)
        if sections:
            return sections
        chunks = self._fallback.split(text)
        return [
            SectionChunk(section_path="Document", section_title="Document", content=chunk, order=index)
            for index, chunk in enumerate(chunks)
        ]

    @staticmethod
    def _fenced_line_flags(lines: list[str]) -> list[bool]:
        """Flag every line that delimits or lies inside a fenced code block.

        A fence opens on a line that, ignoring indentation, starts with at
        least three backticks or three tildes; a backtick fence may not have
        further backticks after the run (that would be an inline code span).
        It closes on a line made only of the same character, repeated at
        least as many times as in the opening run. An unclosed fence extends
        to the end of the input.

        Args:
            lines: Document lines without line terminators.

        Returns:
            A list parallel to *lines*; ``True`` marks opening/closing fence
            lines and everything between them.
        """
        flags: list[bool] = []
        open_fence: tuple[str, int] | None = None
        for line in lines:
            stripped = line.strip()
            marker = stripped[:1]
            # Tuple membership on purpose: "" must not count as a marker.
            if marker in ("`", "~"):
                run = len(stripped) - len(stripped.lstrip(marker))
            else:
                run = 0
            rest = stripped[run:]
            if open_fence is not None:
                flags.append(True)
                if marker == open_fence[0] and run >= open_fence[1] and not rest:
                    open_fence = None
                continue
            opens = run >= 3 and not (marker == "`" and "`" in rest)
            if opens:
                open_fence = (marker, run)
            flags.append(opens)
        return flags

    def _flush_section(
        self,
        sections: list[SectionChunk],
        stack: list[tuple[int, str]],
        current_title: str,
        current_lines: list[str],
        order: int,
    ) -> None:
        """Append the accumulated lines to *sections* as one chunk.

        Nothing is appended when the joined lines are blank.

        Args:
            sections: Output list, mutated in place.
            stack: ``(level, title)`` pairs of the currently open headings,
                outermost first.
            current_title: Title used when *stack* is empty (content before
                the first heading).
            current_lines: Body lines collected for the section.
            order: Value stored in the chunk's ``order`` field.
        """
        content = "\n".join(current_lines).strip()
        if not content:
            return
        titles = [title for _, title in stack] or [current_title]
        sections.append(
            SectionChunk(
                section_path=" > ".join(titles),
                section_title=titles[-1],
                content=content,
                order=order,
            )
        )

    def _heading(self, line: str) -> tuple[int, str] | None:
        """Parse *line* as an ATX heading.

        Args:
            line: A single line of the document.

        Returns:
            ``(level, title)`` where ``level`` is the number of leading
            ``#`` characters, or ``None`` when the line does not start with
            ``#`` (after stripping whitespace) or has no title text.
        """
        stripped = line.strip()
        if not stripped.startswith("#"):
            return None
        level = len(stripped) - len(stripped.lstrip("#"))
        title = stripped[level:].strip()
        if not title:
            return None
        return level, title