22 lines
707 B
Python
22 lines
707 B
Python
from __future__ import annotations
|
|
|
|
|
|
class DocTextChunker:
    """Split text into character windows of bounded size that overlap.

    Each chunk holds at most ``max_chars`` characters; consecutive chunks
    share ``overlap_chars`` characters so that content cut at a window
    boundary also appears at the start of the following chunk.
    """

    def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None:
        """Configure the window size and the overlap between windows.

        Args:
            max_chars: Maximum number of characters per chunk. Must be
                positive.
            overlap_chars: Number of trailing characters of one window that
                are repeated at the start of the next. Must be smaller than
                ``max_chars``. A negative value is accepted and makes the
                next window start past the end of the previous one, so
                characters in between are skipped.

        Raises:
            ValueError: If ``max_chars`` is not positive, or if
                ``overlap_chars`` is not smaller than ``max_chars``.
        """
        # Both checks guard against a window that cannot move forward:
        # without them ``split`` would loop forever on long input.
        if max_chars <= 0:
            raise ValueError(f"max_chars must be positive, got {max_chars}")
        if overlap_chars >= max_chars:
            raise ValueError(
                "overlap_chars must be smaller than max_chars, "
                f"got overlap_chars={overlap_chars} and max_chars={max_chars}"
            )
        self._max_chars = max_chars
        self._overlap_chars = overlap_chars

    def split(self, text: str) -> list[str]:
        """Return the overlapping chunks of ``text``.

        Leading and trailing whitespace is stripped from the input and from
        every chunk. Chunks that are empty after stripping are omitted.

        Args:
            text: The text to split.

        Returns:
            The chunks in document order; an empty list when ``text`` is
            empty or contains only whitespace.
        """
        cleaned = text.strip()
        if not cleaned:
            return []

        total = len(cleaned)
        chunks: list[str] = []
        start = 0
        while start < total:
            end = min(total, start + self._max_chars)
            piece = cleaned[start:end].strip()
            if piece:
                chunks.append(piece)
            if end >= total:
                break
            # The constructor guarantees overlap < max_chars, so the next
            # start is strictly greater than the current one.
            start = max(0, end - self._overlap_chars)
        return chunks
|