21 lines
690 B
Python
21 lines
690 B
Python
class TextChunker:
    """Split text into fixed-size character windows that overlap.

    Every window covers at most ``chunk_size`` characters of the input
    (after ``\\r\\n`` is normalized to ``\\n``). Each window after the first
    starts ``overlap`` characters before the end of the previous one.
    Windows are stripped of surrounding whitespace before being returned,
    and windows that contain only whitespace are dropped.
    """

    def __init__(self, chunk_size: int = 900, overlap: int = 120) -> None:
        """Configure the window size and the overlap between windows.

        Args:
            chunk_size: Maximum number of characters per window. Must be
                positive.
            overlap: Number of characters shared by consecutive windows.
                Must satisfy ``0 <= overlap < chunk_size``.

        Raises:
            ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
                negative or not smaller than ``chunk_size``.
        """
        # Without these checks the window start would never advance (or
        # would skip text), so chunk() could loop forever on long input.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= overlap < chunk_size:
            raise ValueError(
                "overlap must satisfy 0 <= overlap < chunk_size, "
                f"got overlap={overlap} with chunk_size={chunk_size}"
            )
        self._chunk_size = chunk_size
        self._overlap = overlap

    def chunk(self, text: str) -> list[str]:
        """Return the overlapping chunks of *text*, in order.

        Args:
            text: The text to split. Windows-style ``\\r\\n`` line endings
                are converted to ``\\n`` before windowing.

        Returns:
            A list of whitespace-stripped chunks. The list is empty when
            *text* is empty or contains only whitespace.
        """
        cleaned = text.replace("\r\n", "\n")
        if not cleaned.strip():
            return []

        total = len(cleaned)
        # Validated in __init__ to be >= 1, so every window starts strictly
        # after the previous one and the loop always terminates.
        step = self._chunk_size - self._overlap

        chunks: list[str] = []
        for start in range(0, total, step):
            end = min(total, start + self._chunk_size)
            piece = cleaned[start:end].strip()
            if piece:
                chunks.append(piece)
            if end == total:
                # This window reached the end of the text; any further
                # window would only repeat part of its tail.
                break
        return chunks