Новый раг
This commit is contained in:
21
app/modules/rag/indexing/docs/chunkers/text_chunker.py
Normal file
21
app/modules/rag/indexing/docs/chunkers/text_chunker.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class DocTextChunker:
    """Split text into fixed-size character windows that overlap.

    Each chunk holds at most ``max_chars`` characters, and consecutive
    windows share ``overlap_chars`` characters so that content cut at a
    window boundary also appears at the start of the next chunk.
    """

    def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None:
        """Store and validate the window configuration.

        Args:
            max_chars: Maximum number of characters per chunk; must be >= 1.
            overlap_chars: Number of characters shared by consecutive
                windows; must satisfy ``0 <= overlap_chars < max_chars``.

        Raises:
            ValueError: If the configuration could not make forward progress
                (``max_chars < 1`` or ``overlap_chars >= max_chars``) or
                would silently skip text (``overlap_chars < 0``).
        """
        if max_chars < 1:
            raise ValueError(f"max_chars must be >= 1, got {max_chars}")
        if overlap_chars < 0:
            # A negative overlap would move the next window past the end of
            # the previous one and drop the characters in between.
            raise ValueError(f"overlap_chars must be >= 0, got {overlap_chars}")
        if overlap_chars >= max_chars:
            # The window start advances by max_chars - overlap_chars; a
            # non-positive step would loop forever in split().
            raise ValueError(
                "overlap_chars must be smaller than max_chars, "
                f"got overlap_chars={overlap_chars}, max_chars={max_chars}"
            )
        self._max_chars = max_chars
        self._overlap_chars = overlap_chars

    def split(self, text: str) -> list[str]:
        """Return the overlapping chunks of ``text``.

        Args:
            text: Input text; leading and trailing whitespace is ignored.

        Returns:
            Chunks in document order, each stripped of surrounding
            whitespace. Chunks that are empty after stripping are omitted,
            so blank input yields an empty list.
        """
        cleaned = text.strip()
        if not cleaned:
            return []

        total = len(cleaned)
        chunks: list[str] = []
        start = 0
        while start < total:
            end = min(total, start + self._max_chars)
            chunks.append(cleaned[start:end].strip())
            if end >= total:
                break
            # Step back by the overlap; __init__ guarantees this lands
            # strictly after the current start, so the loop terminates.
            start = end - self._overlap_chars
        return [chunk for chunk in chunks if chunk]
|
||||
Reference in New Issue
Block a user