первый коммит

This commit is contained in:
2026-02-27 21:28:09 +03:00
parent 43c404f958
commit 1bc57a7c25
171 changed files with 6400 additions and 556 deletions

View File

@@ -0,0 +1,20 @@
class TextChunker:
    """Split text into fixed-size character windows that overlap.

    Each chunk holds at most ``chunk_size`` characters of the input, and
    consecutive chunks share ``overlap`` characters. Chunks are stripped of
    surrounding whitespace, and windows that are whitespace-only are dropped.
    """

    def __init__(self, chunk_size: int = 900, overlap: int = 120) -> None:
        """Store the window configuration.

        Args:
            chunk_size: Maximum number of characters per window; must be
                positive.
            overlap: Number of characters shared between consecutive
                windows. Values of ``chunk_size`` or more are effectively
                capped at ``chunk_size - 1`` when chunking. A negative value
                leaves a gap of that many characters between windows.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            # A non-positive window can never consume the text, so chunk()
            # would loop forever on any non-blank input.
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        self._chunk_size = chunk_size
        self._overlap = overlap

    def chunk(self, text: str) -> list[str]:
        """Return the overlapping chunks of *text*.

        Windows line endings (``\\r\\n``) are normalised to ``\\n`` before
        splitting. Blank or whitespace-only input yields an empty list.
        """
        cleaned = text.replace("\r\n", "\n")
        if not cleaned.strip():
            return []

        total = len(cleaned)
        # Always advance by at least one character: with
        # overlap >= chunk_size the window would otherwise never move and
        # the loop would not terminate.
        step = max(1, self._chunk_size - self._overlap)

        chunks: list[str] = []
        start = 0
        while start < total:
            end = min(total, start + self._chunk_size)
            piece = cleaned[start:end].strip()
            if piece:
                chunks.append(piece)
            if end == total:
                # The final window reached the end of the text; stop here so
                # no redundant trailing chunk is emitted from the overlap.
                break
            start += step
        return chunks

View File

@@ -0,0 +1,12 @@
import math
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    The result is clamped to the closed interval ``[-1.0, 1.0]``.

    Returns ``-1.0`` as a sentinel when the similarity is undefined: either
    vector is empty, the lengths differ, or either vector has zero norm.
    """
    if not a or not b or len(a) != len(b):
        return -1.0

    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    if norm_a == 0 or norm_b == 0:
        return -1.0

    similarity = dot / (norm_a * norm_b)
    # Floating-point rounding can push the quotient marginally outside
    # [-1, 1], which would break callers that feed it to math.acos or compare
    # against 1.0. Explicit comparisons (rather than min/max) are used so a
    # NaN result still propagates instead of being turned into a bound.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity