первый коммит
This commit is contained in:
0
app/modules/rag_session/retrieval/__init__.py
Normal file
0
app/modules/rag_session/retrieval/__init__.py
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
20
app/modules/rag_session/retrieval/chunker.py
Normal file
20
app/modules/rag_session/retrieval/chunker.py
Normal file
@@ -0,0 +1,20 @@
|
||||
class TextChunker:
    """Split text into fixed-size character windows that overlap.

    Consecutive windows share ``overlap`` characters, so content cut at a
    window boundary still appears intact in one of the neighbouring chunks.
    """

    def __init__(self, chunk_size: int = 900, overlap: int = 120) -> None:
        """Configure the window geometry.

        Args:
            chunk_size: Maximum number of characters per chunk (before
                surrounding whitespace is stripped). Must be positive.
            overlap: Number of characters shared between consecutive
                chunks. Must satisfy ``0 <= overlap < chunk_size``.

        Raises:
            ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
                negative or not smaller than ``chunk_size``.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        # overlap >= chunk_size would stop the window from ever advancing
        # (an endless loop in chunk()); a negative overlap would silently
        # skip text between chunks.
        if not 0 <= overlap < chunk_size:
            raise ValueError(
                f"overlap must be in [0, chunk_size), got overlap={overlap} "
                f"with chunk_size={chunk_size}"
            )
        self._chunk_size = chunk_size
        self._overlap = overlap

    def chunk(self, text: str) -> list[str]:
        """Return the overlapping chunks of ``text``.

        Windows line endings (``\\r\\n``) are normalised to ``\\n`` first.
        Each chunk is stripped of leading/trailing whitespace, and chunks
        that are empty after stripping are dropped.

        Args:
            text: The text to split.

        Returns:
            The chunks in document order; an empty list when ``text`` is
            empty or contains only whitespace.
        """
        cleaned = text.replace("\r\n", "\n")
        if not cleaned.strip():
            return []

        total = len(cleaned)
        # The constructor guarantees 0 <= overlap < chunk_size, so the
        # window start moves forward by at least one character per pass.
        step = self._chunk_size - self._overlap

        chunks: list[str] = []
        start = 0
        while start < total:
            end = min(total, start + self._chunk_size)
            piece = cleaned[start:end].strip()
            if piece:
                chunks.append(piece)
            if end == total:
                # The last window reached the end of the text; stopping here
                # avoids emitting a tail that only repeats the overlap.
                break
            start += step
        return chunks
|
||||
12
app/modules/rag_session/retrieval/scoring.py
Normal file
12
app/modules/rag_session/retrieval/scoring.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import math
|
||||
|
||||
|
||||
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Args:
        a: First vector.
        b: Second vector; must have the same length as ``a``.

    Returns:
        The cosine of the angle between the vectors, within ``[-1.0, 1.0]``.
        ``-1.0`` is also returned as the "not comparable" value when either
        vector is empty, the lengths differ, or either vector has zero norm.
    """
    if not a or not b or len(a) != len(b):
        return -1.0
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    if norm_a == 0 or norm_b == 0:
        return -1.0
    similarity = dot / (norm_a * norm_b)
    # Floating-point rounding can push the ratio a hair outside [-1, 1]
    # (e.g. 1.0000000000000002 for identical vectors), which breaks callers
    # that feed the result to math.acos or compare against 1.0. Explicit
    # comparisons are used so a NaN result passes through unchanged.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity
|
||||
Reference in New Issue
Block a user