Новый раг
This commit is contained in:
57
app/modules/rag/indexing/code/code_text/chunker.py
Normal file
57
app/modules/rag/indexing/code/code_text/chunker.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class CodeChunk:
|
||||
title: str
|
||||
text: str
|
||||
start_line: int
|
||||
end_line: int
|
||||
chunk_type: str
|
||||
|
||||
|
||||
class CodeTextChunker:
    """Split Python source text into chunks for indexing.

    Every top-level class and function becomes one ``"symbol_block"`` chunk.
    Text that cannot be parsed, or that defines no top-level class or
    function, is cut into overlapping fixed-size line windows instead.
    """

    # Geometry of the line-window fallback, measured in lines.
    WINDOW_SIZE = 80
    WINDOW_OVERLAP = 15

    def chunk(self, path: str, text: str) -> list[CodeChunk]:
        """Return the chunks for ``text``.

        Args:
            path: Label used as the prefix of every chunk title.
            text: Full source text of the file.

        Returns:
            One chunk per top-level class/function (decorators included), or
            line-window chunks when parsing fails or no such symbol exists.
            Module-level statements outside those symbols are not emitted
            when at least one symbol chunk is produced.
        """
        try:
            tree = ast.parse(text)
        except (SyntaxError, ValueError, RecursionError):
            # ValueError: NUL bytes in the source on older interpreters.
            # RecursionError: pathologically nested code. Either way one
            # unparsable file should degrade to windows, not abort indexing.
            return self._window_chunks(path, text)

        # Must split exactly like the parser does, otherwise node line
        # numbers would index into the wrong lines.
        lines = self._split_lines(text)
        chunks: list[CodeChunk] = []
        for node in tree.body:
            if not isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            # ``lineno`` points at the ``def``/``class`` keyword; decorators
            # sit above it and belong to the same symbol.
            start = min([node.lineno, *(dec.lineno for dec in node.decorator_list)])
            end = node.end_lineno or node.lineno
            body = "\n".join(lines[start - 1 : end]).strip()
            if not body:
                continue
            chunks.append(
                CodeChunk(
                    title=f"{path}:{node.name}",
                    text=body,
                    start_line=start,
                    end_line=end,
                    chunk_type="symbol_block",
                )
            )
        return chunks or self._window_chunks(path, text)

    @staticmethod
    def _split_lines(text: str) -> list[str]:
        """Split ``text`` on the line breaks the Python parser recognises.

        Only ``\\n``, ``\\r\\n`` and ``\\r`` end a line. ``str.splitlines`` also
        breaks on form feed, ``\\u2028`` and similar characters, which would
        shift every later line relative to the parser's numbering.
        """
        lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
        # A trailing newline terminates the last line; it does not start a new one.
        if lines and lines[-1] == "":
            lines.pop()
        return lines

    def _window_chunks(self, path: str, text: str) -> list[CodeChunk]:
        """Cut ``text`` into overlapping windows of ``WINDOW_SIZE`` lines.

        Consecutive windows share ``WINDOW_OVERLAP`` lines. Windows that are
        empty after stripping whitespace are skipped. Titles have the form
        ``"<path>:<first line>-<last line>"`` with 1-based inclusive numbers.
        """
        lines = self._split_lines(text)
        total = len(lines)
        size = self.WINDOW_SIZE
        # Always advance by at least one line so a misconfigured overlap
        # can never stall the loop.
        step = max(1, size - self.WINDOW_OVERLAP)

        chunks: list[CodeChunk] = []
        for start in range(0, total, step):
            end = min(total, start + size)
            body = "\n".join(lines[start:end]).strip()
            if body:
                chunks.append(CodeChunk(f"{path}:{start + 1}-{end}", body, start + 1, end, "window"))
            if end >= total:
                # The last window already reaches the end of the text.
                break
        return chunks
|
||||
Reference in New Issue
Block a user