Files
agent/app/modules/rag/indexing/code/code_text/chunker.py
2026-03-01 14:21:33 +03:00

58 lines
1.8 KiB
Python

from __future__ import annotations
import ast
from dataclasses import dataclass
@dataclass(slots=True)
class CodeChunk:
title: str
text: str
start_line: int
end_line: int
chunk_type: str
class CodeTextChunker:
def chunk(self, path: str, text: str) -> list[CodeChunk]:
try:
tree = ast.parse(text)
except SyntaxError:
return self._window_chunks(path, text)
chunks: list[CodeChunk] = []
lines = text.splitlines()
for node in tree.body:
if not isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
continue
start = int(getattr(node, "lineno", 1))
end = int(getattr(node, "end_lineno", start))
body = "\n".join(lines[start - 1 : end]).strip()
if not body:
continue
chunks.append(
CodeChunk(
title=f"{path}:{getattr(node, 'name', 'block')}",
text=body,
start_line=start,
end_line=end,
chunk_type="symbol_block",
)
)
return chunks or self._window_chunks(path, text)
def _window_chunks(self, path: str, text: str) -> list[CodeChunk]:
lines = text.splitlines()
chunks: list[CodeChunk] = []
size = 80
overlap = 15
start = 0
while start < len(lines):
end = min(len(lines), start + size)
body = "\n".join(lines[start:end]).strip()
if body:
chunks.append(CodeChunk(f"{path}:{start + 1}-{end}", body, start + 1, end, "window"))
if end >= len(lines):
break
start = max(0, end - overlap)
return chunks