58 lines
1.8 KiB
Python
58 lines
1.8 KiB
Python
from __future__ import annotations
|
|
|
|
import ast
|
|
from dataclasses import dataclass
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class CodeChunk:
|
|
title: str
|
|
text: str
|
|
start_line: int
|
|
end_line: int
|
|
chunk_type: str
|
|
|
|
|
|
class CodeTextChunker:
    """Split source text into chunks suitable for indexing.

    Text that parses as Python is split into one chunk per top-level class
    or function. Anything else (or Python with no top-level symbols) is
    split into fixed-size, overlapping windows of lines.
    """

    def chunk(self, path: str, text: str) -> list[CodeChunk]:
        """Return the chunks for *text*.

        Args:
            path: Label used as the prefix of every chunk title.
            text: Full source text to split.

        Returns:
            One ``"symbol_block"`` chunk per top-level class / function /
            async function (decorators included) when *text* parses as
            Python and defines at least one; otherwise the result of the
            line-window fallback.
        """
        try:
            tree = ast.parse(text)
        except (SyntaxError, ValueError, RecursionError):
            # SyntaxError: not valid Python.
            # ValueError: compile() is documented to raise it for source
            #   containing null bytes.
            # RecursionError: pathologically nested input can exhaust the
            #   recursion limit while the AST is being built.
            # In all cases the text is still chunkable as plain lines.
            return self._window_chunks(path, text)

        # Must split exactly where the parser does, otherwise the AST line
        # numbers below would index the wrong rows.
        lines = self._split_lines(text)
        chunks: list[CodeChunk] = []
        for node in tree.body:
            if not isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            # ``lineno`` is the line of the ``def`` / ``class`` keyword;
            # decorators sit above it and belong to the symbol.
            start = min([node.lineno, *(dec.lineno for dec in node.decorator_list)])
            end = node.end_lineno or node.lineno
            body = "\n".join(lines[start - 1 : end]).strip()
            if not body:
                continue
            chunks.append(
                CodeChunk(
                    title=f"{path}:{node.name}",
                    text=body,
                    start_line=start,
                    end_line=end,
                    chunk_type="symbol_block",
                )
            )
        return chunks or self._window_chunks(path, text)

    def _window_chunks(self, path: str, text: str) -> list[CodeChunk]:
        """Split *text* into 80-line windows that overlap by 15 lines.

        Windows whose text is empty after stripping whitespace are skipped.
        Each chunk is titled ``"<path>:<first>-<last>"`` with 1-based,
        inclusive line numbers and has ``chunk_type`` ``"window"``.
        """
        lines = self._split_lines(text)
        total = len(lines)
        size = 80
        overlap = 15
        chunks: list[CodeChunk] = []
        # Consecutive windows start ``size - overlap`` lines apart.
        for start in range(0, total, size - overlap):
            end = min(total, start + size)
            body = "\n".join(lines[start:end]).strip()
            if body:
                chunks.append(CodeChunk(f"{path}:{start + 1}-{end}", body, start + 1, end, "window"))
            if end >= total:
                # This window already reached the last line; a further one
                # would only repeat the tail.
                break
        return chunks

    @staticmethod
    def _split_lines(text: str) -> list[str]:
        """Split *text* into lines on LF, CR LF and lone CR only.

        ``str.splitlines()`` additionally breaks on form feed, vertical tab,
        U+2028 and similar characters, which the Python parser does not
        treat as line terminators; using it would shift every following
        line relative to AST line numbers. Like ``splitlines()``, a single
        trailing line terminator does not produce an extra empty line.
        """
        lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
        if lines[-1] == "":
            lines.pop()
        return lines
|