103 lines
3.8 KiB
Python
103 lines
3.8 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
from app.modules.rag.explain.models import ExplainHints, ExplainIntent
|
|
from app.modules.rag.retrieval.query_terms import extract_query_terms
|
|
|
|
|
|
class ExplainIntentBuilder:
    """Turn a free-form "explain ..." query into a structured ``ExplainIntent``.

    The builder is purely heuristic: it normalizes whitespace, pulls file
    paths, symbols, route-like strings and command names out of the text with
    regular expressions, and derives a few flags (tests wanted, entry-point
    kind, level of detail) from marker substrings in the lowercased query.
    """

    # Anything that starts with "/" followed by path-like characters.
    # NOTE(review): this also matches the tail of a relative file path
    # ("app/main.py" -> "/main.py"); confirm whether that is intended.
    _ROUTE_RE = re.compile(r"(/[A-Za-z0-9_./{}:-]+)")
    # Python file paths such as "pkg/module.py".
    _FILE_RE = re.compile(r"([A-Za-z0-9_./-]+\.py)")
    # "Class.member", Capitalized identifiers, or lower/underscore identifiers
    # (each at least three characters long).
    _SYMBOL_RE = re.compile(r"\b([A-Z][A-Za-z0-9_]*\.[A-Za-z_][A-Za-z0-9_]*|[A-Z][A-Za-z0-9_]{2,}|[a-z_][A-Za-z0-9_]{2,})\b")
    # Command names quoted in backticks.
    _COMMAND_RE = re.compile(r"`([A-Za-z0-9:_-]+)`")
    # The token following the whole word "command" / "cli". The leading ``\b``
    # keeps words that merely end in those letters (e.g. "subcommand") from
    # contributing a bogus command name.
    _COMMAND_WORD_RE = re.compile(r"\bcommand\s+([A-Za-z0-9:_-]+)")
    _CLI_WORD_RE = re.compile(r"\bcli\s+([A-Za-z0-9:_-]+)")

    # Substrings whose presence means the user asks about tests.
    _TEST_KEYWORDS = (
        "тест",  # Russian stem for "test"
        "tests",
        "test ",
        "unit-test",
        "unit test",
        "юнит-тест",  # Russian: "unit test"
        "pytest",
        "spec",
        "как покрыто тестами",  # Russian: "how is it covered by tests"
        "как проверяется",  # Russian: "how is it verified"
        "how is it tested",
        "how it's tested",
    )

    # Marker substrings used by ``_entry_types``.
    _HTTP_MARKERS = ("endpoint", "route", "handler", "http", "api")
    _CLI_MARKERS = ("cli", "command", "click", "typer")

    # Marker substrings used by ``_depth``; the Russian entries are stems for
    # "detailed" / "detail" and the word "briefly".
    _DEEP_MARKERS = ("deep", "подроб", "деталь", "full flow", "trace")
    _HIGH_MARKERS = ("high level", "overview", "кратко", "summary")

    def build(self, user_query: str) -> ExplainIntent:
        """Build an ``ExplainIntent`` from the raw user query.

        Args:
            user_query: Query text as typed by the user. A falsy value is
                treated as an empty string for analysis but is still stored
                unchanged in ``raw_query``.

        Returns:
            An ``ExplainIntent`` carrying the raw and whitespace-normalized
            query, at most 12 keywords, the extracted hints and the derived
            ``include_tests`` / ``expected_entry_types`` / ``depth`` values.
        """
        # Collapse every run of whitespace into a single space.
        normalized = " ".join((user_query or "").split())
        lowered = normalized.lower()
        keywords = self._keywords(normalized)
        hints = ExplainHints(
            paths=self._dedupe(self._FILE_RE.findall(normalized)),
            symbols=self._symbols(normalized),
            endpoints=self._dedupe(self._ROUTE_RE.findall(normalized)),
            commands=self._commands(normalized, lowered),
        )
        return ExplainIntent(
            raw_query=user_query,
            normalized_query=normalized,
            keywords=keywords[:12],
            hints=hints,
            include_tests=self._include_tests(lowered),
            expected_entry_types=self._entry_types(lowered, hints),
            depth=self._depth(lowered),
        )

    def _keywords(self, text: str) -> list[str]:
        """Return query terms followed by symbols and routes, without duplicates.

        The terms come from ``extract_query_terms``; symbols and route-like
        strings found in ``text`` are appended after them in that order.
        """
        # Work on a copy so the list returned by the helper is never mutated.
        keywords = list(extract_query_terms(text))
        keywords.extend(self._symbols(text))
        keywords.extend(self._ROUTE_RE.findall(text))
        # ``_dedupe`` keeps the first occurrence, so no membership pre-checks
        # are needed while collecting.
        return self._dedupe(keywords)

    def _symbols(self, text: str) -> list[str]:
        """Return unique identifier-like tokens from ``text`` in order of appearance.

        Tokens shorter than three characters and tokens ending in ``.py``
        (file names rather than symbols) are skipped.
        """
        tokens = (raw.strip() for raw in self._SYMBOL_RE.findall(text))
        return self._dedupe(
            [token for token in tokens if len(token) >= 3 and not token.endswith(".py")]
        )

    def _commands(self, text: str, lowered: str) -> list[str]:
        """Return unique command names mentioned in the query.

        Args:
            text: Whitespace-normalized query; backtick-quoted names are taken
                from here, so their case is preserved.
            lowered: Lowercased form of ``text``; the token following the
                standalone word "command" or "cli" is taken from here, so
                those names are lowercase.
        """
        values = list(self._COMMAND_RE.findall(text))
        # Pad with spaces so the word is also found at the very start or end.
        padded = f" {lowered} "
        if " command " in padded:
            values.extend(self._COMMAND_WORD_RE.findall(lowered))
        if " cli " in padded:
            values.extend(self._CLI_WORD_RE.findall(lowered))
        return self._dedupe(values)

    def _entry_types(self, lowered: str, hints: ExplainHints) -> list[str]:
        """Guess which kinds of entry point the query is about.

        Returns ``["http"]`` when endpoints were extracted or an HTTP marker
        occurs in ``lowered``; otherwise ``["cli"]`` when commands were
        extracted or a CLI marker occurs; otherwise both kinds. HTTP takes
        precedence when both apply.

        NOTE(review): markers are matched as plain substrings, so e.g. "api"
        also fires inside "capital".
        """
        if hints.endpoints or any(marker in lowered for marker in self._HTTP_MARKERS):
            return ["http"]
        if hints.commands or any(marker in lowered for marker in self._CLI_MARKERS):
            return ["cli"]
        return ["http", "cli"]

    def _depth(self, lowered: str) -> str:
        """Return ``"deep"``, ``"high"`` or ``"medium"`` based on marker substrings.

        "Deep" markers are checked first, so they win when both kinds occur;
        ``"medium"`` is the fallback when no marker is present.
        """
        if any(marker in lowered for marker in self._DEEP_MARKERS):
            return "deep"
        if any(marker in lowered for marker in self._HIGH_MARKERS):
            return "high"
        return "medium"

    def _include_tests(self, lowered: str) -> bool:
        """Return True when the lowercased query contains any test keyword.

        NOTE(review): matching is plain substring containment, so short
        markers such as "spec" also fire inside longer words ("specific").
        """
        # Pad with spaces so that "test " also matches at the end of the query.
        padded = f" {lowered} "
        return any(keyword in padded for keyword in self._TEST_KEYWORDS)

    def _dedupe(self, values: list[str]) -> list[str]:
        """Strip each value, drop empty ones and remove duplicates.

        The first occurrence of every value keeps its position.
        """
        stripped = (value.strip() for value in values)
        # dict keys preserve insertion order, giving an order-preserving
        # de-duplication in linear time.
        return list(dict.fromkeys(item for item in stripped if item))
|