Фиксация изменений

This commit is contained in:
2026-03-05 11:03:17 +03:00
parent 1ef0b4d68c
commit 417b8b6f72
261 changed files with 8215 additions and 332 deletions

View File

@@ -0,0 +1,23 @@
from app.modules.rag.intent_router_v2.factory import GigaChatIntentRouterFactory
from app.modules.rag.intent_router_v2.local_runner import IntentRouterScenarioRunner
from app.modules.rag.intent_router_v2.models import (
ConversationState,
IntentDecision,
IntentRouterResult,
QueryAnchor,
QueryPlan,
RepoContext,
)
from app.modules.rag.intent_router_v2.router import IntentRouterV2
# Public API of the intent_router_v2 package; kept sorted alphabetically.
__all__ = [
    "ConversationState",
    "GigaChatIntentRouterFactory",
    "IntentDecision",
    "IntentRouterResult",
    "IntentRouterScenarioRunner",
    "IntentRouterV2",
    "QueryAnchor",
    "QueryPlan",
    "RepoContext",
]

View File

@@ -0,0 +1,144 @@
from __future__ import annotations
import re
from app.modules.rag.intent_router_v2.models import AnchorSpan, QueryAnchor
from app.modules.rag.intent_router_v2.normalization_terms import KeyTermCanonicalizer
from app.modules.rag.intent_router_v2.symbol_rules import COMMON_PATH_SEGMENTS, PY_KEYWORDS
from app.modules.rag.intent_router_v2.term_mapping import RuEnTermMapper
# Explicit file names with a known extension, optionally directory-qualified.
_FILE_PATTERN = re.compile(r"(?P<value>\b(?:[\w.-]+/)*[\w.-]+\.(?:py|md|rst|txt|yaml|yml|json|toml|ini|cfg)\b)")
# Bare directory-rooted hints such as "src/foo" (no file extension required).
_PATH_HINT_PATTERN = re.compile(r"(?P<value>\b(?:src|app|docs|tests)/[\w./-]*[\w-]\b)")
# Dotted identifiers, CapWords tokens, and lowercase identifiers of length >= 3.
_SYMBOL_PATTERN = re.compile(
    r"\b(?P<value>[A-Z][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)+|[A-Z][A-Za-z0-9_]{2,}|[a-z_][A-Za-z0-9_]{2,})\b"
)
# "section <name>" / "раздел <name>" references used for docs questions.
_DOC_SECTION_PATTERN = re.compile(r"(?:section|раздел)\s+[\"'`#]?(?P<value>[A-Za-zА-Яа-я0-9_ ./:-]{2,})", re.IGNORECASE)
# Latin/Cyrillic word tokens scanned for key terms.
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё-]+")
class AnchorExtractor:
    """Extracts FILE_PATH, SYMBOL, DOC_REF and KEY_TERM anchors from raw text."""

    def __init__(
        self,
        mapper: RuEnTermMapper | None = None,
        canonicalizer: KeyTermCanonicalizer | None = None,
    ) -> None:
        self._mapper = mapper or RuEnTermMapper()
        self._canonicalizer = canonicalizer or KeyTermCanonicalizer()

    def extract(self, text: str) -> list[QueryAnchor]:
        """Return deduplicated anchors of all four types found in *text*.

        File anchors are extracted first so symbol extraction can skip tokens
        that are actually path segments or lie inside a matched path.
        """
        anchors = self._file_anchors(text)
        anchors.extend(self._symbol_anchors(text, file_anchors=anchors))
        anchors.extend(self._doc_ref_anchors(text))
        anchors.extend(self._key_term_anchors(text))
        return self._dedupe(anchors)

    def _file_anchors(self, text: str) -> list[QueryAnchor]:
        # Exact file names score higher (0.95) than bare directory hints (0.8).
        anchors = self._anchors_from_matches(_FILE_PATTERN.finditer(text), anchor_type="FILE_PATH", confidence=0.95)
        anchors.extend(self._anchors_from_matches(_PATH_HINT_PATTERN.finditer(text), anchor_type="FILE_PATH", confidence=0.8))
        return anchors

    def _symbol_anchors(self, text: str, *, file_anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Identifier-like tokens that are not paths, keywords or path segments."""
        anchors: list[QueryAnchor] = []
        path_ranges = [(anchor.span.start, anchor.span.end) for anchor in file_anchors if anchor.span is not None]
        path_segments = self._path_segments(file_anchors)
        for match in _SYMBOL_PATTERN.finditer(text):
            value = match.group("value")
            # A file-looking token or anything with a slash is not a symbol.
            if value.endswith((".py", ".md")) or "/" in value:
                continue
            if self._is_inside_path(match.start("value"), match.end("value"), path_ranges):
                continue
            if self._is_keyword(value):
                continue
            # Skip tokens that merely repeat a segment of an already-found path.
            if file_anchors and value.lower() in path_segments:
                continue
            anchors.append(self._anchor("SYMBOL", value, match.start("value"), match.end("value"), 0.88, source="user_text"))
        return anchors

    def _doc_ref_anchors(self, text: str) -> list[QueryAnchor]:
        """Section references plus documentation files (.md/.rst/.txt)."""
        anchors = self._anchors_from_matches(_DOC_SECTION_PATTERN.finditer(text), anchor_type="DOC_REF", confidence=0.75, subtype="section")
        for match in _FILE_PATTERN.finditer(text):
            value = match.group("value")
            if not value.lower().endswith((".md", ".rst", ".txt")):
                continue
            # Doc files are emitted both as FILE_PATH and as DOC_REF by design.
            anchors.append(self._anchor("DOC_REF", value, match.start("value"), match.end("value"), 0.92, subtype="file", source="user_text"))
        return anchors

    def _key_term_anchors(self, text: str) -> list[QueryAnchor]:
        """Domain key terms: canonicalizable words or known literal terms."""
        literals = set(self._mapper.all_literal_terms())
        anchors: list[QueryAnchor] = []
        for token in _WORD_RE.finditer(text):
            value = token.group(0)
            normalized = value.lower()
            canonical = self._canonicalizer.canonicalize(value)
            if canonical is None and normalized not in literals:
                continue
            anchors.append(
                self._anchor(
                    "KEY_TERM",
                    # Prefer the canonical form; fall back to the raw token.
                    canonical or value,
                    token.start(),
                    token.end(),
                    0.9,
                    source="user_text",
                )
            )
        return anchors

    def _anchors_from_matches(
        self,
        matches,  # iterator of re.Match objects with a "value" named group
        *,
        anchor_type: str,
        confidence: float,
        subtype: str | None = None,
    ) -> list[QueryAnchor]:
        return [
            self._anchor(anchor_type, match.group("value"), match.start("value"), match.end("value"), confidence, subtype=subtype)
            for match in matches
        ]

    def _anchor(
        self,
        anchor_type: str,
        value: str,
        start: int,
        end: int,
        confidence: float,
        subtype: str | None = None,
        source: str = "user_text",
    ) -> QueryAnchor:
        return QueryAnchor(
            type=anchor_type,
            value=value,
            subtype=subtype,
            source=source,
            span=AnchorSpan(start=start, end=end),
            confidence=confidence,
        )

    def _dedupe(self, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        # First occurrence wins; spans/confidence of later duplicates are dropped.
        result: list[QueryAnchor] = []
        seen: set[tuple[str, str, str | None, str]] = set()
        for anchor in anchors:
            key = (anchor.type, anchor.value, anchor.subtype, anchor.source)
            if key in seen:
                continue
            seen.add(key)
            result.append(anchor)
        return result

    def _is_inside_path(self, start: int, end: int, ranges: list[tuple[int, int]]) -> bool:
        return any(start >= left and end <= right for left, right in ranges)

    def _is_keyword(self, token: str) -> bool:
        return token.lower() in PY_KEYWORDS

    def _path_segments(self, anchors: list[QueryAnchor]) -> set[str]:
        """Lowercased segments of every anchor value, plus common segments."""
        values: set[str] = set()
        for anchor in anchors:
            parts = re.split(r"[/.]+", anchor.value.lower())
            for part in parts:
                if not part:
                    continue
                values.add(part)
        return values | COMMON_PATH_SEGMENTS

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.models import QueryAnchor
class AnchorSpanValidator:
    """Drops or repairs anchor spans that do not fit the raw query text."""

    def sanitize(self, anchors: list[QueryAnchor], raw_len: int) -> list[QueryAnchor]:
        """Return anchors whose spans are valid offsets into the raw text.

        Non user-text anchors lose their span entirely; user-text anchors with
        an out-of-bounds span lose the span and half of their confidence.
        """
        return [self._sanitize_one(anchor, raw_len) for anchor in anchors]

    def _sanitize_one(self, anchor: QueryAnchor, raw_len: int) -> QueryAnchor:
        if anchor.source != "user_text":
            # Carried-over anchors never point into the current raw text.
            return anchor.model_copy(update={"span": None})
        span = anchor.span
        if span is None:
            return anchor
        if 0 <= int(span.start) < int(span.end) <= raw_len:
            return anchor
        # Invalid span: keep the anchor but penalize its confidence.
        penalized = max(anchor.confidence * 0.5, 0.0)
        return anchor.model_copy(update={"span": None, "confidence": penalized})

View File

@@ -0,0 +1,113 @@
from __future__ import annotations
import json
import re
from app.modules.rag.intent_router_v2.models import ConversationState, IntentDecision
from app.modules.rag.intent_router_v2.protocols import TextGenerator
from app.modules.rag.intent_router_v2.test_signals import has_test_focus
# Source-file paths for common programming languages (not docs/config files).
_CODE_FILE_PATH_RE = re.compile(
    r"\b(?:[\w.-]+/)*[\w.-]+\.(?:py|js|jsx|ts|tsx|java|kt|go|rb|php|c|cc|cpp|h|hpp|cs|swift|rs)(?!\w)\b",
    re.IGNORECASE,
)


class IntentClassifierV2:
    """Classifies a user query into one of four router intents.

    Deterministic marker checks run first; the LLM is consulted only when
    the heuristics are inconclusive, and PROJECT_MISC is the final fallback.
    """

    _GENERATE_DOCS_MARKERS = (
        "сгенерируй документац",
        "подготовь документац",
        "создай документац",
        "генерац",
        "generate documentation",
        "write documentation",
    )
    _DOCS_MARKERS = ("документац", "readme", "docs/", ".md", "spec", "runbook", "markdown")
    _CODE_MARKERS = ("по коду", "код", "класс", "метод", "функц", "модул", "пакет", "файл", "block", "блок", "handler", "endpoint")

    def __init__(self, llm: TextGenerator | None = None) -> None:
        self._llm = llm

    def classify(self, user_query: str, conversation_state: ConversationState) -> IntentDecision:
        """Return an IntentDecision; never raises on LLM or parsing failure."""
        deterministic = self._deterministic(user_query)
        if deterministic:
            return deterministic
        llm_decision = self._classify_with_llm(user_query, conversation_state)
        if llm_decision:
            return llm_decision
        return IntentDecision(intent="PROJECT_MISC", confidence=0.55, reason="fallback_project_misc")

    def _deterministic(self, user_query: str) -> IntentDecision | None:
        """Marker-based classification; None when no rule fires."""
        text = " ".join((user_query or "").lower().split())
        if any(marker in text for marker in self._GENERATE_DOCS_MARKERS):
            return IntentDecision(intent="GENERATE_DOCS_FROM_CODE", confidence=0.97, reason="deterministic_generate_docs")
        if self._looks_like_docs_question(text):
            return IntentDecision(intent="DOCS_QA", confidence=0.9, reason="deterministic_docs")
        if self._looks_like_code_question(user_query, text):
            return IntentDecision(intent="CODE_QA", confidence=0.9, reason="deterministic_code")
        return None

    def _classify_with_llm(self, user_query: str, conversation_state: ConversationState) -> IntentDecision | None:
        """Ask the LLM for an intent; best-effort, returns None on any failure."""
        if self._llm is None:
            return None
        payload = json.dumps(
            {
                "message": user_query,
                "active_intent": conversation_state.active_intent,
                "last_query": conversation_state.last_query,
                "allowed_intents": ["CODE_QA", "DOCS_QA", "GENERATE_DOCS_FROM_CODE", "PROJECT_MISC"],
            },
            ensure_ascii=False,
        )
        try:
            raw = self._llm.generate("rag_intent_router_v2", payload, log_context="rag.intent_router_v2.classify").strip()
        except Exception:
            # Any transport/LLM error falls back to the caller's default intent.
            return None
        return self._parse(raw)

    def _parse(self, raw: str) -> IntentDecision | None:
        """Parse the LLM JSON reply into an IntentDecision; None when invalid."""
        candidate = self._strip_code_fence(raw)
        try:
            payload = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        # The model may legally return a JSON scalar or array; only objects count.
        if not isinstance(payload, dict):
            return None
        intent = str(payload.get("intent") or "").strip().upper()
        if intent not in {"CODE_QA", "DOCS_QA", "GENERATE_DOCS_FROM_CODE", "PROJECT_MISC"}:
            return None
        try:
            confidence = float(payload.get("confidence") or 0.7)
        except (TypeError, ValueError):
            # A malformed confidence must not discard an otherwise valid intent.
            confidence = 0.7
        return IntentDecision(
            intent=intent,
            confidence=confidence,
            reason=str(payload.get("reason") or "llm").strip() or "llm",
        )

    def _strip_code_fence(self, text: str) -> str:
        """Remove a surrounding ``` fence (with optional language tag) if present."""
        if not text.startswith("```"):
            return text
        lines = text.splitlines()
        if len(lines) < 3 or lines[-1].strip() != "```":
            return text
        return "\n".join(lines[1:-1]).strip()

    def _looks_like_docs_question(self, text: str) -> bool:
        # A code file path overrides any docs marker.
        if self._has_code_file_path(text):
            return False
        return any(marker in text for marker in self._DOCS_MARKERS)

    def _looks_like_code_question(self, raw_text: str, lowered: str) -> bool:
        if self._has_code_file_path(raw_text):
            return True
        if has_test_focus(lowered):
            return True
        # Docs markers without any code marker: definitely not a code question.
        if any(marker in lowered for marker in self._DOCS_MARKERS) and not any(marker in lowered for marker in self._CODE_MARKERS):
            return False
        if any(marker in lowered for marker in self._CODE_MARKERS):
            return True
        # CapWords / dotted identifiers suggest code even without markers.
        if re.search(r"\b[A-Z][A-Za-z0-9_]{2,}(?:\.[A-Za-z_][A-Za-z0-9_]*)*\b", raw_text or ""):
            return True
        # A call-looking token like "do_thing(" is the last code signal.
        return bool(re.search(r"\b[a-z_][A-Za-z0-9_]{2,}\(", raw_text or ""))

    def _has_code_file_path(self, text: str) -> bool:
        return bool(_CODE_FILE_PATH_RE.search(text or ""))

View File

@@ -0,0 +1,61 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.followup_detector import FollowUpDetector
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor
class ConversationAnchorBuilder:
    """Carries file-path and symbol anchors over from the previous turn."""

    def __init__(self, followup: FollowUpDetector | None = None) -> None:
        self._followup = followup or FollowUpDetector()

    def build(
        self,
        raw: str,
        state: ConversationState,
        *,
        continue_mode: bool,
        has_user_symbol: bool,
        has_user_file_path: bool,
    ) -> list[QueryAnchor]:
        """Return carry-over anchors for a CONTINUE turn.

        Explicit user anchors win: a user file path suppresses the path
        carry-over, and a user symbol suppresses the symbol carry-over.
        """
        if not continue_mode:
            return []
        carried: list[QueryAnchor] = []
        if has_user_file_path:
            return carried
        carried.extend(
            QueryAnchor(
                type="FILE_PATH",
                value=path,
                source="conversation_state",
                span=None,
                confidence=0.6,
            )
            for path in self._paths_for_carryover(state.active_path_scope)
        )
        if has_user_symbol or not self._followup.is_follow_up(raw):
            return carried
        code_symbols = state.active_code_span_symbols
        carried_symbol = state.active_symbol or (code_symbols[0] if code_symbols else None)
        if carried_symbol:
            carried.append(
                QueryAnchor(
                    type="SYMBOL",
                    value=carried_symbol,
                    source="conversation_state",
                    span=None,
                    confidence=0.64,
                )
            )
        return carried

    def _paths_for_carryover(self, active_path_scope: list[str]) -> list[str]:
        """Pick at most one path, preferring a concrete file over a directory."""
        candidates = list(active_path_scope or [])
        files_only = [item for item in candidates if self._looks_like_file(item)]
        return files_only[:1] if files_only else candidates[:1]

    def _looks_like_file(self, value: str) -> bool:
        # A dot in the final segment is treated as a file extension.
        return "." in (value or "").rsplit("/", 1)[-1]

View File

@@ -0,0 +1,45 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.models import ConversationState, IntentDecision
class ConversationPolicy:
    """Decides whether a turn continues the active intent or switches away."""

    _SWITCH_MARKERS = (
        "теперь",
        "а теперь",
        "давай теперь",
        "переключ",
        "new task",
        "switch to",
        "instead",
    )
    _DOCS_SIGNALS = ("документац", "readme", "docs/", ".md")
    _CODE_SIGNALS = ("по коду", "класс", "метод", "файл", "блок кода", "function", "class")

    def resolve(self, decision: IntentDecision, user_query: str, conversation_state: ConversationState) -> tuple[str, str]:
        """Return (intent, mode) where mode is START, CONTINUE or SWITCH."""
        current = conversation_state.active_intent
        if current is None:
            return decision.intent, "START"
        if current == decision.intent:
            return current, "CONTINUE"
        should_switch = self._has_explicit_switch(user_query) or self._is_hard_mismatch(
            current, decision.intent, user_query
        )
        return (decision.intent, "SWITCH") if should_switch else (current, "CONTINUE")

    def _has_explicit_switch(self, user_query: str) -> bool:
        squashed = self._squash(user_query)
        return any(marker in squashed for marker in self._SWITCH_MARKERS)

    def _is_hard_mismatch(self, active_intent: str, candidate_intent: str, user_query: str) -> bool:
        """True when the candidate intent clearly contradicts the active one."""
        if active_intent == candidate_intent:
            return False
        squashed = self._squash(user_query)
        if candidate_intent == "GENERATE_DOCS_FROM_CODE":
            # Generation requests always override the active conversation.
            return True
        if candidate_intent == "DOCS_QA":
            return any(signal in squashed for signal in self._DOCS_SIGNALS)
        if candidate_intent == "CODE_QA" and active_intent == "DOCS_QA":
            return any(signal in squashed for signal in self._CODE_SIGNALS)
        return False

    @staticmethod
    def _squash(text: str) -> str:
        # Lowercase and collapse whitespace runs into single spaces.
        return " ".join((text or "").lower().split())

View File

@@ -0,0 +1,28 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.models import EvidencePolicy
class EvidencePolicyFactory:
    """Maps an intent (and CODE_QA sub-intent) to its evidence requirements."""

    def build(
        self,
        intent: str,
        *,
        sub_intent: str = "EXPLAIN",
        negations: list[str] | None = None,
        has_user_anchor: bool = True,
    ) -> EvidencePolicy:
        """Return the evidence policy for *intent*; permissive for unknown intents."""
        if intent == "CODE_QA":
            return self._code_policy(sub_intent, set(negations or []), has_user_anchor)
        if intent == "DOCS_QA":
            return EvidencePolicy(require_def=False, require_flow=False, require_spec=True, allow_answer_without_evidence=False)
        if intent == "GENERATE_DOCS_FROM_CODE":
            return EvidencePolicy(require_def=True, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        # PROJECT_MISC (and anything unknown) may be answered without evidence.
        return EvidencePolicy(require_def=False, require_flow=False, require_spec=False, allow_answer_without_evidence=True)

    def _code_policy(self, sub_intent: str, negations_set: set[str], has_user_anchor: bool) -> EvidencePolicy:
        """CODE_QA policy, tightened or relaxed by sub-intent and negations."""
        if sub_intent == "OPEN_FILE":
            return EvidencePolicy(require_def=False, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        if sub_intent == "EXPLAIN_LOCAL":
            return EvidencePolicy(require_def=True, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        if "tests" in negations_set and not has_user_anchor:
            # "Not about tests" without a concrete anchor: skip the flow requirement.
            return EvidencePolicy(require_def=True, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        return EvidencePolicy(require_def=True, require_flow=True, require_spec=False, allow_answer_without_evidence=False)

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
from app.modules.agent.llm import AgentLlmService
from app.modules.agent.prompt_loader import PromptLoader
from app.modules.rag.intent_router_v2.classifier import IntentClassifierV2
from app.modules.rag.intent_router_v2.router import IntentRouterV2
from app.modules.shared.env_loader import load_workspace_env
from app.modules.shared.gigachat.client import GigaChatClient
from app.modules.shared.gigachat.settings import GigaChatSettings
from app.modules.shared.gigachat.token_provider import GigaChatTokenProvider
class GigaChatIntentRouterFactory:
    """Wires a ready-to-use IntentRouterV2 on top of the GigaChat stack."""

    def build(self) -> IntentRouterV2:
        """Build a router whose classifier is backed by a GigaChat LLM.

        Reads the workspace .env first so GigaChat settings resolve from it.
        """
        load_workspace_env()
        env_settings = GigaChatSettings.from_env()
        tokens = GigaChatTokenProvider(env_settings)
        chat_client = GigaChatClient(env_settings, tokens)
        prompts = PromptLoader()
        llm_service = AgentLlmService(client=chat_client, prompts=prompts)
        return IntentRouterV2(classifier=IntentClassifierV2(llm=llm_service))

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
class FollowUpDetector:
    """Heuristic detector for follow-up questions within a conversation."""

    _MARKERS = (
        "что дальше",
        "почему",
        "зачем",
        "а что",
        "уточни",
        "подробнее",
        "как именно",
        "покажи подробнее",
    )

    def is_follow_up(self, raw: str) -> bool:
        """True for very short queries or queries containing a follow-up marker."""
        squashed = " ".join((raw or "").lower().split())
        if not squashed:
            return False
        # Short messages (<= 4 words) are treated as follow-ups unconditionally.
        word_count = len(squashed.split())
        return word_count <= 4 or any(marker in squashed for marker in self._MARKERS)

View File

@@ -0,0 +1,13 @@
from __future__ import annotations
class GraphIdResolver:
    """Resolves an intent name to the id of the graph that handles it."""

    _GRAPH_MAP = {
        "CODE_QA": "CodeQAGraph",
        "DOCS_QA": "DocsQAGraph",
        "GENERATE_DOCS_FROM_CODE": "GenerateDocsFromCodeGraph",
        "PROJECT_MISC": "ProjectMiscGraph",
    }

    def resolve(self, intent: str) -> str:
        """Return the graph id for *intent*; raises KeyError for unknown intents."""
        graph_id = self._GRAPH_MAP[intent]
        return graph_id

View File

@@ -0,0 +1,34 @@
from __future__ import annotations
import re
from app.modules.rag.intent_router_v2.normalization import FILE_PATH_RE
from app.modules.rag.intent_router_v2.symbol_rules import COMMON_PATH_SEGMENTS, PY_KEYWORDS
_IDENTIFIER_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]{2,}")
class KeywordHintBuilder:
    """Collects identifier and file-path hints from free-form query text."""

    def build(self, text: str) -> list[str]:
        """Return up to 12 unique hints: identifiers first, then lowercased paths."""
        source = text or ""
        skip_segments = self._path_segments(source)
        collected: list[str] = []
        for identifier in _IDENTIFIER_RE.findall(source):
            lowered = identifier.lower()
            if lowered in PY_KEYWORDS or lowered in skip_segments:
                continue
            if identifier not in collected:
                collected.append(identifier)
        for file_match in FILE_PATH_RE.finditer(source):
            lowered_path = file_match.group(0).lower()
            if lowered_path not in collected:
                collected.append(lowered_path)
        return collected[:12]

    def _path_segments(self, text: str) -> set[str]:
        """Segments of every file path in *text*, plus the common segments."""
        segments = set(COMMON_PATH_SEGMENTS)
        for file_match in FILE_PATH_RE.finditer(text or ""):
            segments.update(
                part for part in re.split(r"[/.]+", file_match.group(0).lower()) if part
            )
        return segments

View File

@@ -0,0 +1,50 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.models import QueryAnchor
class KeywordHintSanitizer:
    """Filters keyword hints so directory noise does not leak into retrieval."""

    _GENERIC_KEY_TERMS = {"файл", "класс", "метод", "функция", "документация"}
    _DOMAIN_ALLOWLIST = {"RAG", "API", "HTTP", "SQL"}
    _DIR_SCOPE_MARKERS = ("в папке", "в директории", "в каталоге")

    def sanitize(self, raw: str, anchors: list[QueryAnchor], base_hints: list[str]) -> list[str]:
        """Return up to 8 hints: anchor values first, then filtered base hints.

        Directory values are dropped when concrete files are known, unless
        the user explicitly scoped the question to a directory.
        """
        lowered_query = (raw or "").lower()
        dirs_allowed = any(marker in lowered_query for marker in self._DIR_SCOPE_MARKERS)
        file_values = [a.value for a in anchors if a.type == "FILE_PATH" and self._looks_like_file(a.value)]
        dir_values = {value.rsplit("/", 1)[0] for value in file_values if "/" in value}
        hints: list[str] = []
        for anchor in anchors:
            if anchor.type == "FILE_PATH":
                skip_dir = (
                    self._looks_like_directory(anchor.value)
                    and not dirs_allowed
                    and bool(dir_values or file_values)
                )
                if not skip_dir:
                    self._append(hints, anchor.value)
            if anchor.type == "SYMBOL":
                self._append(hints, anchor.value)
        for hint in base_hints:
            if hint in self._DOMAIN_ALLOWLIST:
                self._append(hints, hint)
                continue
            if hint.lower() in self._GENERIC_KEY_TERMS:
                continue
            if not dirs_allowed and hint in dir_values:
                continue
            # Slash without dot looks like a directory: drop it when files exist.
            if not dirs_allowed and file_values and "/" in hint and "." not in hint:
                continue
            self._append(hints, hint)
        return hints[:8]

    def _append(self, values: list[str], candidate: str) -> None:
        # Order-preserving deduplication.
        if candidate and candidate not in values:
            values.append(candidate)

    def _looks_like_file(self, value: str) -> bool:
        return "." in (value or "").rsplit("/", 1)[-1]

    def _looks_like_directory(self, value: str) -> bool:
        return "/" in (value or "") and not self._looks_like_file(value)

View File

@@ -0,0 +1,29 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.models import LayerQuery, RepoContext
class LayerQueryBuilder:
    """Builds per-layer retrieval queries for an intent, honoring repo layers."""

    def build(self, intent: str, repo_context: RepoContext, *, domains: list[str], layers_map: dict[str, list[tuple[str, int]]]) -> list[LayerQuery]:
        """Return layer queries for *intent*, filtered by domain and availability.

        Falls back to the domain-only selection when the availability filter
        would remove every configured layer.
        """
        configured = layers_map[intent]
        known_layers = set(repo_context.available_layers or [])
        domain_matched = [
            (layer_id, top_k)
            for layer_id, top_k in configured
            if self._layer_matches_domains(layer_id, domains)
        ]
        narrowed = [
            LayerQuery(layer_id=layer_id, top_k=top_k)
            for layer_id, top_k in domain_matched
            if not known_layers or layer_id in known_layers
        ]
        if narrowed:
            return narrowed
        return [LayerQuery(layer_id=layer_id, top_k=top_k) for layer_id, top_k in domain_matched]

    def _layer_matches_domains(self, layer_id: str, domains: list[str]) -> bool:
        # Convention: "C*" layers hold code, "D*" layers hold docs.
        if domains == ["CODE"]:
            return layer_id.startswith("C")
        if domains == ["DOCS"]:
            return layer_id.startswith("D")
        return layer_id.startswith(("C", "D"))

View File

@@ -0,0 +1,25 @@
from __future__ import annotations
import logging
from app.modules.rag.intent_router_v2.models import ConversationState, IntentRouterResult, RepoContext
from app.modules.rag.intent_router_v2.router import IntentRouterV2
LOGGER = logging.getLogger(__name__)
class IntentRouterScenarioRunner:
    """Runs a scripted multi-turn scenario against the router, logging each turn."""

    def __init__(self, router: IntentRouterV2) -> None:
        self._router = router

    def run(self, queries: list[str], repo_context: RepoContext | None = None) -> list[IntentRouterResult]:
        """Route each query in order, threading conversation state between turns."""
        conversation = ConversationState()
        repo = repo_context or RepoContext()
        outputs: list[IntentRouterResult] = []
        for turn, query in enumerate(queries, start=1):
            LOGGER.warning("intent router local input: turn=%s user_query=%s", turn, query)
            routed = self._router.route(query, conversation, repo)
            LOGGER.warning("intent router local output: turn=%s result=%s", turn, routed.model_dump_json(ensure_ascii=False))
            outputs.append(routed)
            # Each result advances the state consumed by the next turn.
            conversation = conversation.advance(routed)
        return outputs

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
import logging
from app.modules.rag.intent_router_v2.models import ConversationState, IntentRouterResult, RepoContext
LOGGER = logging.getLogger(__name__)
class IntentRouterLogger:
    """Structured logging of router inputs and outputs.

    NOTE(review): messages are emitted at WARNING level, presumably so they
    survive default log-level filtering — confirm this is intentional.
    """

    def log_request(self, user_query: str, conversation_state: ConversationState, repo_context: RepoContext) -> None:
        # turn_index is 0-based in state; +1 logs a human-friendly 1-based turn.
        LOGGER.warning(
            "intent router v2 request: turn=%s active_intent=%s user_query=%s languages=%s domains=%s",
            conversation_state.turn_index + 1,
            conversation_state.active_intent,
            " ".join((user_query or "").split()),
            repo_context.languages,
            repo_context.available_domains,
        )

    def log_result(self, result: IntentRouterResult) -> None:
        """Log the full router result as one JSON line."""
        LOGGER.warning("intent router v2 result: %s", result.model_dump_json(ensure_ascii=False))

View File

@@ -0,0 +1,182 @@
from __future__ import annotations
import re
from typing import Literal
from pydantic import BaseModel, ConfigDict, Field, field_validator
# Closed vocabularies shared across the router result schema.
IntentType = Literal["CODE_QA", "DOCS_QA", "GENERATE_DOCS_FROM_CODE", "PROJECT_MISC"]
ConversationMode = Literal["START", "CONTINUE", "SWITCH"]
AnchorType = Literal["FILE_PATH", "SYMBOL", "DOC_REF", "KEY_TERM"]
AnchorSource = Literal["user_text", "conversation_state", "heuristic"]
# Backticked `inline code` spans; group 1 is the span content.
_INLINE_CODE_RE = re.compile(r"`([^`]*)`")
# Identifier-looking tokens of length >= 3 inside an inline code span.
_CODE_SYMBOL_RE = re.compile(r"\b([A-Za-z_][A-Za-z0-9_]{2,})\b")
class AnchorSpan(BaseModel):
    """Character range [start, end) inside the raw query text."""
    model_config = ConfigDict(extra="forbid")
    start: int = 0
    end: int = 0


class QueryAnchor(BaseModel):
    """A concrete thing the user referenced: file, symbol, doc ref or key term."""
    model_config = ConfigDict(extra="forbid")
    type: AnchorType
    value: str
    source: AnchorSource = "user_text"
    subtype: str | None = None  # e.g. "section" / "file" for DOC_REF anchors
    span: AnchorSpan | None = None  # only set for anchors found in user text
    confidence: float = 0.0

    @field_validator("confidence")
    @classmethod
    def clamp_confidence(cls, value: float) -> float:
        """Clamp confidence into [0.0, 1.0]."""
        return max(0.0, min(1.0, float(value)))
class QueryPlan(BaseModel):
    """The analyzed form of one user query: raw + normalized text and hints."""
    model_config = ConfigDict(extra="forbid")
    raw: str
    normalized: str
    sub_intent: str = "EXPLAIN"
    negations: list[str] = Field(default_factory=list)  # e.g. ["tests"]
    expansions: list[str] = Field(default_factory=list)
    keyword_hints: list[str] = Field(default_factory=list)
    anchors: list[QueryAnchor] = Field(default_factory=list)


class LayerQuery(BaseModel):
    """One retrieval request against a single index layer."""
    model_config = ConfigDict(extra="forbid")
    layer_id: str
    top_k: int


class CodeRetrievalFilters(BaseModel):
    """Filters applied when retrieving over code layers only."""
    model_config = ConfigDict(extra="forbid")
    test_policy: str = "EXCLUDE"
    path_scope: list[str] = Field(default_factory=list)
    language: list[str] = Field(default_factory=list)


class DocsRetrievalFilters(BaseModel):
    """Filters applied when retrieving over documentation layers only."""
    model_config = ConfigDict(extra="forbid")
    path_scope: list[str] = Field(default_factory=list)
    doc_kinds: list[str] = Field(default_factory=list)
    doc_language: list[str] = Field(default_factory=list)


class HybridRetrievalFilters(BaseModel):
    """Union of code and docs filters for mixed-domain retrieval."""
    model_config = ConfigDict(extra="forbid")
    test_policy: str = "EXCLUDE"
    path_scope: list[str] = Field(default_factory=list)
    language: list[str] = Field(default_factory=list)
    doc_kinds: list[str] = Field(default_factory=list)
    doc_language: list[str] = Field(default_factory=list)


class RetrievalSpec(BaseModel):
    """Complete retrieval instruction: domains, per-layer queries, filters."""
    model_config = ConfigDict(extra="forbid")
    domains: list[str] = Field(default_factory=list)
    layer_queries: list[LayerQuery] = Field(default_factory=list)
    # Defaults to code filters; the builder swaps in docs/hybrid as needed.
    filters: CodeRetrievalFilters | DocsRetrievalFilters | HybridRetrievalFilters = Field(default_factory=CodeRetrievalFilters)
    rerank_profile: str = ""


class EvidencePolicy(BaseModel):
    """What kinds of evidence an answer must cite before it may be given."""
    model_config = ConfigDict(extra="forbid")
    require_def: bool = False
    require_flow: bool = False
    require_spec: bool = False
    allow_answer_without_evidence: bool = False


class IntentRouterResult(BaseModel):
    """Full routing decision for one turn, consumed by downstream graphs."""
    model_config = ConfigDict(extra="forbid")
    schema_version: str = "1.1"
    intent: IntentType
    graph_id: str
    conversation_mode: ConversationMode
    query_plan: QueryPlan
    retrieval_spec: RetrievalSpec
    evidence_policy: EvidencePolicy
class ConversationState(BaseModel):
    """Rolling dialogue state threaded between router turns (immutable)."""
    model_config = ConfigDict(extra="forbid")
    active_intent: IntentType | None = None
    active_domain: str | None = None
    active_anchors: list[QueryAnchor] = Field(default_factory=list)
    active_symbol: str | None = None
    active_path_scope: list[str] = Field(default_factory=list)
    active_code_span_symbols: list[str] = Field(default_factory=list)
    last_query: str = ""
    turn_index: int = 0

    def advance(self, result: IntentRouterResult) -> "ConversationState":
        """Return the next-turn state derived from *result*.

        Only user-provided anchors are carried over. The active symbol is
        replaced by the newest user symbol, cleared when the user switched
        focus to a file without naming a symbol, and otherwise kept.
        """
        user_anchors = [anchor for anchor in result.query_plan.anchors if anchor.source == "user_text"]
        symbol_candidates = [anchor.value for anchor in user_anchors if anchor.type == "SYMBOL"]
        has_user_file_anchor = any(anchor.type == "FILE_PATH" for anchor in user_anchors)
        if symbol_candidates:
            # The last symbol the user mentioned wins.
            active_symbol = symbol_candidates[-1]
        elif has_user_file_anchor:
            # A file reference without a symbol drops the stale symbol focus.
            active_symbol = None
        else:
            active_symbol = self.active_symbol
        raw_code_symbols = _extract_code_symbols(result.query_plan.raw)
        active_code_span_symbols = raw_code_symbols or list(self.active_code_span_symbols)
        # Defensive getattr: a filters model might not declare path_scope.
        path_scope = list(getattr(result.retrieval_spec.filters, "path_scope", []) or [])
        active_domains = list(result.retrieval_spec.domains or [])
        # Domain focus updates only when the result is unambiguous (one domain).
        active_domain = active_domains[0] if len(active_domains) == 1 else self.active_domain
        return ConversationState(
            active_intent=result.intent,
            active_domain=active_domain,
            active_anchors=list(user_anchors),
            active_symbol=active_symbol,
            active_path_scope=path_scope or list(self.active_path_scope),
            active_code_span_symbols=active_code_span_symbols,
            last_query=result.query_plan.raw,
            turn_index=self.turn_index + 1,
        )
class RepoContext(BaseModel):
    """Static facts about the indexed repository (languages, domains, layers)."""
    model_config = ConfigDict(extra="forbid")
    languages: list[str] = Field(default_factory=list)
    available_domains: list[str] = Field(default_factory=lambda: ["CODE", "DOCS"])
    available_layers: list[str] = Field(default_factory=list)


class IntentDecision(BaseModel):
    """A classifier verdict: the chosen intent with confidence and rationale."""
    model_config = ConfigDict(extra="forbid")
    intent: IntentType
    confidence: float = 0.0
    reason: str = ""

    @field_validator("confidence")
    @classmethod
    def clamp_confidence(cls, value: float) -> float:
        """Clamp confidence into [0.0, 1.0]."""
        return max(0.0, min(1.0, float(value)))
def _extract_code_symbols(raw: str) -> list[str]:
symbols: list[str] = []
for match in _INLINE_CODE_RE.finditer(raw or ""):
snippet = match.group(1)
for token in _CODE_SYMBOL_RE.findall(snippet):
if token not in symbols:
symbols.append(token)
return symbols[:8]

View File

@@ -0,0 +1,17 @@
from __future__ import annotations
import re
# Russian/English phrasings of "not about tests" / "production code only".
_TEST_NEG_RE = re.compile(
    r"(?:не\s+про\s+тест|без\s+тест|кроме\s+тест|про\s+прод\s+код|только\s+прод|production\s+code)",
    re.IGNORECASE,
)


class NegationDetector:
    """Detects negated topics ("not about tests") in a user query."""

    def detect(self, text: str) -> set[str]:
        """Return negated topic tags found in *text*; currently only "tests"."""
        found: set[str] = set()
        if _TEST_NEG_RE.search((text or "").lower()):
            found.add("tests")
        return found

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
import re
# Spans that must survive normalization byte-for-byte (code, paths, identifiers).
CODE_SPAN_RE = re.compile(r"`[^`]*`")
FILE_PATH_RE = re.compile(
    r"(?<!\w)(?:/|\./|\.\./)?(?:[\w.-]+/)*[\w.-]+\.(?:py|md|txt|rst|yaml|yml|json|toml|ini)(?!\w)",
    re.IGNORECASE,
)
DOTTED_IDENT_RE = re.compile(r"(?<!\w)(?:[A-Za-z_][A-Za-z0-9_]*\.)+[A-Za-z_][A-Za-z0-9_]*(?!\w)")
CAMEL_RE = re.compile(r"(?<!\w)[A-Z][a-z0-9]+(?:[A-Z][a-z0-9]+)+(?!\w)")
SNAKE_RE = re.compile(r"(?<!\w)[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?!\w)")
# Punctuation spacing fixes: "word ," -> "word," and "a,b" -> "a, b".
SPACE_BEFORE_PUNCT_RE = re.compile(r"\s+([,.:;?!])")
SPACE_AFTER_PUNCT_RE = re.compile(r"([,.:;?!])(?=(?:[\"'(\[A-Za-zА-ЯЁа-яё]))")
WS_RE = re.compile(r"\s+")
# Map typographic quotes to plain ASCII. The keys must be single characters:
# the previous dict contained empty-string keys (the curly-quote characters
# had been lost in the source), which makes str.maketrans raise ValueError at
# import time. Unicode escapes are used so the characters cannot be mangled.
QUOTE_TRANSLATION = str.maketrans(
    {
        "\u00ab": '"',  # «
        "\u00bb": '"',  # »
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
    }
)
class QueryNormalizer:
    """Builds a deterministic canonical form from raw text without enrichment."""

    def normalize(self, raw: str) -> str:
        """Normalize whitespace, quotes and punctuation spacing.

        Code spans, file paths and identifiers are swapped out for
        placeholders first so the passes cannot alter them, then restored.
        """
        vault = _ProtectedText()
        text = self._protect(raw or "", vault)
        text = self._collapse_whitespace(text)
        text = text.translate(QUOTE_TRANSLATION)
        text = SPACE_BEFORE_PUNCT_RE.sub(r"\1", text)
        text = SPACE_AFTER_PUNCT_RE.sub(r"\1 ", text)
        return vault.restore(self._collapse_whitespace(text))

    def _protect(self, text: str, vault: "_ProtectedText") -> str:
        # Replace protected spans with placeholders; order matters (code first).
        for pattern in (CODE_SPAN_RE, FILE_PATH_RE, DOTTED_IDENT_RE, CAMEL_RE, SNAKE_RE):
            text = pattern.sub(vault.replace, text)
        return text

    def _collapse_whitespace(self, text: str) -> str:
        return WS_RE.sub(" ", text).strip()
class _ProtectedText:
def __init__(self) -> None:
self._items: dict[str, str] = {}
self._index = 0
def replace(self, match: re.Match[str]) -> str:
placeholder = f"@@P{self._index}@@"
self._items[placeholder] = match.group(0)
self._index += 1
return placeholder
def restore(self, text: str) -> str:
restored = text
for placeholder, value in self._items.items():
restored = restored.replace(placeholder, value)
return restored

View File

@@ -0,0 +1,48 @@
from __future__ import annotations
class KeyTermCanonicalizer:
    """Maps inflected Russian key terms to a single canonical form."""

    _ALIASES: dict[str, set[str]] = {
        "файл": {
            "файл",
            "файла",
            "файле",
            "файлу",
            "файлом",
            "файлы",
            "файлов",
            "файлам",
            "файлами",
        },
        "класс": {"класс", "класса", "классе", "классу", "классом", "классы", "классов", "классам"},
        "функция": {"функция", "функции", "функцию", "функцией", "функциях"},
        "метод": {"метод", "метода", "методе", "методу", "методом", "методы"},
        "документация": {"документация", "документации", "документацию"},
        "тест": {"тест", "тесты", "тестов", "тестам", "тестами", "юнит-тест", "юниттест"},
        "модуль": {"модуль", "модуля"},
        "пакет": {"пакет"},
    }

    def __init__(self) -> None:
        self._token_to_canonical = self._build_index()

    def canonicalize(self, token: str) -> str | None:
        """Return the canonical form for *token*, or None when unknown."""
        return self._token_to_canonical.get((token or "").lower())

    def aliases(self) -> set[str]:
        """All known inflected forms across every canonical term."""
        collected: set[str] = set()
        for alias_forms in self._ALIASES.values():
            collected |= alias_forms
        return collected

    def is_test_term(self, token: str) -> bool:
        """True when *token* canonicalizes to the test term."""
        return self.canonicalize(token) == "тест"

    def _build_index(self) -> dict[str, str]:
        # Map each canonical term and each of its forms back to the canonical.
        index: dict[str, str] = {}
        for canonical, alias_forms in self._ALIASES.items():
            index[canonical] = canonical
            for alias in alias_forms:
                index[alias] = canonical
        return index

View File

@@ -0,0 +1,7 @@
from __future__ import annotations
from typing import Protocol
class TextGenerator(Protocol):
    """Structural interface for anything that can run a named prompt over input."""

    def generate(self, prompt_name: str, user_input: str, *, log_context: str | None = None) -> str: ...

View File

@@ -0,0 +1,3 @@
from app.modules.rag.intent_router_v2.normalization import QueryNormalizer
# Only the normalizer is public; regex constants stay module-internal.
__all__ = ["QueryNormalizer"]

View File

@@ -0,0 +1,223 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.anchor_extractor import AnchorExtractor
from app.modules.rag.intent_router_v2.anchor_span_validator import AnchorSpanValidator
from app.modules.rag.intent_router_v2.conversation_anchor_builder import ConversationAnchorBuilder
from app.modules.rag.intent_router_v2.keyword_hint_builder import KeywordHintBuilder
from app.modules.rag.intent_router_v2.keyword_hint_sanitizer import KeywordHintSanitizer
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor, QueryPlan
from app.modules.rag.intent_router_v2.negation_detector import NegationDetector
from app.modules.rag.intent_router_v2.normalization import QueryNormalizer
from app.modules.rag.intent_router_v2.sub_intent_detector import SubIntentDetector
from app.modules.rag.intent_router_v2.test_signals import has_test_focus, is_negative_test_request, is_test_related_token
from app.modules.rag.intent_router_v2.term_mapping import RuEnTermMapper
class QueryPlanBuilder:
    """Builds a QueryPlan (sub-intent, negations, expansions, keyword hints, anchors).

    Pure orchestration over injected collaborators: normalization, anchor
    extraction and carry-over, negation detection, sub-intent resolution.
    """

    # Surface markers for "why"-style questions and "what next" follow-ups.
    _WHY_MARKERS = ("почему", "зачем", "откуда", "из-за чего")
    _NEXT_STEP_MARKERS = ("что дальше", "дальше что", "и что теперь", "продолжай")

    def __init__(
        self,
        normalizer: QueryNormalizer | None = None,
        extractor: AnchorExtractor | None = None,
        mapper: RuEnTermMapper | None = None,
        keyword_hints: KeywordHintBuilder | None = None,
        keyword_hint_sanitizer: KeywordHintSanitizer | None = None,
        carryover: ConversationAnchorBuilder | None = None,
        span_validator: AnchorSpanValidator | None = None,
        sub_intent_detector: SubIntentDetector | None = None,
        negation_detector: NegationDetector | None = None,
    ) -> None:
        # Every collaborator is injectable for tests; defaults are production ones.
        self._normalizer = normalizer or QueryNormalizer()
        self._extractor = extractor or AnchorExtractor()
        self._mapper = mapper or RuEnTermMapper()
        self._keyword_hints_builder = keyword_hints or KeywordHintBuilder()
        self._keyword_hint_sanitizer = keyword_hint_sanitizer or KeywordHintSanitizer()
        self._carryover = carryover or ConversationAnchorBuilder()
        self._span_validator = span_validator or AnchorSpanValidator()
        self._sub_intent_detector = sub_intent_detector or SubIntentDetector()
        self._negation_detector = negation_detector or NegationDetector()

    def build(
        self,
        user_query: str,
        conversation_state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str = "START",
        intent: str = "PROJECT_MISC",
    ) -> QueryPlan:
        """Produce the full QueryPlan for *user_query* under the given conversation context."""
        raw = user_query or ""
        normalized = self._normalizer.normalize(raw)
        if not normalized and raw.strip():
            # Fall back to the raw text when normalization strips everything.
            normalized = raw
        negations = self._negation_detector.detect(normalized)
        # Anchors extracted from the user's own text, with spans clipped to the raw length.
        user_anchors = self._span_validator.sanitize(self._extractor.extract(raw), len(raw))
        has_file_path = any(anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in user_anchors)
        sub_intent = self._sub_intent_detector.detect(raw, has_file_path=has_file_path, negations=negations)
        merged_anchors = self._merge_anchors(
            raw,
            user_anchors,
            conversation_state,
            continue_mode,
            conversation_mode=conversation_mode,
            intent=intent,
        )
        # "Skip tests" when the user negated tests either structurally or phrasally.
        skip_tests = "tests" in negations or is_negative_test_request(raw)
        cleaned_anchors = self._remove_negated_test_terms(skip_tests, merged_anchors)
        sub_intent = self._resolve_sub_intent(sub_intent, raw, cleaned_anchors, intent=intent, negations=negations)
        if intent == "DOCS_QA":
            # Docs questions are always treated as plain explanations.
            sub_intent = "EXPLAIN"
        expansions = self._expansions(normalized, cleaned_anchors, skip_tests=skip_tests)
        keyword_hints = self._keyword_hints(
            raw,
            normalized,
            cleaned_anchors,
            skip_tests=skip_tests,
            intent=intent,
            state=conversation_state,
        )
        return QueryPlan(
            raw=raw,
            normalized=normalized,
            sub_intent=sub_intent,
            negations=sorted(negations),
            expansions=expansions,
            keyword_hints=keyword_hints,
            anchors=cleaned_anchors,
        )

    def _merge_anchors(
        self,
        raw: str,
        anchors: list[QueryAnchor],
        state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str,
        intent: str,
    ) -> list[QueryAnchor]:
        """Merge user-text anchors with anchors carried over from conversation state."""
        has_user_symbol = any(anchor.type == "SYMBOL" and anchor.source == "user_text" for anchor in anchors)
        has_user_file = any(anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in anchors)
        inherited = self._carryover.build(
            raw,
            state,
            continue_mode=continue_mode,
            has_user_symbol=has_user_symbol,
            has_user_file_path=has_user_file,
        )
        # On a SWITCH into DOCS_QA with no fresh anchors, keep the active symbol
        # from state as a low-confidence anchor so context is not lost.
        if (
            conversation_mode == "SWITCH"
            and intent == "DOCS_QA"
            and not has_user_file
            and not has_user_symbol
            and state.active_symbol
        ):
            inherited.append(
                QueryAnchor(
                    type="SYMBOL",
                    value=state.active_symbol,
                    source="conversation_state",
                    span=None,
                    confidence=0.62,
                )
            )
        return self._dedupe(anchors + inherited)

    def _expansions(self, normalized: str, anchors: list[QueryAnchor], *, skip_tests: bool) -> list[str]:
        """Build RU->EN expansion terms plus anchored symbols; capped at 16 entries."""
        values = self._mapper.expand(normalized)
        has_symbol = any(anchor.type == "SYMBOL" for anchor in anchors)
        if has_symbol:
            # With a concrete symbol anchored, bare keywords only add noise.
            values = [value for value in values if value.lower() not in {"def", "class"}]
        if not skip_tests and has_test_focus(normalized):
            for candidate in ("test", "unit test"):
                if candidate not in values:
                    values.append(candidate)
        for anchor in anchors:
            if anchor.type == "SYMBOL" and anchor.value not in values:
                values.append(anchor.value)
        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        return values[:16]

    def _keyword_hints(
        self,
        raw: str,
        normalized: str,
        anchors: list[QueryAnchor],
        *,
        skip_tests: bool,
        intent: str,
        state: ConversationState,
    ) -> list[str]:
        """Assemble up to five sanitized keyword hints for retrieval."""
        values = self._keyword_hints_builder.build(normalized)
        for anchor in anchors:
            if anchor.type not in {"FILE_PATH", "SYMBOL"}:
                continue
            candidate = anchor.value
            if candidate not in values:
                values.append(candidate)
        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        sanitized = self._keyword_hint_sanitizer.sanitize(raw, anchors, values)
        if intent == "DOCS_QA" and not sanitized:
            # Docs questions with nothing left fall back to the first few expansions.
            fallback = list(dict.fromkeys([*self._expansions(normalized, anchors, skip_tests=skip_tests)]))
            sanitized = fallback[:3]
        if state.active_symbol and state.active_symbol not in sanitized:
            sanitized.append(state.active_symbol)
        sanitized = sanitized[:5]
        return sanitized

    def _remove_negated_test_terms(self, skip_tests: bool, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Drop test-related KEY_TERM/SYMBOL anchors when tests were negated."""
        if not skip_tests:
            return anchors
        result: list[QueryAnchor] = []
        for anchor in anchors:
            if anchor.type not in {"KEY_TERM", "SYMBOL"}:
                result.append(anchor)
                continue
            if is_test_related_token(anchor.value):
                continue
            result.append(anchor)
        return result

    def _dedupe(self, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Remove duplicates by (type, value, subtype, source), keeping first occurrence order."""
        result: list[QueryAnchor] = []
        seen: set[tuple[str, str, str | None, str]] = set()
        for anchor in anchors:
            key = (anchor.type, anchor.value, anchor.subtype, anchor.source)
            if key in seen:
                continue
            seen.add(key)
            result.append(anchor)
        return result

    def _resolve_sub_intent(
        self,
        candidate: str,
        raw: str,
        anchors: list[QueryAnchor],
        *,
        intent: str,
        negations: set[str],
    ) -> str:
        """Upgrade a generic EXPLAIN to EXPLAIN_LOCAL for clearly localized CODE_QA queries."""
        if candidate != "EXPLAIN":
            return candidate
        if intent != "CODE_QA":
            return candidate
        text = " ".join((raw or "").lower().split())
        # Only reasonably confident anchors count towards localization.
        has_symbol = any(anchor.type == "SYMBOL" and anchor.confidence >= 0.6 for anchor in anchors)
        has_file = any(anchor.type == "FILE_PATH" and self._looks_like_file(anchor.value) and anchor.confidence >= 0.6 for anchor in anchors)
        has_user_anchor = any(anchor.source == "user_text" for anchor in anchors)
        is_why = any(marker in text for marker in self._WHY_MARKERS)
        is_next_steps = any(marker in text for marker in self._NEXT_STEP_MARKERS)
        is_short_generic = len(text.split()) <= 4 and text.endswith("?")
        if (is_why and has_file and has_symbol) or ((is_next_steps or is_short_generic) and has_file):
            return "EXPLAIN_LOCAL"
        # Negated tests with only inherited anchors also imply a local explanation.
        if "tests" in negations and not has_user_anchor and (has_file or has_symbol):
            return "EXPLAIN_LOCAL"
        return candidate

    def _looks_like_file(self, value: str) -> bool:
        # A path "looks like a file" when its last segment contains an extension dot.
        tail = (value or "").rsplit("/", 1)[-1]
        return "." in tail

View File

@@ -0,0 +1,111 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.models import (
CodeRetrievalFilters,
ConversationState,
DocsRetrievalFilters,
HybridRetrievalFilters,
QueryAnchor,
RepoContext,
)
from app.modules.rag.intent_router_v2.test_signals import has_test_focus, is_negative_test_request, is_test_related_token
class RetrievalFilterBuilder:
    """Derives code/docs/hybrid retrieval filters from anchors and conversation context."""

    def build(
        self,
        domains: list[str],
        anchors: list[QueryAnchor],
        repo_context: RepoContext,
        *,
        raw_query: str,
        conversation_state: ConversationState | None,
        conversation_mode: str,
        sub_intent: str = "EXPLAIN",
    ) -> CodeRetrievalFilters | DocsRetrievalFilters | HybridRetrievalFilters:
        """Pick the filter variant matching *domains* and populate it."""
        scope = self._path_scope(
            anchors,
            conversation_state=conversation_state,
            conversation_mode=conversation_mode,
            raw_query=raw_query,
            sub_intent=sub_intent,
        )
        if domains == ["DOCS"]:
            return DocsRetrievalFilters(
                path_scope=scope,
                doc_kinds=self._doc_kinds(anchors, raw_query),
                doc_language=[],
            )
        if domains == ["CODE"]:
            return CodeRetrievalFilters(
                test_policy=self._test_policy(raw_query, anchors),
                path_scope=scope,
                language=list(repo_context.languages),
            )
        # Mixed domains get the hybrid filter with both code and docs fields.
        return HybridRetrievalFilters(
            test_policy=self._test_policy(raw_query, anchors),
            path_scope=scope,
            language=list(repo_context.languages),
            doc_kinds=self._doc_kinds(anchors, raw_query),
            doc_language=[],
        )

    def _test_policy(self, raw_query: str, anchors: list[QueryAnchor]) -> str:
        """EXCLUDE tests unless the query (or a key-term anchor) asks for them."""
        # An explicit "no tests" phrasing wins over everything else.
        if is_negative_test_request(raw_query):
            return "EXCLUDE"
        if has_test_focus(raw_query):
            return "INCLUDE"
        for anchor in anchors:
            if anchor.type == "KEY_TERM" and is_test_related_token(anchor.value):
                return "INCLUDE"
        return "EXCLUDE"

    def _path_scope(
        self,
        anchors: list[QueryAnchor],
        *,
        conversation_state: ConversationState | None,
        conversation_mode: str,
        raw_query: str,
        sub_intent: str,
    ) -> list[str]:
        """Collect up to six path-scope entries from file anchors, with state carry-over."""
        scope: list[str] = []
        file_only: list[str] = []
        from_user = False
        for anchor in anchors:
            if anchor.type != "FILE_PATH":
                continue
            from_user = from_user or anchor.source == "user_text"
            value = anchor.value
            if value not in scope:
                scope.append(value)
            is_file = self._looks_like_file_path(value)
            if is_file and value not in file_only:
                file_only.append(value)
            if is_file and "/" in value:
                # Also admit the containing directory for broader matches.
                parent = value.rsplit("/", 1)[0]
                if parent and parent not in scope:
                    scope.append(parent)
        localized = sub_intent in {"OPEN_FILE", "EXPLAIN_LOCAL"}
        if localized and file_only and not self._is_explicit_directory_scope(raw_query):
            # Localized sub-intents narrow to concrete files only.
            return file_only[:6]
        if from_user or conversation_mode != "CONTINUE" or scope:
            return scope[:6]
        # Pure continuation with nothing new: inherit the previous scope.
        carried = conversation_state.active_path_scope if conversation_state else []
        return list(carried or [])[:6]

    def _doc_kinds(self, anchors: list[QueryAnchor], raw_query: str) -> list[str]:
        """Return ["README"] when the query or an anchor clearly targets a README."""
        lowered = (raw_query or "").lower()
        mentions_readme = "readme" in lowered
        if not mentions_readme:
            mentions_readme = any(
                anchor.type in {"DOC_REF", "FILE_PATH"} and anchor.value.lower().endswith("readme.md")
                for anchor in anchors
            )
        return ["README"] if mentions_readme else []

    def _looks_like_file_path(self, value: str) -> bool:
        # File-like when the last path segment contains an extension dot.
        return "." in value.rsplit("/", 1)[-1]

    def _is_explicit_directory_scope(self, raw_query: str) -> bool:
        """True when the user explicitly asked about a folder/directory."""
        lowered = (raw_query or "").lower()
        for marker in ("в папке", "в директории", "в каталоге"):
            if marker in lowered:
                return True
        return False

View File

@@ -0,0 +1,118 @@
from __future__ import annotations
from app.modules.rag.contracts.enums import RagLayer
from app.modules.rag.intent_router_v2.layer_query_builder import LayerQueryBuilder
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor, RepoContext, RetrievalSpec
from app.modules.rag.intent_router_v2.retrieval_filter_builder import RetrievalFilterBuilder
class RetrievalSpecFactory:
    """Assembles a RetrievalSpec (domains, layer queries, filters, rerank profile) per intent.

    Layer lists and domain routing are static configuration tables; the only
    dynamic piece is the CODE_QA layer override for localized sub-intents.
    """

    # Ordered (layer, limit) pairs per intent. NOTE(review): the int is
    # presumably a per-layer result budget consumed by LayerQueryBuilder —
    # confirm against that builder.
    _LAYERS = {
        "CODE_QA": [
            (RagLayer.CODE_ENTRYPOINTS, 6),
            (RagLayer.CODE_SYMBOL_CATALOG, 8),
            (RagLayer.CODE_DEPENDENCY_GRAPH, 6),
            (RagLayer.CODE_SOURCE_CHUNKS, 8),
        ],
        "DOCS_QA": [
            (RagLayer.DOCS_MODULE_CATALOG, 5),
            (RagLayer.DOCS_FACT_INDEX, 8),
            (RagLayer.DOCS_SECTION_INDEX, 8),
            (RagLayer.DOCS_POLICY_INDEX, 4),
        ],
        "GENERATE_DOCS_FROM_CODE": [
            (RagLayer.CODE_SYMBOL_CATALOG, 12),
            (RagLayer.CODE_DEPENDENCY_GRAPH, 8),
            (RagLayer.CODE_SOURCE_CHUNKS, 12),
            (RagLayer.CODE_ENTRYPOINTS, 6),
        ],
        "PROJECT_MISC": [
            (RagLayer.DOCS_MODULE_CATALOG, 4),
            (RagLayer.DOCS_SECTION_INDEX, 6),
            (RagLayer.CODE_SYMBOL_CATALOG, 4),
            (RagLayer.CODE_SOURCE_CHUNKS, 4),
        ],
    }
    # Retrieval domains each intent touches.
    _DOMAINS = {
        "CODE_QA": ["CODE"],
        "DOCS_QA": ["DOCS"],
        "GENERATE_DOCS_FROM_CODE": ["CODE"],
        "PROJECT_MISC": ["CODE", "DOCS"],
    }
    # Opaque rerank profile labels carried through on the RetrievalSpec.
    _RERANK = {
        "CODE_QA": "code",
        "DOCS_QA": "docs",
        "GENERATE_DOCS_FROM_CODE": "generate",
        "PROJECT_MISC": "project",
    }
    # CODE_QA layer overrides for localized sub-intents.
    _OPEN_FILE_LAYERS = [
        (RagLayer.CODE_SOURCE_CHUNKS, 12),
    ]
    _OPEN_FILE_WITH_SYMBOL_LAYERS = [
        (RagLayer.CODE_SOURCE_CHUNKS, 12),
        (RagLayer.CODE_SYMBOL_CATALOG, 6),
    ]
    _EXPLAIN_LOCAL_LAYERS = [
        (RagLayer.CODE_SOURCE_CHUNKS, 12),
        (RagLayer.CODE_SYMBOL_CATALOG, 8),
        (RagLayer.CODE_DEPENDENCY_GRAPH, 4),
    ]

    def __init__(
        self,
        layer_builder: LayerQueryBuilder | None = None,
        filter_builder: RetrievalFilterBuilder | None = None,
    ) -> None:
        # Both collaborators are injectable for tests.
        self._layer_builder = layer_builder or LayerQueryBuilder()
        self._filter_builder = filter_builder or RetrievalFilterBuilder()

    def build(
        self,
        intent: str,
        anchors: list[QueryAnchor],
        repo_context: RepoContext,
        *,
        raw_query: str = "",
        conversation_state: ConversationState | None = None,
        conversation_mode: str = "START",
        sub_intent: str = "EXPLAIN",
    ) -> RetrievalSpec:
        """Build the retrieval spec for *intent*, honoring sub-intent layer overrides."""
        domains = self._domains(intent, repo_context)
        layers_map = self._with_sub_intent_layers(intent, sub_intent, anchors)
        layer_queries = self._layer_builder.build(intent, repo_context, domains=domains, layers_map=layers_map)
        filters = self._filter_builder.build(
            domains,
            anchors,
            repo_context,
            raw_query=raw_query,
            conversation_state=conversation_state,
            conversation_mode=conversation_mode,
            sub_intent=sub_intent,
        )
        return RetrievalSpec(
            domains=domains,
            layer_queries=layer_queries,
            filters=filters,
            rerank_profile=self._RERANK[intent],
        )

    def _domains(self, intent: str, repo_context: RepoContext) -> list[str]:
        """Intersect the intent's default domains with what the repo actually offers."""
        available = set(repo_context.available_domains or ["CODE", "DOCS"])
        result = [domain for domain in self._DOMAINS[intent] if domain in available]
        # Fall back to the intent defaults rather than returning an empty list.
        return result or list(self._DOMAINS[intent])

    def _with_sub_intent_layers(
        self,
        intent: str,
        sub_intent: str,
        anchors: list[QueryAnchor],
    ) -> dict[str, list[tuple[str, int]]]:
        """Return the layer table, overriding CODE_QA layers for localized sub-intents."""
        if intent != "CODE_QA":
            return self._LAYERS
        layers_map = dict(self._LAYERS)
        if sub_intent == "OPEN_FILE":
            # A user-typed symbol justifies also searching the symbol catalog.
            has_symbol = any(anchor.type == "SYMBOL" and anchor.source == "user_text" for anchor in anchors)
            layers_map["CODE_QA"] = list(self._OPEN_FILE_WITH_SYMBOL_LAYERS if has_symbol else self._OPEN_FILE_LAYERS)
        elif sub_intent == "EXPLAIN_LOCAL":
            layers_map["CODE_QA"] = list(self._EXPLAIN_LOCAL_LAYERS)
        return layers_map

View File

@@ -0,0 +1,72 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.classifier import IntentClassifierV2
from app.modules.rag.intent_router_v2.conversation_policy import ConversationPolicy
from app.modules.rag.intent_router_v2.evidence_policy_factory import EvidencePolicyFactory
from app.modules.rag.intent_router_v2.graph_id_resolver import GraphIdResolver
from app.modules.rag.intent_router_v2.logger import IntentRouterLogger
from app.modules.rag.intent_router_v2.models import ConversationState, IntentRouterResult, RepoContext
from app.modules.rag.intent_router_v2.query_plan_builder import QueryPlanBuilder
from app.modules.rag.intent_router_v2.retrieval_spec_factory import RetrievalSpecFactory
class IntentRouterV2:
    """Top-level facade: classify a query and assemble the complete routing result."""

    def __init__(
        self,
        classifier: IntentClassifierV2 | None = None,
        conversation_policy: ConversationPolicy | None = None,
        query_plan_builder: QueryPlanBuilder | None = None,
        retrieval_factory: RetrievalSpecFactory | None = None,
        evidence_factory: EvidencePolicyFactory | None = None,
        graph_resolver: GraphIdResolver | None = None,
        logger: IntentRouterLogger | None = None,
    ) -> None:
        # All collaborators are injectable; defaults are the production implementations.
        self._classifier = classifier or IntentClassifierV2()
        self._conversation_policy = conversation_policy or ConversationPolicy()
        self._query_plan_builder = query_plan_builder or QueryPlanBuilder()
        self._retrieval_factory = retrieval_factory or RetrievalSpecFactory()
        self._evidence_factory = evidence_factory or EvidencePolicyFactory()
        self._graph_resolver = graph_resolver or GraphIdResolver()
        self._logger = logger or IntentRouterLogger()

    def route(
        self,
        user_query: str,
        conversation_state: ConversationState | None = None,
        repo_context: RepoContext | None = None,
    ) -> IntentRouterResult:
        """Run the pipeline: classify -> plan -> retrieval spec -> evidence policy."""
        state = conversation_state or ConversationState()
        context = repo_context or RepoContext()
        self._logger.log_request(user_query, state, context)

        decision = self._classifier.classify(user_query, state)
        intent, mode = self._conversation_policy.resolve(decision, user_query, state)
        plan = self._query_plan_builder.build(
            user_query,
            state,
            continue_mode=mode == "CONTINUE",
            conversation_mode=mode,
            intent=intent,
        )
        spec = self._retrieval_factory.build(
            intent,
            plan.anchors,
            context,
            raw_query=plan.raw,
            conversation_state=state,
            conversation_mode=mode,
            sub_intent=plan.sub_intent,
        )
        anchored_by_user = any(anchor.source == "user_text" for anchor in plan.anchors)
        policy = self._evidence_factory.build(
            intent,
            sub_intent=plan.sub_intent,
            negations=plan.negations,
            has_user_anchor=anchored_by_user,
        )
        result = IntentRouterResult(
            intent=intent,
            graph_id=self._graph_resolver.resolve(intent),
            conversation_mode=mode,
            query_plan=plan,
            retrieval_spec=spec,
            evidence_policy=policy,
        )
        self._logger.log_result(result)
        return result

View File

@@ -0,0 +1,23 @@
from __future__ import annotations
class SubIntentDetector:
    """Heuristic sub-intent classification from the raw query surface text."""

    _OPEN_VERBS = ("открой", "посмотри", "проверь", "уточни")
    _EXPLAIN_MARKERS = ("объясни", "как работает", "почему", "что делает", "зачем", "логика", "флоу", "flow")
    _TEST_MARKERS = ("тест", "pytest", "unit test", "юнит")

    def detect(self, raw: str, *, has_file_path: bool, negations: set[str]) -> str:
        """Return "OPEN_FILE", "FIND_TESTS" or the default "EXPLAIN" for *raw*."""
        collapsed = " ".join((raw or "").lower().split())
        if not collapsed:
            return "EXPLAIN"
        # An "open" verb plus a file path means the user wants the file itself —
        # unless an explanation marker shows they want it explained.
        wants_open = has_file_path and self._has_open_verb(collapsed) and not self._has_explain_markers(collapsed)
        if wants_open:
            return "OPEN_FILE"
        if "tests" not in negations:
            for marker in self._TEST_MARKERS:
                if marker in collapsed:
                    return "FIND_TESTS"
        return "EXPLAIN"

    def _has_open_verb(self, text: str) -> bool:
        """True when an open-verb starts the text or appears as a whole word."""
        padded = f" {text} "
        for verb in self._OPEN_VERBS:
            if text.startswith(verb) or f" {verb} " in padded:
                return True
        return False

    def _has_explain_markers(self, text: str) -> bool:
        """True when the text carries any explanation marker."""
        for marker in self._EXPLAIN_MARKERS:
            if marker in text:
                return True
        return False

View File

@@ -0,0 +1,51 @@
from __future__ import annotations
# Python keywords, all lower-cased — including the literals "true"/"false"/"none".
# NOTE(review): the lower-cased literals suggest candidate tokens are compared
# after lower-casing; confirm against AnchorExtractor's usage.
PY_KEYWORDS: set[str] = {
    "and",
    "as",
    "assert",
    "async",
    "await",
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "false",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "none",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "true",
    "try",
    "while",
    "with",
    "yield",
}

# Path segments too generic to be meaningful anchors on their own.
COMMON_PATH_SEGMENTS: set[str] = {
    "app",
    "src",
    "docs",
    "tests",
    "module",
    "modules",
    "core",
    "pkg",
    "lib",
}

View File

@@ -0,0 +1,67 @@
from __future__ import annotations
import re
from app.modules.rag.intent_router_v2.normalization_terms import KeyTermCanonicalizer
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё-]+")
class RuEnTermMapper:
    """Expands Russian key terms (and English sources) into English retrieval synonyms."""

    # Canonical Russian term -> English expansion targets.
    _CANONICAL_MAP = {
        "класс": ["class"],
        "метод": ["method"],
        "функция": ["function", "def"],
        "модуль": ["module"],
        "пакет": ["package"],
        "файл": ["file"],
        "тест": ["test", "unit test"],
        "документация": ["documentation", "docs"],
        "readme": ["readme"],
    }
    # English token -> expansion targets (identity-ish mapping for EN queries).
    _ENGLISH_SOURCES = {
        "class": ["class"],
        "method": ["method"],
        "function": ["function", "def"],
        "module": ["module"],
        "package": ["package"],
        "file": ["file"],
        "test": ["test", "unit test"],
        "tests": ["test", "unit test"],
        "documentation": ["documentation", "docs"],
        "docs": ["documentation", "docs"],
        "readme": ["readme"],
        "def": ["def"],
    }

    def __init__(self, canonicalizer: KeyTermCanonicalizer | None = None) -> None:
        self._canonicalizer = canonicalizer or KeyTermCanonicalizer()

    def expand(self, text: str) -> list[str]:
        """Return ordered, de-duplicated expansion terms for every word in *text*."""
        lowered = (text or "").lower()
        expanded: list[str] = []
        for word in _WORD_RE.findall(lowered):
            # Canonicalize inflected Russian forms first, then map both routes.
            base = self._canonicalizer.canonicalize(word) or word
            self._extend(expanded, self._CANONICAL_MAP.get(base, []))
            self._extend(expanded, self._ENGLISH_SOURCES.get(word, []))
        # "unit test" is two words, so the per-word loop above cannot catch it.
        if "unit test" in lowered:
            self._extend(expanded, self._ENGLISH_SOURCES["test"])
        return expanded

    def key_terms(self) -> tuple[str, ...]:
        """Canonical Russian key terms, in declaration order."""
        return tuple(self._CANONICAL_MAP)

    def all_literal_terms(self) -> tuple[str, ...]:
        """Every literal term known to the mapper (aliases, keys and targets), sorted."""
        literal: set[str] = set(self._canonicalizer.aliases())
        literal.update(self._CANONICAL_MAP)
        literal.update(self._ENGLISH_SOURCES)
        for group in (*self._CANONICAL_MAP.values(), *self._ENGLISH_SOURCES.values()):
            literal.update(term.lower() for term in group)
        return tuple(sorted(literal))

    def _extend(self, result: list[str], values: list[str]) -> None:
        """Append *values* to *result*, skipping entries already present."""
        for candidate in values:
            if candidate not in result:
                result.append(candidate)

View File

@@ -0,0 +1,40 @@
from __future__ import annotations
import re
# Matches "не/без/кроме ... [т]ест" within one clause (up to 28 chars between).
# Fix: the original pattern spelled the tail as "\ест" — an unknown escape of a
# non-ASCII letter that CPython happens to treat as the literal "е", but which
# is fragile (the same construct with an ASCII letter is a hard re.error).
# The stray backslash is removed; the matched language is unchanged.
# NOTE(review): the bare "ест" fragment also matches unrelated words such as
# "естественно" — confirm whether "\bтест" was intended.
_NEGATIVE_TEST_RE = re.compile(r"\b(?:не|без|кроме)\b[^.?!]{0,28}ест", re.IGNORECASE)
# Literal phrasings that negate interest in tests.
_NEGATIVE_TEST_MARKERS = ("не про тест", "без тест", "кроме тест", "про прод код", "только прод", "production code")
# Literal phrasings that express interest in tests.
_POSITIVE_TEST_MARKERS = (
    "тест",
    "tests",
    "pytest",
    "unit test",
    "unit tests",
    "тестиру",
)
# Tokens considered test-related for anchor/expansion filtering.
_TEST_TERMS = {"тест", "тесты", "test", "tests", "pytest", "unit", "unit test", "юнит-тест", "юниттест"}


def is_negative_test_request(text: str) -> bool:
    """True when the query explicitly asks to avoid tests (e.g. "не про тесты")."""
    lowered = (text or "").lower()
    if _NEGATIVE_TEST_RE.search(lowered):
        return True
    return any(marker in lowered for marker in _NEGATIVE_TEST_MARKERS)


def has_test_focus(text: str) -> bool:
    """True when the query is about tests and does not negate them."""
    lowered = (text or "").lower()
    # A negation overrides any positive test mention.
    if is_negative_test_request(lowered):
        return False
    return any(marker in lowered for marker in _POSITIVE_TEST_MARKERS)


def is_test_related_token(value: str) -> bool:
    """True when a single token/term is test-related (exact term or test*/тест* prefix)."""
    lowered = (value or "").lower().strip()
    if not lowered:
        return False
    if lowered in _TEST_TERMS:
        return True
    if lowered.startswith("test"):
        return True
    return lowered.startswith("тест")