# Source: agent/app/modules/rag/intent_router_v2/query_plan_builder.py
# (224 lines, 9.5 KiB, Python)

from __future__ import annotations
from app.modules.rag.intent_router_v2.anchor_extractor import AnchorExtractor
from app.modules.rag.intent_router_v2.anchor_span_validator import AnchorSpanValidator
from app.modules.rag.intent_router_v2.conversation_anchor_builder import ConversationAnchorBuilder
from app.modules.rag.intent_router_v2.keyword_hint_builder import KeywordHintBuilder
from app.modules.rag.intent_router_v2.keyword_hint_sanitizer import KeywordHintSanitizer
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor, QueryPlan
from app.modules.rag.intent_router_v2.negation_detector import NegationDetector
from app.modules.rag.intent_router_v2.normalization import QueryNormalizer
from app.modules.rag.intent_router_v2.sub_intent_detector import SubIntentDetector
from app.modules.rag.intent_router_v2.test_signals import has_test_focus, is_negative_test_request, is_test_related_token
from app.modules.rag.intent_router_v2.term_mapping import RuEnTermMapper
class QueryPlanBuilder:
    """Build a ``QueryPlan`` from a raw user query and the conversation state.

    Each processing step (normalization, anchor extraction, span validation,
    term mapping, keyword hints, conversation carry-over, sub-intent and
    negation detection) is delegated to an injectable collaborator. This class
    orchestrates them and applies a few intent-specific rules on top.
    """

    # Russian "why"-style markers ("why", "what for", "from where",
    # "because of what"); matched as substrings of the lowercased query.
    _WHY_MARKERS = ("почему", "зачем", "откуда", "из-за чего")
    # Russian follow-up markers ("what next", "and what now", "continue").
    _NEXT_STEP_MARKERS = ("что дальше", "дальше что", "и что теперь", "продолжай")

    def __init__(
        self,
        normalizer: QueryNormalizer | None = None,
        extractor: AnchorExtractor | None = None,
        mapper: RuEnTermMapper | None = None,
        keyword_hints: KeywordHintBuilder | None = None,
        keyword_hint_sanitizer: KeywordHintSanitizer | None = None,
        carryover: ConversationAnchorBuilder | None = None,
        span_validator: AnchorSpanValidator | None = None,
        sub_intent_detector: SubIntentDetector | None = None,
        negation_detector: NegationDetector | None = None,
    ) -> None:
        """Store the collaborators, creating a default instance for each one omitted."""
        self._normalizer = normalizer or QueryNormalizer()
        self._extractor = extractor or AnchorExtractor()
        self._mapper = mapper or RuEnTermMapper()
        self._keyword_hints_builder = keyword_hints or KeywordHintBuilder()
        self._keyword_hint_sanitizer = keyword_hint_sanitizer or KeywordHintSanitizer()
        self._carryover = carryover or ConversationAnchorBuilder()
        self._span_validator = span_validator or AnchorSpanValidator()
        self._sub_intent_detector = sub_intent_detector or SubIntentDetector()
        self._negation_detector = negation_detector or NegationDetector()

    def build(
        self,
        user_query: str,
        conversation_state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str = "START",
        intent: str = "PROJECT_MISC",
    ) -> QueryPlan:
        """Produce the query plan for one user message.

        Args:
            user_query: Raw user text; a falsy value is treated as ``""``.
            conversation_state: State of the ongoing conversation, used for
                anchor carry-over and keyword-hint fallbacks.
            continue_mode: Forwarded to the carry-over builder.
            conversation_mode: Conversation mode label; ``"SWITCH"`` combined
                with ``intent == "DOCS_QA"`` enables inheriting the active symbol.
            intent: Routed intent label (e.g. ``"CODE_QA"``, ``"DOCS_QA"``).

        Returns:
            A ``QueryPlan`` with the raw and normalized text, sub-intent,
            sorted negations, expansions, keyword hints and anchors.
        """
        raw = user_query or ""
        normalized = self._normalizer.normalize(raw)
        # Normalization may yield nothing for a non-blank query; keep the raw text then.
        if not normalized and raw.strip():
            normalized = raw
        negations = self._negation_detector.detect(normalized)
        # Anchors are extracted from the raw text, so spans are validated against its length.
        user_anchors = self._span_validator.sanitize(self._extractor.extract(raw), len(raw))
        has_file_path = any(
            anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in user_anchors
        )
        sub_intent = self._sub_intent_detector.detect(raw, has_file_path=has_file_path, negations=negations)
        merged_anchors = self._merge_anchors(
            raw,
            user_anchors,
            conversation_state,
            continue_mode,
            conversation_mode=conversation_mode,
            intent=intent,
        )
        skip_tests = "tests" in negations or is_negative_test_request(raw)
        cleaned_anchors = self._remove_negated_test_terms(skip_tests, merged_anchors)
        sub_intent = self._resolve_sub_intent(
            sub_intent, raw, cleaned_anchors, intent=intent, negations=negations
        )
        # Documentation questions always use the plain EXPLAIN sub-intent.
        if intent == "DOCS_QA":
            sub_intent = "EXPLAIN"
        expansions = self._expansions(normalized, cleaned_anchors, skip_tests=skip_tests)
        keyword_hints = self._keyword_hints(
            raw,
            normalized,
            cleaned_anchors,
            skip_tests=skip_tests,
            intent=intent,
            state=conversation_state,
        )
        return QueryPlan(
            raw=raw,
            normalized=normalized,
            sub_intent=sub_intent,
            negations=sorted(negations),
            expansions=expansions,
            keyword_hints=keyword_hints,
            anchors=cleaned_anchors,
        )

    def _merge_anchors(
        self,
        raw: str,
        anchors: list[QueryAnchor],
        state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str,
        intent: str,
    ) -> list[QueryAnchor]:
        """Combine user anchors with anchors inherited from the conversation.

        User anchors come first, followed by whatever the carry-over builder
        returns. On a ``"SWITCH"`` into ``"DOCS_QA"`` where the user named
        neither a file nor a symbol, the state's active symbol (if any) is
        added as a ``conversation_state`` anchor. The result is de-duplicated.
        """
        has_user_symbol = any(anchor.type == "SYMBOL" and anchor.source == "user_text" for anchor in anchors)
        has_user_file = any(anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in anchors)
        # Copy so the append below never mutates a list owned by the carry-over builder.
        inherited = list(
            self._carryover.build(
                raw,
                state,
                continue_mode=continue_mode,
                has_user_symbol=has_user_symbol,
                has_user_file_path=has_user_file,
            )
        )
        if (
            conversation_mode == "SWITCH"
            and intent == "DOCS_QA"
            and not has_user_file
            and not has_user_symbol
            and state.active_symbol
        ):
            inherited.append(
                QueryAnchor(
                    type="SYMBOL",
                    value=state.active_symbol,
                    source="conversation_state",
                    span=None,
                    confidence=0.62,
                )
            )
        return self._dedupe([*anchors, *inherited])

    def _expansions(self, normalized: str, anchors: list[QueryAnchor], *, skip_tests: bool) -> list[str]:
        """Return up to 16 expansion terms for the normalized query.

        Starts from the term mapper's output, drops the generic ``def`` /
        ``class`` terms when a symbol anchor is present, adds test terms when
        the query has test focus, appends symbol anchor values, and finally
        removes test-related terms when ``skip_tests`` is set.
        """
        # Copy so the appends below never mutate a list owned by the mapper.
        values = list(self._mapper.expand(normalized))
        has_symbol = any(anchor.type == "SYMBOL" for anchor in anchors)
        if has_symbol:
            values = [value for value in values if value.lower() not in {"def", "class"}]
        if not skip_tests and has_test_focus(normalized):
            for candidate in ("test", "unit test"):
                if candidate not in values:
                    values.append(candidate)
        for anchor in anchors:
            if anchor.type == "SYMBOL" and anchor.value not in values:
                values.append(anchor.value)
        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        return values[:16]

    def _keyword_hints(
        self,
        raw: str,
        normalized: str,
        anchors: list[QueryAnchor],
        *,
        skip_tests: bool,
        intent: str,
        state: ConversationState,
    ) -> list[str]:
        """Return sanitized keyword hints for retrieval.

        Hints from the keyword-hint builder are extended with the values of
        ``FILE_PATH`` and ``SYMBOL`` anchors, optionally stripped of
        test-related tokens, then passed through the sanitizer. For
        ``"DOCS_QA"`` with no surviving hints, the first three unique
        expansions plus the state's active symbol are used instead.
        """
        # Copy so the appends below never mutate a list owned by the hint builder.
        values = list(self._keyword_hints_builder.build(normalized))
        for anchor in anchors:
            if anchor.type not in {"FILE_PATH", "SYMBOL"}:
                continue
            candidate = anchor.value
            if candidate not in values:
                values.append(candidate)
        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        sanitized = self._keyword_hint_sanitizer.sanitize(raw, anchors, values)
        if intent == "DOCS_QA" and not sanitized:
            # dict.fromkeys removes duplicates while keeping first-seen order.
            fallback = list(dict.fromkeys(self._expansions(normalized, anchors, skip_tests=skip_tests)))
            sanitized = fallback[:3]
            if state.active_symbol and state.active_symbol not in sanitized:
                sanitized.append(state.active_symbol)
            # Defensive cap; the fallback currently holds at most four entries.
            sanitized = sanitized[:5]
        return sanitized

    def _remove_negated_test_terms(self, skip_tests: bool, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Drop test-related ``KEY_TERM`` / ``SYMBOL`` anchors when tests are excluded.

        When ``skip_tests`` is false the input list is returned unchanged (same
        object). Anchors of other types are always kept.
        """
        if not skip_tests:
            return anchors
        result: list[QueryAnchor] = []
        for anchor in anchors:
            if anchor.type not in {"KEY_TERM", "SYMBOL"}:
                result.append(anchor)
                continue
            if is_test_related_token(anchor.value):
                continue
            result.append(anchor)
        return result

    def _dedupe(self, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Return anchors with duplicates removed, keeping first occurrences in order.

        Two anchors are duplicates when type, value, subtype and source all
        match, so the same value coming from different sources is kept twice.
        """
        result: list[QueryAnchor] = []
        seen: set[tuple[str, str, str | None, str]] = set()
        for anchor in anchors:
            key = (anchor.type, anchor.value, anchor.subtype, anchor.source)
            if key in seen:
                continue
            seen.add(key)
            result.append(anchor)
        return result

    def _resolve_sub_intent(
        self,
        candidate: str,
        raw: str,
        anchors: list[QueryAnchor],
        *,
        intent: str,
        negations: set[str],
    ) -> str:
        """Refine an ``"EXPLAIN"`` candidate into ``"EXPLAIN_LOCAL"`` for code questions.

        The candidate is returned untouched unless it is ``"EXPLAIN"`` and the
        intent is ``"CODE_QA"``. It becomes ``"EXPLAIN_LOCAL"`` when:

        * the query contains a "why" marker and both a confident file anchor
          and a confident symbol anchor are present; or
        * the query is a follow-up ("what next" marker, or at most four words
          ending in ``?``) and a confident file anchor is present; or
        * tests are negated, no anchor comes from the user text, and a
          confident file or symbol anchor is present.

        "Confident" means ``confidence >= 0.6``.
        """
        if candidate != "EXPLAIN":
            return candidate
        if intent != "CODE_QA":
            return candidate
        # Lowercase and collapse whitespace so multi-word markers match reliably.
        text = " ".join((raw or "").lower().split())
        has_symbol = any(anchor.type == "SYMBOL" and anchor.confidence >= 0.6 for anchor in anchors)
        has_file = any(
            anchor.type == "FILE_PATH" and self._looks_like_file(anchor.value) and anchor.confidence >= 0.6
            for anchor in anchors
        )
        has_user_anchor = any(anchor.source == "user_text" for anchor in anchors)
        is_why = any(marker in text for marker in self._WHY_MARKERS)
        is_next_steps = any(marker in text for marker in self._NEXT_STEP_MARKERS)
        is_short_generic = len(text.split()) <= 4 and text.endswith("?")
        if (is_why and has_file and has_symbol) or ((is_next_steps or is_short_generic) and has_file):
            return "EXPLAIN_LOCAL"
        if "tests" in negations and not has_user_anchor and (has_file or has_symbol):
            return "EXPLAIN_LOCAL"
        return candidate

    def _looks_like_file(self, value: str) -> bool:
        """Return True when the last ``/``-separated segment of ``value`` contains a dot."""
        tail = (value or "").rsplit("/", 1)[-1]
        return "." in tail