# agent/app/modules/rag/intent_router_v2/query_plan_builder.py

from __future__ import annotations

from app.modules.rag.intent_router_v2.anchor_extractor import AnchorExtractor
from app.modules.rag.intent_router_v2.anchor_span_validator import AnchorSpanValidator
from app.modules.rag.intent_router_v2.conversation_anchor_builder import ConversationAnchorBuilder
from app.modules.rag.intent_router_v2.keyword_hint_builder import KeywordHintBuilder
from app.modules.rag.intent_router_v2.keyword_hint_sanitizer import KeywordHintSanitizer
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor, QueryPlan
from app.modules.rag.intent_router_v2.negation_detector import NegationDetector
from app.modules.rag.intent_router_v2.normalization import QueryNormalizer
from app.modules.rag.intent_router_v2.sub_intent_detector import SubIntentDetector
from app.modules.rag.intent_router_v2.test_signals import has_test_focus, is_negative_test_request, is_test_related_token
from app.modules.rag.intent_router_v2.term_mapping import RuEnTermMapper


class QueryPlanBuilder:
    """Assemble a ``QueryPlan`` from a raw user query and the conversation state.

    The builder delegates to the collaborators injected in ``__init__``
    (normalizer, anchor extractor, span validator, conversation carry-over,
    negation and sub-intent detectors, term mapper, keyword-hint builder and
    sanitizer) and combines their outputs into a single plan.
    """

    # Russian "why"-style markers: "why", "what for", "from where", "because of what".
    _WHY_MARKERS = ("почему", "зачем", "откуда", "из-за чего")
    # Russian follow-up markers: "what next", "next what", "and what now", "continue".
    _NEXT_STEP_MARKERS = ("что дальше", "дальше что", "и что теперь", "продолжай")

    def __init__(
        self,
        normalizer: QueryNormalizer | None = None,
        extractor: AnchorExtractor | None = None,
        mapper: RuEnTermMapper | None = None,
        keyword_hints: KeywordHintBuilder | None = None,
        keyword_hint_sanitizer: KeywordHintSanitizer | None = None,
        carryover: ConversationAnchorBuilder | None = None,
        span_validator: AnchorSpanValidator | None = None,
        sub_intent_detector: SubIntentDetector | None = None,
        negation_detector: NegationDetector | None = None,
    ) -> None:
        """Store the collaborators, creating a default for each one left as ``None``.

        Only ``None`` selects the default: an injected collaborator that
        happens to be falsy (e.g. it defines ``__len__`` or ``__bool__``) is
        kept instead of being silently replaced.
        """
        self._normalizer = normalizer if normalizer is not None else QueryNormalizer()
        self._extractor = extractor if extractor is not None else AnchorExtractor()
        self._mapper = mapper if mapper is not None else RuEnTermMapper()
        self._keyword_hints_builder = keyword_hints if keyword_hints is not None else KeywordHintBuilder()
        self._keyword_hint_sanitizer = (
            keyword_hint_sanitizer if keyword_hint_sanitizer is not None else KeywordHintSanitizer()
        )
        self._carryover = carryover if carryover is not None else ConversationAnchorBuilder()
        self._span_validator = span_validator if span_validator is not None else AnchorSpanValidator()
        self._sub_intent_detector = (
            sub_intent_detector if sub_intent_detector is not None else SubIntentDetector()
        )
        self._negation_detector = negation_detector if negation_detector is not None else NegationDetector()

    def build(
        self,
        user_query: str,
        conversation_state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str = "START",
        intent: str = "PROJECT_MISC",
    ) -> QueryPlan:
        """Build the query plan for ``user_query``.

        Args:
            user_query: Text typed by the user; ``None``/empty is treated as ``""``.
            conversation_state: State used for anchor carry-over and hint fallback.
            continue_mode: Forwarded to the carry-over builder.
            conversation_mode: Conversation mode label; ``"SWITCH"`` enables the
                extra active-symbol carry-over for ``DOCS_QA``.
            intent: Routed intent label (e.g. ``"CODE_QA"``, ``"DOCS_QA"``).

        Returns:
            A ``QueryPlan`` holding the raw and normalized text, the sub-intent,
            sorted negations, expansions, keyword hints and the final anchors.
        """
        raw = user_query or ""
        normalized = self._normalizer.normalize(raw)
        # Keep the original text when normalization wipes out a non-blank query.
        if not normalized and raw.strip():
            normalized = raw
        negations = self._negation_detector.detect(normalized)
        # Anchors are extracted from the raw text so spans refer to what the user typed.
        user_anchors = self._span_validator.sanitize(self._extractor.extract(raw), len(raw))
        has_file_path = any(
            anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in user_anchors
        )
        sub_intent = self._sub_intent_detector.detect(raw, has_file_path=has_file_path, negations=negations)
        merged_anchors = self._merge_anchors(
            raw,
            user_anchors,
            conversation_state,
            continue_mode,
            conversation_mode=conversation_mode,
            intent=intent,
        )
        skip_tests = "tests" in negations or is_negative_test_request(raw)
        cleaned_anchors = self._remove_negated_test_terms(skip_tests, merged_anchors)
        sub_intent = self._resolve_sub_intent(
            sub_intent, raw, cleaned_anchors, intent=intent, negations=negations
        )
        # Documentation questions always use the plain EXPLAIN sub-intent.
        if intent == "DOCS_QA":
            sub_intent = "EXPLAIN"
        expansions = self._expansions(normalized, cleaned_anchors, skip_tests=skip_tests)
        keyword_hints = self._keyword_hints(
            raw,
            normalized,
            cleaned_anchors,
            skip_tests=skip_tests,
            intent=intent,
            state=conversation_state,
        )
        return QueryPlan(
            raw=raw,
            normalized=normalized,
            sub_intent=sub_intent,
            negations=sorted(negations),
            expansions=expansions,
            keyword_hints=keyword_hints,
            anchors=cleaned_anchors,
        )

    def _merge_anchors(
        self,
        raw: str,
        anchors: list[QueryAnchor],
        state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str,
        intent: str,
    ) -> list[QueryAnchor]:
        """Combine user anchors with anchors inherited from the conversation.

        User anchors come first, followed by the carry-over anchors. When the
        conversation switches to ``DOCS_QA`` and the user named neither a file
        nor a symbol, the active symbol from ``state`` is added as a
        ``conversation_state`` anchor with a fixed confidence of 0.62.
        Duplicates are removed while keeping the first occurrence.
        """
        has_user_symbol = any(anchor.type == "SYMBOL" and anchor.source == "user_text" for anchor in anchors)
        has_user_file = any(anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in anchors)
        # Copy so the append below never mutates a list owned by the carry-over builder.
        inherited = list(
            self._carryover.build(
                raw,
                state,
                continue_mode=continue_mode,
                has_user_symbol=has_user_symbol,
                has_user_file_path=has_user_file,
            )
        )
        if (
            conversation_mode == "SWITCH"
            and intent == "DOCS_QA"
            and not has_user_file
            and not has_user_symbol
            and state.active_symbol
        ):
            inherited.append(
                QueryAnchor(
                    type="SYMBOL",
                    value=state.active_symbol,
                    source="conversation_state",
                    span=None,
                    confidence=0.62,
                )
            )
        return self._dedupe(anchors + inherited)

    def _expansions(self, normalized: str, anchors: list[QueryAnchor], *, skip_tests: bool) -> list[str]:
        """Return at most 16 expansion terms for the normalized query.

        Starts from the term mapper's output, drops ``def``/``class`` when a
        symbol anchor is present, adds ``"test"``/``"unit test"`` for
        test-focused queries (unless tests are negated), appends symbol anchor
        values, and finally removes test-related tokens when ``skip_tests``.
        """
        # Copy so the appends below never mutate a list owned by the mapper.
        values = list(self._mapper.expand(normalized))
        has_symbol = any(anchor.type == "SYMBOL" for anchor in anchors)
        if has_symbol:
            values = [value for value in values if value.lower() not in {"def", "class"}]
        if not skip_tests and has_test_focus(normalized):
            for candidate in ("test", "unit test"):
                if candidate not in values:
                    values.append(candidate)
        for anchor in anchors:
            if anchor.type == "SYMBOL" and anchor.value not in values:
                values.append(anchor.value)
        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        return values[:16]

    def _keyword_hints(
        self,
        raw: str,
        normalized: str,
        anchors: list[QueryAnchor],
        *,
        skip_tests: bool,
        intent: str,
        state: ConversationState,
    ) -> list[str]:
        """Return sanitized keyword hints for retrieval.

        Hints come from the keyword-hint builder plus the values of file-path
        and symbol anchors, optionally stripped of test-related tokens, and
        are then passed through the sanitizer. For ``DOCS_QA`` an empty result
        falls back to the first three unique expansions plus the active symbol
        from ``state``.
        """
        # Copy so the appends below never mutate a list owned by the hint builder.
        values = list(self._keyword_hints_builder.build(normalized))
        for anchor in anchors:
            if anchor.type in {"FILE_PATH", "SYMBOL"} and anchor.value not in values:
                values.append(anchor.value)
        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        sanitized = self._keyword_hint_sanitizer.sanitize(raw, anchors, values)
        if intent != "DOCS_QA" or sanitized:
            return sanitized

        # DOCS_QA fallback: never return an empty hint list if expansions exist.
        unique_expansions = dict.fromkeys(self._expansions(normalized, anchors, skip_tests=skip_tests))
        fallback = list(unique_expansions)[:3]
        active_symbol = state.active_symbol
        if active_symbol and active_symbol not in fallback:
            # Keep the fallback consistent with the rest of the plan: when tests
            # are negated, a test-related symbol must not be re-introduced here.
            if not (skip_tests and is_test_related_token(active_symbol)):
                fallback.append(active_symbol)
        return fallback[:5]

    def _remove_negated_test_terms(self, skip_tests: bool, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Drop test-related ``KEY_TERM``/``SYMBOL`` anchors when tests are negated.

        Returns ``anchors`` unchanged (the same list object) when
        ``skip_tests`` is false; anchors of other types are always kept.
        """
        if not skip_tests:
            return anchors
        return [
            anchor
            for anchor in anchors
            if anchor.type not in {"KEY_TERM", "SYMBOL"} or not is_test_related_token(anchor.value)
        ]

    def _dedupe(self, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Return ``anchors`` without duplicates, preserving first-seen order.

        Two anchors are duplicates when their ``type``, ``value``, ``subtype``
        and ``source`` all match; spans and confidences are not compared.
        """
        result: list[QueryAnchor] = []
        seen: set[tuple[str, str, str | None, str]] = set()
        for anchor in anchors:
            key = (anchor.type, anchor.value, anchor.subtype, anchor.source)
            if key in seen:
                continue
            seen.add(key)
            result.append(anchor)
        return result

    def _resolve_sub_intent(
        self,
        candidate: str,
        raw: str,
        anchors: list[QueryAnchor],
        *,
        intent: str,
        negations: set[str],
    ) -> str:
        """Refine an ``EXPLAIN`` candidate into ``EXPLAIN_LOCAL`` for ``CODE_QA``.

        Any other candidate or intent is returned untouched. The upgrade
        happens when:

        * the query is a "why" question and both a file and a symbol anchor
          (confidence >= 0.6) are present, or
        * the query is a follow-up ("what next") or a short question of at
          most four words ending in ``?`` and a file anchor is present, or
        * tests are negated, no anchor came from the user's text, and a file
          or symbol anchor is present.
        """
        if candidate != "EXPLAIN" or intent != "CODE_QA":
            return candidate
        # Lower-case and collapse whitespace so multi-word markers match reliably.
        text = " ".join((raw or "").lower().split())
        has_symbol = any(anchor.type == "SYMBOL" and anchor.confidence >= 0.6 for anchor in anchors)
        has_file = any(
            anchor.type == "FILE_PATH" and self._looks_like_file(anchor.value) and anchor.confidence >= 0.6
            for anchor in anchors
        )
        has_user_anchor = any(anchor.source == "user_text" for anchor in anchors)
        is_why = any(marker in text for marker in self._WHY_MARKERS)
        is_next_steps = any(marker in text for marker in self._NEXT_STEP_MARKERS)
        is_short_generic = len(text.split()) <= 4 and text.endswith("?")
        if (is_why and has_file and has_symbol) or ((is_next_steps or is_short_generic) and has_file):
            return "EXPLAIN_LOCAL"
        if "tests" in negations and not has_user_anchor and (has_file or has_symbol):
            return "EXPLAIN_LOCAL"
        return candidate

    def _looks_like_file(self, value: str) -> bool:
        """Return True when the last path segment of ``value`` contains a dot.

        Both ``/`` and ``\\`` are treated as separators. Empty segments and
        the relative directory references ``.`` and ``..`` are not files.
        """
        tail = (value or "").replace("\\", "/").rsplit("/", 1)[-1]
        if tail in {"", ".", ".."}:
            return False
        return "." in tail