224 lines
9.5 KiB
Python
224 lines
9.5 KiB
Python
from __future__ import annotations
|
|
|
|
from app.modules.rag.intent_router_v2.anchor_extractor import AnchorExtractor
|
|
from app.modules.rag.intent_router_v2.anchor_span_validator import AnchorSpanValidator
|
|
from app.modules.rag.intent_router_v2.conversation_anchor_builder import ConversationAnchorBuilder
|
|
from app.modules.rag.intent_router_v2.keyword_hint_builder import KeywordHintBuilder
|
|
from app.modules.rag.intent_router_v2.keyword_hint_sanitizer import KeywordHintSanitizer
|
|
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor, QueryPlan
|
|
from app.modules.rag.intent_router_v2.negation_detector import NegationDetector
|
|
from app.modules.rag.intent_router_v2.normalization import QueryNormalizer
|
|
from app.modules.rag.intent_router_v2.sub_intent_detector import SubIntentDetector
|
|
from app.modules.rag.intent_router_v2.test_signals import has_test_focus, is_negative_test_request, is_test_related_token
|
|
from app.modules.rag.intent_router_v2.term_mapping import RuEnTermMapper
|
|
|
|
|
|
class QueryPlanBuilder:
    """Build a ``QueryPlan`` from a raw user query and the conversation state.

    The builder wires together a set of injectable collaborators (normalizer,
    anchor extractor, term mapper, keyword-hint builder and sanitizer,
    conversation carry-over builder, span validator, sub-intent detector and
    negation detector) and combines their outputs in :meth:`build`.

    Lists returned by collaborators are copied before they are extended, so
    the builder never mutates an object that a collaborator may still own
    (for example a cached or shared result).
    """

    # Lower-cased substrings searched for in the whitespace-normalized query.
    # Russian for: "why", "what for", "where from", "because of what".
    _WHY_MARKERS = ("почему", "зачем", "откуда", "из-за чего")
    # Russian for: "what next", "next what", "and what now", "continue".
    _NEXT_STEP_MARKERS = ("что дальше", "дальше что", "и что теперь", "продолжай")

    def __init__(
        self,
        normalizer: QueryNormalizer | None = None,
        extractor: AnchorExtractor | None = None,
        mapper: RuEnTermMapper | None = None,
        keyword_hints: KeywordHintBuilder | None = None,
        keyword_hint_sanitizer: KeywordHintSanitizer | None = None,
        carryover: ConversationAnchorBuilder | None = None,
        span_validator: AnchorSpanValidator | None = None,
        sub_intent_detector: SubIntentDetector | None = None,
        negation_detector: NegationDetector | None = None,
    ) -> None:
        """Store the collaborators, creating a default for each one not given.

        A default instance is also created when a falsy collaborator is
        passed, because the fallback uses ``or``.
        """
        self._normalizer = normalizer or QueryNormalizer()
        self._extractor = extractor or AnchorExtractor()
        self._mapper = mapper or RuEnTermMapper()
        self._keyword_hints_builder = keyword_hints or KeywordHintBuilder()
        self._keyword_hint_sanitizer = keyword_hint_sanitizer or KeywordHintSanitizer()
        self._carryover = carryover or ConversationAnchorBuilder()
        self._span_validator = span_validator or AnchorSpanValidator()
        self._sub_intent_detector = sub_intent_detector or SubIntentDetector()
        self._negation_detector = negation_detector or NegationDetector()

    def build(
        self,
        user_query: str,
        conversation_state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str = "START",
        intent: str = "PROJECT_MISC",
    ) -> QueryPlan:
        """Produce the query plan for ``user_query``.

        Args:
            user_query: Raw user text; ``None`` or empty is treated as ``""``.
            conversation_state: State used to carry anchors over from earlier
                turns and to read ``active_symbol``.
            continue_mode: Forwarded to the carry-over builder.
            conversation_mode: Conversation mode label; ``"SWITCH"`` enables
                the extra symbol carry-over for ``"DOCS_QA"``.
            intent: Routed intent label; ``"DOCS_QA"`` and ``"CODE_QA"`` get
                special handling.

        Returns:
            A ``QueryPlan`` with the raw and normalized text, resolved
            sub-intent, sorted negations, expansions, keyword hints and the
            cleaned anchors.
        """
        raw = user_query or ""
        normalized = self._normalizer.normalize(raw)
        if not normalized and raw.strip():
            # The normalizer produced nothing for a non-blank query: fall back
            # to the raw text instead of planning on an empty string.
            normalized = raw

        negations = self._negation_detector.detect(normalized)
        user_anchors = self._span_validator.sanitize(self._extractor.extract(raw), len(raw))
        has_file_path = any(
            anchor.type == "FILE_PATH" and anchor.source == "user_text"
            for anchor in user_anchors
        )
        sub_intent = self._sub_intent_detector.detect(
            raw,
            has_file_path=has_file_path,
            negations=negations,
        )

        merged_anchors = self._merge_anchors(
            raw,
            user_anchors,
            conversation_state,
            continue_mode,
            conversation_mode=conversation_mode,
            intent=intent,
        )

        skip_tests = "tests" in negations or is_negative_test_request(raw)
        cleaned_anchors = self._remove_negated_test_terms(skip_tests, merged_anchors)

        sub_intent = self._resolve_sub_intent(
            sub_intent,
            raw,
            cleaned_anchors,
            intent=intent,
            negations=negations,
        )
        if intent == "DOCS_QA":
            # Documentation questions always use the plain EXPLAIN sub-intent,
            # whatever the detector reported.
            sub_intent = "EXPLAIN"

        expansions = self._expansions(normalized, cleaned_anchors, skip_tests=skip_tests)
        keyword_hints = self._keyword_hints(
            raw,
            normalized,
            cleaned_anchors,
            skip_tests=skip_tests,
            intent=intent,
            state=conversation_state,
        )
        return QueryPlan(
            raw=raw,
            normalized=normalized,
            sub_intent=sub_intent,
            negations=sorted(negations),
            expansions=expansions,
            keyword_hints=keyword_hints,
            anchors=cleaned_anchors,
        )

    def _merge_anchors(
        self,
        raw: str,
        anchors: list[QueryAnchor],
        state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str,
        intent: str,
    ) -> list[QueryAnchor]:
        """Combine user anchors with anchors inherited from the conversation.

        Inherited anchors come from the carry-over builder. In addition, when
        the conversation switches to ``DOCS_QA`` and the user named neither a
        file nor a symbol, the state's active symbol (if any) is attached as a
        ``conversation_state`` anchor.

        Returns:
            User anchors followed by inherited anchors, de-duplicated while
            preserving order.
        """
        has_user_symbol = any(
            anchor.type == "SYMBOL" and anchor.source == "user_text" for anchor in anchors
        )
        has_user_file = any(
            anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in anchors
        )
        # Copy: the carry-over builder may return a list it still owns, and we
        # may append to it below.
        inherited = list(
            self._carryover.build(
                raw,
                state,
                continue_mode=continue_mode,
                has_user_symbol=has_user_symbol,
                has_user_file_path=has_user_file,
            )
        )
        if (
            conversation_mode == "SWITCH"
            and intent == "DOCS_QA"
            and not has_user_file
            and not has_user_symbol
            and state.active_symbol
        ):
            inherited.append(
                QueryAnchor(
                    type="SYMBOL",
                    value=state.active_symbol,
                    source="conversation_state",
                    span=None,
                    confidence=0.62,
                )
            )
        return self._dedupe(anchors + inherited)

    def _expansions(self, normalized: str, anchors: list[QueryAnchor], *, skip_tests: bool) -> list[str]:
        """Return at most 16 expansion terms for the normalized query.

        Starts from the term mapper's expansions, drops ``def`` / ``class``
        when a symbol anchor is present, adds test terms when the query has
        test focus (unless tests are negated), appends symbol anchor values,
        and finally strips test-related tokens when ``skip_tests`` is set.
        """
        # Copy so appends below never leak into a list owned by the mapper.
        values = list(self._mapper.expand(normalized))

        if any(anchor.type == "SYMBOL" for anchor in anchors):
            values = [value for value in values if value.lower() not in {"def", "class"}]

        if not skip_tests and has_test_focus(normalized):
            for candidate in ("test", "unit test"):
                if candidate not in values:
                    values.append(candidate)

        for anchor in anchors:
            if anchor.type == "SYMBOL" and anchor.value not in values:
                values.append(anchor.value)

        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        return values[:16]

    def _keyword_hints(
        self,
        raw: str,
        normalized: str,
        anchors: list[QueryAnchor],
        *,
        skip_tests: bool,
        intent: str,
        state: ConversationState,
    ) -> list[str]:
        """Return sanitized keyword hints for retrieval.

        Hints are the builder's hints for the normalized query plus every
        ``FILE_PATH`` / ``SYMBOL`` anchor value, minus test-related tokens
        when ``skip_tests`` is set, passed through the sanitizer. For
        ``DOCS_QA`` with an empty sanitized result, the first three unique
        expansions plus the active symbol are used instead.
        """
        # Copy so anchor values are never appended to a list the hint builder
        # may cache or share between calls.
        values = list(self._keyword_hints_builder.build(normalized))
        for anchor in anchors:
            if anchor.type in {"FILE_PATH", "SYMBOL"} and anchor.value not in values:
                values.append(anchor.value)

        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]

        sanitized = self._keyword_hint_sanitizer.sanitize(raw, anchors, values)
        if intent == "DOCS_QA" and not sanitized:
            # dict.fromkeys de-duplicates while keeping first-seen order.
            fallback = list(
                dict.fromkeys(self._expansions(normalized, anchors, skip_tests=skip_tests))
            )
            sanitized = fallback[:3]
            if state.active_symbol and state.active_symbol not in sanitized:
                sanitized.append(state.active_symbol)
            sanitized = sanitized[:5]
        return sanitized

    def _remove_negated_test_terms(self, skip_tests: bool, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Drop test-related ``KEY_TERM`` / ``SYMBOL`` anchors when tests are negated.

        When ``skip_tests`` is false the input list itself is returned
        unchanged; otherwise a new filtered list is returned. Anchors of any
        other type are always kept.
        """
        if not skip_tests:
            return anchors
        return [
            anchor
            for anchor in anchors
            if anchor.type not in {"KEY_TERM", "SYMBOL"} or not is_test_related_token(anchor.value)
        ]

    def _dedupe(self, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Return anchors with duplicates removed, keeping first occurrences.

        Two anchors are duplicates when their ``(type, value, subtype,
        source)`` tuples are equal; span and confidence are ignored.
        """
        result: list[QueryAnchor] = []
        seen: set[tuple[str, str, str | None, str]] = set()
        for anchor in anchors:
            key = (anchor.type, anchor.value, anchor.subtype, anchor.source)
            if key in seen:
                continue
            seen.add(key)
            result.append(anchor)
        return result

    def _resolve_sub_intent(
        self,
        candidate: str,
        raw: str,
        anchors: list[QueryAnchor],
        *,
        intent: str,
        negations: set[str],
    ) -> str:
        """Refine an ``EXPLAIN`` sub-intent to ``EXPLAIN_LOCAL`` for code questions.

        Only ``candidate == "EXPLAIN"`` with ``intent == "CODE_QA"`` is ever
        changed. It becomes ``"EXPLAIN_LOCAL"`` when either:

        * the query contains a "why" marker and both a file anchor and a
          symbol anchor are present, or
        * the query contains a "next step" marker, or is a short question
          (at most four words ending in ``?``), and a file anchor is present,
          or
        * tests are negated, no anchor came from the user's text, and a file
          or symbol anchor is present.

        Symbol and file anchors only count with ``confidence >= 0.6``; file
        anchors must also look like a file name (see :meth:`_looks_like_file`).
        """
        if candidate != "EXPLAIN" or intent != "CODE_QA":
            return candidate

        # Lower-case and collapse whitespace so multi-word markers match.
        text = " ".join((raw or "").lower().split())
        has_symbol = any(
            anchor.type == "SYMBOL" and anchor.confidence >= 0.6 for anchor in anchors
        )
        has_file = any(
            anchor.type == "FILE_PATH"
            and self._looks_like_file(anchor.value)
            and anchor.confidence >= 0.6
            for anchor in anchors
        )
        has_user_anchor = any(anchor.source == "user_text" for anchor in anchors)
        is_why = any(marker in text for marker in self._WHY_MARKERS)
        is_next_steps = any(marker in text for marker in self._NEXT_STEP_MARKERS)
        is_short_generic = len(text.split()) <= 4 and text.endswith("?")

        if (is_why and has_file and has_symbol) or ((is_next_steps or is_short_generic) and has_file):
            return "EXPLAIN_LOCAL"
        if "tests" in negations and not has_user_anchor and (has_file or has_symbol):
            return "EXPLAIN_LOCAL"
        return candidate

    def _looks_like_file(self, value: str) -> bool:
        """Return True when the last ``/``-separated segment of ``value`` contains a dot.

        ``None`` or an empty string yields False.
        """
        tail = (value or "").rsplit("/", 1)[-1]
        return "." in tail
|