Фиксация изменений
This commit is contained in:
23
app/modules/rag/intent_router_v2/__init__.py
Normal file
23
app/modules/rag/intent_router_v2/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""Public API surface of the intent_router_v2 package."""

from app.modules.rag.intent_router_v2.factory import GigaChatIntentRouterFactory
from app.modules.rag.intent_router_v2.local_runner import IntentRouterScenarioRunner
from app.modules.rag.intent_router_v2.models import (
    ConversationState,
    IntentDecision,
    IntentRouterResult,
    QueryAnchor,
    QueryPlan,
    RepoContext,
)
from app.modules.rag.intent_router_v2.router import IntentRouterV2

# Names re-exported for package consumers (alphabetical).
__all__ = [
    "ConversationState",
    "GigaChatIntentRouterFactory",
    "IntentDecision",
    "IntentRouterResult",
    "IntentRouterScenarioRunner",
    "IntentRouterV2",
    "QueryAnchor",
    "QueryPlan",
    "RepoContext",
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
144
app/modules/rag/intent_router_v2/anchor_extractor.py
Normal file
144
app/modules/rag/intent_router_v2/anchor_extractor.py
Normal file
@@ -0,0 +1,144 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import AnchorSpan, QueryAnchor
|
||||
from app.modules.rag.intent_router_v2.normalization_terms import KeyTermCanonicalizer
|
||||
from app.modules.rag.intent_router_v2.symbol_rules import COMMON_PATH_SEGMENTS, PY_KEYWORDS
|
||||
from app.modules.rag.intent_router_v2.term_mapping import RuEnTermMapper
|
||||
|
||||
_FILE_PATTERN = re.compile(r"(?P<value>\b(?:[\w.-]+/)*[\w.-]+\.(?:py|md|rst|txt|yaml|yml|json|toml|ini|cfg)\b)")
_PATH_HINT_PATTERN = re.compile(r"(?P<value>\b(?:src|app|docs|tests)/[\w./-]*[\w-]\b)")
_SYMBOL_PATTERN = re.compile(
    r"\b(?P<value>[A-Z][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)+|[A-Z][A-Za-z0-9_]{2,}|[a-z_][A-Za-z0-9_]{2,})\b"
)
_DOC_SECTION_PATTERN = re.compile(r"(?:section|раздел)\s+[\"'`#]?(?P<value>[A-Za-zА-Яа-я0-9_ ./:-]{2,})", re.IGNORECASE)
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё-]+")


class AnchorExtractor:
    """Extracts FILE_PATH / SYMBOL / DOC_REF / KEY_TERM anchors from a user query."""

    def __init__(
        self,
        mapper: RuEnTermMapper | None = None,
        canonicalizer: KeyTermCanonicalizer | None = None,
    ) -> None:
        self._mapper = mapper or RuEnTermMapper()
        self._canonicalizer = canonicalizer or KeyTermCanonicalizer()

    def extract(self, text: str) -> list[QueryAnchor]:
        """Return deduplicated anchors of all four kinds found in *text*."""
        found = self._file_anchors(text)
        # Symbol extraction needs the file anchors to avoid re-reporting path parts.
        found.extend(self._symbol_anchors(text, file_anchors=found))
        found.extend(self._doc_ref_anchors(text))
        found.extend(self._key_term_anchors(text))
        return self._dedupe(found)

    def _file_anchors(self, text: str) -> list[QueryAnchor]:
        """File-like tokens: explicit extensions (0.95) plus src/app/docs/tests path hints (0.8)."""
        found = self._anchors_from_matches(_FILE_PATTERN.finditer(text), anchor_type="FILE_PATH", confidence=0.95)
        found.extend(self._anchors_from_matches(_PATH_HINT_PATTERN.finditer(text), anchor_type="FILE_PATH", confidence=0.8))
        return found

    def _symbol_anchors(self, text: str, *, file_anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Identifier-like tokens, excluding keywords, path fragments and file names."""
        symbols: list[QueryAnchor] = []
        path_ranges = [(a.span.start, a.span.end) for a in file_anchors if a.span is not None]
        segments = self._path_segments(file_anchors)
        for match in _SYMBOL_PATTERN.finditer(text):
            value = match.group("value")
            if value.endswith((".py", ".md")) or "/" in value:
                continue  # path-ish token, already handled by the file patterns
            if self._is_inside_path(match.start("value"), match.end("value"), path_ranges):
                continue
            if self._is_keyword(value):
                continue
            if file_anchors and value.lower() in segments:
                continue  # a bare path component, not a standalone symbol
            symbols.append(self._anchor("SYMBOL", value, match.start("value"), match.end("value"), 0.88, source="user_text"))
        return symbols

    def _doc_ref_anchors(self, text: str) -> list[QueryAnchor]:
        """'section ...' references plus documentation file names (.md/.rst/.txt)."""
        refs = self._anchors_from_matches(_DOC_SECTION_PATTERN.finditer(text), anchor_type="DOC_REF", confidence=0.75, subtype="section")
        for match in _FILE_PATTERN.finditer(text):
            value = match.group("value")
            if not value.lower().endswith((".md", ".rst", ".txt")):
                continue
            refs.append(self._anchor("DOC_REF", value, match.start("value"), match.end("value"), 0.92, subtype="file", source="user_text"))
        return refs

    def _key_term_anchors(self, text: str) -> list[QueryAnchor]:
        """Words that canonicalize to a known key term or literally match the term mapping."""
        literals = set(self._mapper.all_literal_terms())
        terms: list[QueryAnchor] = []
        for token in _WORD_RE.finditer(text):
            word = token.group(0)
            canonical = self._canonicalizer.canonicalize(word)
            if canonical is None and word.lower() not in literals:
                continue
            terms.append(self._anchor("KEY_TERM", canonical or word, token.start(), token.end(), 0.9, source="user_text"))
        return terms

    def _anchors_from_matches(
        self,
        matches,
        *,
        anchor_type: str,
        confidence: float,
        subtype: str | None = None,
    ) -> list[QueryAnchor]:
        """Build one anchor per regex match, reading the 'value' named group."""
        return [
            self._anchor(anchor_type, m.group("value"), m.start("value"), m.end("value"), confidence, subtype=subtype)
            for m in matches
        ]

    def _anchor(
        self,
        anchor_type: str,
        value: str,
        start: int,
        end: int,
        confidence: float,
        subtype: str | None = None,
        source: str = "user_text",
    ) -> QueryAnchor:
        """Construct a QueryAnchor with a concrete span over the raw text."""
        return QueryAnchor(
            type=anchor_type,
            value=value,
            subtype=subtype,
            source=source,
            span=AnchorSpan(start=start, end=end),
            confidence=confidence,
        )

    def _dedupe(self, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Keep the first occurrence of each (type, value, subtype, source)."""
        unique: list[QueryAnchor] = []
        seen: set[tuple[str, str, str | None, str]] = set()
        for anchor in anchors:
            key = (anchor.type, anchor.value, anchor.subtype, anchor.source)
            if key not in seen:
                seen.add(key)
                unique.append(anchor)
        return unique

    def _is_inside_path(self, start: int, end: int, ranges: list[tuple[int, int]]) -> bool:
        """True when [start, end] falls entirely inside any file-path span."""
        return any(left <= start and end <= right for left, right in ranges)

    def _is_keyword(self, token: str) -> bool:
        return token.lower() in PY_KEYWORDS

    def _path_segments(self, anchors: list[QueryAnchor]) -> set[str]:
        """Lower-cased path components of the given anchors plus the common segments."""
        segments: set[str] = set()
        for anchor in anchors:
            segments.update(part for part in re.split(r"[/.]+", anchor.value.lower()) if part)
        return segments | COMMON_PATH_SEGMENTS
|
||||
22
app/modules/rag/intent_router_v2/anchor_span_validator.py
Normal file
22
app/modules/rag/intent_router_v2/anchor_span_validator.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import QueryAnchor
|
||||
|
||||
|
||||
class AnchorSpanValidator:
    """Drops or repairs anchor spans that cannot be trusted against the raw text."""

    def sanitize(self, anchors: list[QueryAnchor], raw_len: int) -> list[QueryAnchor]:
        """Return anchors whose spans are guaranteed to lie inside the raw text.

        Non-user-text anchors lose their span outright; user-text anchors with an
        out-of-bounds span lose the span and half of their confidence.
        """
        sanitized: list[QueryAnchor] = []
        for anchor in anchors:
            if anchor.source != "user_text":
                # Spans only make sense for anchors located in the user text.
                sanitized.append(anchor.model_copy(update={"span": None}))
            elif anchor.span is None:
                sanitized.append(anchor)
            else:
                start, end = int(anchor.span.start), int(anchor.span.end)
                if 0 <= start < end <= raw_len:
                    sanitized.append(anchor)
                else:
                    # Invalid span: strip it and apply a confidence penalty.
                    sanitized.append(
                        anchor.model_copy(update={"span": None, "confidence": max(anchor.confidence * 0.5, 0.0)})
                    )
        return sanitized
|
||||
113
app/modules/rag/intent_router_v2/classifier.py
Normal file
113
app/modules/rag/intent_router_v2/classifier.py
Normal file
@@ -0,0 +1,113 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, IntentDecision
|
||||
from app.modules.rag.intent_router_v2.protocols import TextGenerator
|
||||
from app.modules.rag.intent_router_v2.test_signals import has_test_focus
|
||||
|
||||
_CODE_FILE_PATH_RE = re.compile(
    r"\b(?:[\w.-]+/)*[\w.-]+\.(?:py|js|jsx|ts|tsx|java|kt|go|rb|php|c|cc|cpp|h|hpp|cs|swift|rs)(?!\w)\b",
    re.IGNORECASE,
)


class IntentClassifierV2:
    """Classifies a user query into one of four routing intents.

    Cheap deterministic marker rules run first; the LLM is consulted only when
    they do not fire, and PROJECT_MISC is the final fallback.
    """

    _GENERATE_DOCS_MARKERS = (
        "сгенерируй документац",
        "подготовь документац",
        "создай документац",
        "генерац",
        "generate documentation",
        "write documentation",
    )
    _DOCS_MARKERS = ("документац", "readme", "docs/", ".md", "spec", "runbook", "markdown")
    _CODE_MARKERS = ("по коду", "код", "класс", "метод", "функц", "модул", "пакет", "файл", "block", "блок", "handler", "endpoint")

    def __init__(self, llm: TextGenerator | None = None) -> None:
        # llm is optional: without it only the deterministic rules + fallback run.
        self._llm = llm

    def classify(self, user_query: str, conversation_state: ConversationState) -> IntentDecision:
        """Return an IntentDecision; malformed LLM output never raises here."""
        deterministic = self._deterministic(user_query)
        if deterministic:
            return deterministic
        llm_decision = self._classify_with_llm(user_query, conversation_state)
        if llm_decision:
            return llm_decision
        return IntentDecision(intent="PROJECT_MISC", confidence=0.55, reason="fallback_project_misc")

    def _deterministic(self, user_query: str) -> IntentDecision | None:
        """Marker-based shortcut; None when no rule fires."""
        text = " ".join((user_query or "").lower().split())
        if any(marker in text for marker in self._GENERATE_DOCS_MARKERS):
            return IntentDecision(intent="GENERATE_DOCS_FROM_CODE", confidence=0.97, reason="deterministic_generate_docs")
        if self._looks_like_docs_question(text):
            return IntentDecision(intent="DOCS_QA", confidence=0.9, reason="deterministic_docs")
        if self._looks_like_code_question(user_query, text):
            return IntentDecision(intent="CODE_QA", confidence=0.9, reason="deterministic_code")
        return None

    def _classify_with_llm(self, user_query: str, conversation_state: ConversationState) -> IntentDecision | None:
        """Ask the LLM for an intent; None when no LLM, the call fails, or the reply is unparsable."""
        if self._llm is None:
            return None
        payload = json.dumps(
            {
                "message": user_query,
                "active_intent": conversation_state.active_intent,
                "last_query": conversation_state.last_query,
                "allowed_intents": ["CODE_QA", "DOCS_QA", "GENERATE_DOCS_FROM_CODE", "PROJECT_MISC"],
            },
            ensure_ascii=False,
        )
        try:
            raw = self._llm.generate("rag_intent_router_v2", payload, log_context="rag.intent_router_v2.classify").strip()
        except Exception:
            # Best-effort: classification falls back to PROJECT_MISC upstream.
            return None
        return self._parse(raw)

    def _parse(self, raw: str) -> IntentDecision | None:
        """Parse the LLM reply into an IntentDecision; None for any malformed reply."""
        candidate = self._strip_code_fence(raw)
        try:
            payload = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        if not isinstance(payload, dict):
            # Guard: a bare list/string is valid JSON but not a classification.
            # (The original called payload.get() and crashed with AttributeError.)
            return None
        intent = str(payload.get("intent") or "").strip().upper()
        if intent not in {"CODE_QA", "DOCS_QA", "GENERATE_DOCS_FROM_CODE", "PROJECT_MISC"}:
            return None
        try:
            confidence = float(payload.get("confidence") or 0.7)
        except (TypeError, ValueError):
            # Non-numeric confidence (e.g. "high") must not crash routing.
            confidence = 0.7
        return IntentDecision(
            intent=intent,
            confidence=confidence,
            reason=str(payload.get("reason") or "llm").strip() or "llm",
        )

    def _strip_code_fence(self, text: str) -> str:
        """Remove a surrounding ``` fence when present; otherwise return text unchanged."""
        if not text.startswith("```"):
            return text
        lines = text.splitlines()
        if len(lines) < 3 or lines[-1].strip() != "```":
            return text
        return "\n".join(lines[1:-1]).strip()

    def _looks_like_docs_question(self, text: str) -> bool:
        """Docs markers win only when no code file path is present."""
        if self._has_code_file_path(text):
            return False
        return any(marker in text for marker in self._DOCS_MARKERS)

    def _looks_like_code_question(self, raw_text: str, lowered: str) -> bool:
        """Heuristics: code paths, test focus, code markers, or identifier-like tokens."""
        if self._has_code_file_path(raw_text):
            return True
        if has_test_focus(lowered):
            return True
        if any(marker in lowered for marker in self._DOCS_MARKERS) and not any(marker in lowered for marker in self._CODE_MARKERS):
            return False
        if any(marker in lowered for marker in self._CODE_MARKERS):
            return True
        if re.search(r"\b[A-Z][A-Za-z0-9_]{2,}(?:\.[A-Za-z_][A-Za-z0-9_]*)*\b", raw_text or ""):
            return True
        return bool(re.search(r"\b[a-z_][A-Za-z0-9_]{2,}\(", raw_text or ""))

    def _has_code_file_path(self, text: str) -> bool:
        return bool(_CODE_FILE_PATH_RE.search(text or ""))
|
||||
@@ -0,0 +1,61 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.followup_detector import FollowUpDetector
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor
|
||||
|
||||
|
||||
class ConversationAnchorBuilder:
    """Carries file/symbol anchors over from conversation state on follow-up turns."""

    def __init__(self, followup: FollowUpDetector | None = None) -> None:
        self._followup = followup or FollowUpDetector()

    def build(
        self,
        raw: str,
        state: ConversationState,
        *,
        continue_mode: bool,
        has_user_symbol: bool,
        has_user_file_path: bool,
    ) -> list[QueryAnchor]:
        """Return anchors inherited from *state* for a CONTINUE turn.

        A user-provided file path suppresses all carry-over; a user-provided
        symbol (or a non-follow-up message) suppresses symbol carry-over only.
        """
        if not continue_mode or has_user_file_path:
            return []
        carried = [
            QueryAnchor(
                type="FILE_PATH",
                value=path,
                source="conversation_state",
                span=None,
                confidence=0.6,
            )
            for path in self._paths_for_carryover(state.active_path_scope)
        ]
        if has_user_symbol or not self._followup.is_follow_up(raw):
            return carried
        symbol = state.active_symbol or (
            state.active_code_span_symbols[0] if state.active_code_span_symbols else None
        )
        if symbol:
            carried.append(
                QueryAnchor(
                    type="SYMBOL",
                    value=symbol,
                    source="conversation_state",
                    span=None,
                    confidence=0.64,
                )
            )
        return carried

    def _paths_for_carryover(self, active_path_scope: list[str]) -> list[str]:
        """Pick at most one path from the scope, preferring concrete files over directories."""
        candidates = list(active_path_scope or [])
        files = [path for path in candidates if self._looks_like_file(path)]
        preferred = files or candidates
        return preferred[:1]

    def _looks_like_file(self, value: str) -> bool:
        """A path whose last segment contains a dot is treated as a file."""
        return "." in (value or "").rsplit("/", 1)[-1]
|
||||
45
app/modules/rag/intent_router_v2/conversation_policy.py
Normal file
45
app/modules/rag/intent_router_v2/conversation_policy.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, IntentDecision
|
||||
|
||||
|
||||
class ConversationPolicy:
    """Decides whether a turn continues the active intent or switches to a new one."""

    # Phrases that explicitly announce a topic switch.
    _SWITCH_MARKERS = (
        "теперь",
        "а теперь",
        "давай теперь",
        "переключ",
        "new task",
        "switch to",
        "instead",
    )
    _DOCS_SIGNALS = ("документац", "readme", "docs/", ".md")
    _CODE_SIGNALS = ("по коду", "класс", "метод", "файл", "блок кода", "function", "class")

    def resolve(self, decision: IntentDecision, user_query: str, conversation_state: ConversationState) -> tuple[str, str]:
        """Return (intent, mode) where mode is one of START / CONTINUE / SWITCH."""
        current = conversation_state.active_intent
        if current is None:
            return decision.intent, "START"
        if current == decision.intent:
            return current, "CONTINUE"
        # Different candidate: switch only on an explicit cue or a hard mismatch.
        if self._has_explicit_switch(user_query) or self._is_hard_mismatch(current, decision.intent, user_query):
            return decision.intent, "SWITCH"
        return current, "CONTINUE"

    def _has_explicit_switch(self, user_query: str) -> bool:
        collapsed = " ".join((user_query or "").lower().split())
        return any(marker in collapsed for marker in self._SWITCH_MARKERS)

    def _is_hard_mismatch(self, active_intent: str, candidate_intent: str, user_query: str) -> bool:
        """True when the candidate intent is backed by strong signals in the query."""
        if active_intent == candidate_intent:
            return False
        collapsed = " ".join((user_query or "").lower().split())
        if candidate_intent == "GENERATE_DOCS_FROM_CODE":
            # A generation request always overrides the active topic.
            return True
        if candidate_intent == "DOCS_QA":
            return any(signal in collapsed for signal in self._DOCS_SIGNALS)
        if candidate_intent == "CODE_QA" and active_intent == "DOCS_QA":
            return any(signal in collapsed for signal in self._CODE_SIGNALS)
        return False
|
||||
28
app/modules/rag/intent_router_v2/evidence_policy_factory.py
Normal file
28
app/modules/rag/intent_router_v2/evidence_policy_factory.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import EvidencePolicy
|
||||
|
||||
|
||||
class EvidencePolicyFactory:
    """Builds the EvidencePolicy to enforce before answering for a routed intent."""

    def build(
        self,
        intent: str,
        *,
        sub_intent: str = "EXPLAIN",
        negations: list[str] | None = None,
        has_user_anchor: bool = True,
    ) -> EvidencePolicy:
        """Return the evidence requirements for *intent* / *sub_intent*."""
        blocked = set(negations or [])
        if intent == "CODE_QA":
            if sub_intent == "OPEN_FILE":
                return EvidencePolicy(require_def=False, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
            if sub_intent == "EXPLAIN_LOCAL":
                return EvidencePolicy(require_def=True, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
            if "tests" in blocked and not has_user_anchor:
                # Broad "ignore tests" questions without an anchor: skip the flow requirement.
                return EvidencePolicy(require_def=True, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
            return EvidencePolicy(require_def=True, require_flow=True, require_spec=False, allow_answer_without_evidence=False)
        if intent == "DOCS_QA":
            return EvidencePolicy(require_def=False, require_flow=False, require_spec=True, allow_answer_without_evidence=False)
        if intent == "GENERATE_DOCS_FROM_CODE":
            return EvidencePolicy(require_def=True, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        # PROJECT_MISC (and anything unknown) may answer without retrieved evidence.
        return EvidencePolicy(require_def=False, require_flow=False, require_spec=False, allow_answer_without_evidence=True)
|
||||
22
app/modules/rag/intent_router_v2/factory.py
Normal file
22
app/modules/rag/intent_router_v2/factory.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.agent.llm import AgentLlmService
|
||||
from app.modules.agent.prompt_loader import PromptLoader
|
||||
from app.modules.rag.intent_router_v2.classifier import IntentClassifierV2
|
||||
from app.modules.rag.intent_router_v2.router import IntentRouterV2
|
||||
from app.modules.shared.env_loader import load_workspace_env
|
||||
from app.modules.shared.gigachat.client import GigaChatClient
|
||||
from app.modules.shared.gigachat.settings import GigaChatSettings
|
||||
from app.modules.shared.gigachat.token_provider import GigaChatTokenProvider
|
||||
|
||||
|
||||
class GigaChatIntentRouterFactory:
    """Wires a production IntentRouterV2 backed by a GigaChat LLM client."""

    def build(self) -> IntentRouterV2:
        """Construct the router; GigaChat credentials are read from the workspace env."""
        load_workspace_env()
        settings = GigaChatSettings.from_env()
        token_provider = GigaChatTokenProvider(settings)
        client = GigaChatClient(settings, token_provider)
        llm_service = AgentLlmService(client=client, prompts=PromptLoader())
        return IntentRouterV2(classifier=IntentClassifierV2(llm=llm_service))
|
||||
22
app/modules/rag/intent_router_v2/followup_detector.py
Normal file
22
app/modules/rag/intent_router_v2/followup_detector.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class FollowUpDetector:
    """Heuristic detector for follow-up messages in an ongoing conversation."""

    # Phrases that usually ask for elaboration on the previous answer.
    _MARKERS = (
        "что дальше",
        "почему",
        "зачем",
        "а что",
        "уточни",
        "подробнее",
        "как именно",
        "покажи подробнее",
    )

    def is_follow_up(self, raw: str) -> bool:
        """True for short messages (<= 4 words) or messages with a follow-up marker."""
        collapsed = " ".join((raw or "").lower().split())
        if not collapsed:
            return False
        return len(collapsed.split()) <= 4 or any(marker in collapsed for marker in self._MARKERS)
|
||||
13
app/modules/rag/intent_router_v2/graph_id_resolver.py
Normal file
13
app/modules/rag/intent_router_v2/graph_id_resolver.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class GraphIdResolver:
    """Maps a routed intent to the identifier of the graph that serves it."""

    # Closed mapping: IntentType is a Literal of exactly these four values.
    _GRAPH_MAP = {
        "CODE_QA": "CodeQAGraph",
        "DOCS_QA": "DocsQAGraph",
        "GENERATE_DOCS_FROM_CODE": "GenerateDocsFromCodeGraph",
        "PROJECT_MISC": "ProjectMiscGraph",
    }

    def resolve(self, intent: str) -> str:
        """Return the graph id for *intent*; raises KeyError for an unknown intent."""
        return self._GRAPH_MAP[intent]
|
||||
34
app/modules/rag/intent_router_v2/keyword_hint_builder.py
Normal file
34
app/modules/rag/intent_router_v2/keyword_hint_builder.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from app.modules.rag.intent_router_v2.normalization import FILE_PATH_RE
|
||||
from app.modules.rag.intent_router_v2.symbol_rules import COMMON_PATH_SEGMENTS, PY_KEYWORDS
|
||||
|
||||
_IDENTIFIER_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]{2,}")


class KeywordHintBuilder:
    """Collects identifier- and path-like tokens from a query as retrieval hints."""

    def build(self, text: str) -> list[str]:
        """Return up to 12 unique hints: identifiers first, then lower-cased file paths."""
        source = text or ""
        hints: list[str] = []
        segments = self._path_segments(source)
        for identifier in _IDENTIFIER_RE.findall(source):
            lowered = identifier.lower()
            # Skip language keywords and tokens that are merely path components.
            if lowered in PY_KEYWORDS or lowered in segments:
                continue
            if identifier not in hints:
                hints.append(identifier)
        for match in FILE_PATH_RE.finditer(source):
            path = match.group(0).lower()
            if path not in hints:
                hints.append(path)
        return hints[:12]

    def _path_segments(self, text: str) -> set[str]:
        """Lower-cased components of every file path in *text*, plus the common segments."""
        segments: set[str] = set(COMMON_PATH_SEGMENTS)
        for match in FILE_PATH_RE.finditer(text or ""):
            segments.update(part for part in re.split(r"[/.]+", match.group(0).lower()) if part)
        return segments
|
||||
50
app/modules/rag/intent_router_v2/keyword_hint_sanitizer.py
Normal file
50
app/modules/rag/intent_router_v2/keyword_hint_sanitizer.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import QueryAnchor
|
||||
|
||||
|
||||
class KeywordHintSanitizer:
    """Filters keyword hints down to anchors and non-generic, non-directory tokens."""

    _GENERIC_KEY_TERMS = {"файл", "класс", "метод", "функция", "документация"}
    _DOMAIN_ALLOWLIST = {"RAG", "API", "HTTP", "SQL"}
    _DIR_SCOPE_MARKERS = ("в папке", "в директории", "в каталоге")

    def sanitize(self, raw: str, anchors: list[QueryAnchor], base_hints: list[str]) -> list[str]:
        """Return up to 8 hints: anchor values first, then surviving base hints.

        Directory values are dropped when the user did not explicitly scope the
        question to a directory and concrete file paths are already present.
        """
        lowered_query = (raw or "").lower()
        allow_dirs = any(marker in lowered_query for marker in self._DIR_SCOPE_MARKERS)
        file_paths = [a.value for a in anchors if a.type == "FILE_PATH" and self._looks_like_file(a.value)]
        known_dirs = {path.rsplit("/", 1)[0] for path in file_paths if "/" in path}
        hints: list[str] = []

        for anchor in anchors:
            if anchor.type == "FILE_PATH":
                if self._looks_like_directory(anchor.value) and not allow_dirs and (known_dirs or file_paths):
                    continue
                self._append(hints, anchor.value)
            if anchor.type == "SYMBOL":
                self._append(hints, anchor.value)

        for token in base_hints:
            if token in self._DOMAIN_ALLOWLIST:
                # Domain terms always survive, regardless of the filters below.
                self._append(hints, token)
                continue
            if token.lower() in self._GENERIC_KEY_TERMS:
                continue
            if token in known_dirs and not allow_dirs:
                continue
            if "/" in token and "." not in token and not allow_dirs and file_paths:
                continue
            self._append(hints, token)
        return hints[:8]

    def _append(self, values: list[str], candidate: str) -> None:
        """Append preserving order and uniqueness; ignores empty candidates."""
        if candidate and candidate not in values:
            values.append(candidate)

    def _looks_like_file(self, value: str) -> bool:
        return "." in (value or "").rsplit("/", 1)[-1]

    def _looks_like_directory(self, value: str) -> bool:
        return "/" in (value or "") and not self._looks_like_file(value)
|
||||
29
app/modules/rag/intent_router_v2/layer_query_builder.py
Normal file
29
app/modules/rag/intent_router_v2/layer_query_builder.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import LayerQuery, RepoContext
|
||||
|
||||
|
||||
class LayerQueryBuilder:
    """Selects the retrieval layers (with top_k) for an intent and repo context."""

    def build(self, intent: str, repo_context: RepoContext, *, domains: list[str], layers_map: dict[str, list[tuple[str, int]]]) -> list[LayerQuery]:
        """Return layer queries filtered by domain and repository availability.

        Falls back to the domain-only selection when the availability filter
        would leave nothing to query.
        """
        available = set(repo_context.available_layers or [])
        domain_matched = [
            (layer_id, top_k)
            for layer_id, top_k in layers_map[intent]
            if self._layer_matches_domains(layer_id, domains)
        ]
        selected = [
            LayerQuery(layer_id=layer_id, top_k=top_k)
            for layer_id, top_k in domain_matched
            if not available or layer_id in available
        ]
        if selected:
            return selected
        return [LayerQuery(layer_id=layer_id, top_k=top_k) for layer_id, top_k in domain_matched]

    def _layer_matches_domains(self, layer_id: str, domains: list[str]) -> bool:
        """Layer ids starting with 'C' serve CODE, 'D' serve DOCS; mixed allows both."""
        if domains == ["CODE"]:
            return layer_id.startswith("C")
        if domains == ["DOCS"]:
            return layer_id.startswith("D")
        return layer_id.startswith("C") or layer_id.startswith("D")
|
||||
25
app/modules/rag/intent_router_v2/local_runner.py
Normal file
25
app/modules/rag/intent_router_v2/local_runner.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, IntentRouterResult, RepoContext
|
||||
from app.modules.rag.intent_router_v2.router import IntentRouterV2
|
||||
|
||||
LOGGER = logging.getLogger(__name__)


class IntentRouterScenarioRunner:
    """Replays a scripted list of queries against the router, logging each turn."""

    def __init__(self, router: IntentRouterV2) -> None:
        self._router = router

    def run(self, queries: list[str], repo_context: RepoContext | None = None) -> list[IntentRouterResult]:
        """Route each query in order, threading ConversationState between turns."""
        state = ConversationState()
        context = repo_context or RepoContext()
        results: list[IntentRouterResult] = []
        for index, user_query in enumerate(queries, start=1):
            LOGGER.warning("intent router local input: turn=%s user_query=%s", index, user_query)
            result = self._router.route(user_query, state, context)
            # FIX: pydantic v2 model_dump_json() takes no ensure_ascii kwarg — the
            # original call passed ensure_ascii=False and raised TypeError at runtime.
            # v2 already serializes non-ASCII characters unescaped by default.
            LOGGER.warning("intent router local output: turn=%s result=%s", index, result.model_dump_json())
            results.append(result)
            state = state.advance(result)
        return results
|
||||
22
app/modules/rag/intent_router_v2/logger.py
Normal file
22
app/modules/rag/intent_router_v2/logger.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, IntentRouterResult, RepoContext
|
||||
|
||||
LOGGER = logging.getLogger(__name__)


class IntentRouterLogger:
    """Emits WARNING-level trace lines for router requests and results."""

    def log_request(self, user_query: str, conversation_state: ConversationState, repo_context: RepoContext) -> None:
        """Log the incoming query together with conversation and repo context."""
        LOGGER.warning(
            "intent router v2 request: turn=%s active_intent=%s user_query=%s languages=%s domains=%s",
            conversation_state.turn_index + 1,
            conversation_state.active_intent,
            " ".join((user_query or "").split()),
            repo_context.languages,
            repo_context.available_domains,
        )

    def log_result(self, result: IntentRouterResult) -> None:
        """Log the full routing result as JSON.

        FIX: pydantic v2 model_dump_json() accepts no ensure_ascii kwarg — the
        original call passed ensure_ascii=False and raised TypeError; v2 emits
        non-ASCII text unescaped by default.
        """
        LOGGER.warning("intent router v2 result: %s", result.model_dump_json())
|
||||
182
app/modules/rag/intent_router_v2/models.py
Normal file
182
app/modules/rag/intent_router_v2/models.py
Normal file
@@ -0,0 +1,182 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
||||
|
||||
|
||||
# Closed vocabularies shared by the router models.
IntentType = Literal["CODE_QA", "DOCS_QA", "GENERATE_DOCS_FROM_CODE", "PROJECT_MISC"]
ConversationMode = Literal["START", "CONTINUE", "SWITCH"]
AnchorType = Literal["FILE_PATH", "SYMBOL", "DOC_REF", "KEY_TERM"]
AnchorSource = Literal["user_text", "conversation_state", "heuristic"]

# `inline code` spans and identifier-like tokens in free text.
_INLINE_CODE_RE = re.compile(r"`([^`]*)`")
_CODE_SYMBOL_RE = re.compile(r"\b([A-Za-z_][A-Za-z0-9_]{2,})\b")
|
||||
|
||||
|
||||
class AnchorSpan(BaseModel):
    """Character offsets of an anchor inside the raw user text."""

    model_config = ConfigDict(extra="forbid")

    # Offsets into the raw query string; validated against its length elsewhere.
    start: int = 0
    end: int = 0
|
||||
|
||||
|
||||
class QueryAnchor(BaseModel):
    """A single extracted reference: file path, symbol, doc reference or key term."""

    model_config = ConfigDict(extra="forbid")

    type: AnchorType
    value: str
    source: AnchorSource = "user_text"
    subtype: str | None = None
    # Present only for anchors located directly in the user text.
    span: AnchorSpan | None = None
    confidence: float = 0.0

    @field_validator("confidence")
    @classmethod
    def clamp_confidence(cls, value: float) -> float:
        """Clamp confidence into the [0.0, 1.0] range."""
        return max(0.0, min(1.0, float(value)))
|
||||
|
||||
|
||||
class QueryPlan(BaseModel):
    """Normalized view of the user query prepared for retrieval."""

    model_config = ConfigDict(extra="forbid")

    raw: str
    normalized: str
    sub_intent: str = "EXPLAIN"
    # Terms the user explicitly excluded (e.g. "not tests").
    negations: list[str] = Field(default_factory=list)
    expansions: list[str] = Field(default_factory=list)
    keyword_hints: list[str] = Field(default_factory=list)
    anchors: list[QueryAnchor] = Field(default_factory=list)
|
||||
|
||||
|
||||
class LayerQuery(BaseModel):
    """One retrieval layer to query and how many results to take from it."""

    model_config = ConfigDict(extra="forbid")

    layer_id: str
    top_k: int
|
||||
|
||||
|
||||
class CodeRetrievalFilters(BaseModel):
    """Filters applied when retrieving from code layers."""

    model_config = ConfigDict(extra="forbid")

    # How test files are treated during retrieval; defaults to excluding them.
    test_policy: str = "EXCLUDE"
    path_scope: list[str] = Field(default_factory=list)
    language: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class DocsRetrievalFilters(BaseModel):
    """Filters applied when retrieving from documentation layers."""

    model_config = ConfigDict(extra="forbid")

    path_scope: list[str] = Field(default_factory=list)
    doc_kinds: list[str] = Field(default_factory=list)
    doc_language: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class HybridRetrievalFilters(BaseModel):
    """Union of the code and docs filters, used when both domains are queried."""

    model_config = ConfigDict(extra="forbid")

    test_policy: str = "EXCLUDE"
    path_scope: list[str] = Field(default_factory=list)
    language: list[str] = Field(default_factory=list)
    doc_kinds: list[str] = Field(default_factory=list)
    doc_language: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class RetrievalSpec(BaseModel):
    """Full retrieval instruction: domains, per-layer queries, filters and rerank profile."""

    model_config = ConfigDict(extra="forbid")

    domains: list[str] = Field(default_factory=list)
    layer_queries: list[LayerQuery] = Field(default_factory=list)
    # Filter shape depends on the selected domains; defaults to code-only filters.
    filters: CodeRetrievalFilters | DocsRetrievalFilters | HybridRetrievalFilters = Field(default_factory=CodeRetrievalFilters)
    rerank_profile: str = ""
|
||||
|
||||
|
||||
class EvidencePolicy(BaseModel):
    """Which kinds of evidence an answer must cite before it may be produced."""

    model_config = ConfigDict(extra="forbid")

    # Required evidence categories: definition, control/data flow, specification.
    require_def: bool = False
    require_flow: bool = False
    require_spec: bool = False
    # When True the answer may be produced even with no supporting evidence.
    allow_answer_without_evidence: bool = False
|
||||
|
||||
|
||||
class IntentRouterResult(BaseModel):
    """Complete routing decision for one user turn."""

    model_config = ConfigDict(extra="forbid")

    # Contract version of this result payload.
    schema_version: str = "1.1"
    intent: IntentType
    # Identifier of the processing graph selected for this intent.
    graph_id: str
    conversation_mode: ConversationMode
    query_plan: QueryPlan
    retrieval_spec: RetrievalSpec
    evidence_policy: EvidencePolicy
|
||||
|
||||
|
||||
class ConversationState(BaseModel):
    """Carry-over dialogue context that survives between routing turns."""

    model_config = ConfigDict(extra="forbid")

    # Last resolved intent and domain focus, consulted for follow-up turns.
    active_intent: IntentType | None = None
    active_domain: str | None = None
    # Anchors the user stated explicitly on the previous turn.
    active_anchors: list[QueryAnchor] = Field(default_factory=list)
    active_symbol: str | None = None
    active_path_scope: list[str] = Field(default_factory=list)
    # Identifier-like tokens seen inside backtick spans of the previous raw query.
    active_code_span_symbols: list[str] = Field(default_factory=list)
    last_query: str = ""
    turn_index: int = 0

    def advance(self, result: IntentRouterResult) -> "ConversationState":
        """Return the next-turn state derived from this state plus the turn's *result*."""
        # Only anchors the user actually typed participate in carry-over.
        user_anchors = [anchor for anchor in result.query_plan.anchors if anchor.source == "user_text"]
        symbol_candidates = [anchor.value for anchor in user_anchors if anchor.type == "SYMBOL"]
        has_user_file_anchor = any(anchor.type == "FILE_PATH" for anchor in user_anchors)
        if symbol_candidates:
            # The most recently mentioned symbol wins.
            active_symbol = symbol_candidates[-1]
        elif has_user_file_anchor:
            # A fresh file reference without a symbol resets the symbol focus.
            active_symbol = None
        else:
            active_symbol = self.active_symbol
        raw_code_symbols = _extract_code_symbols(result.query_plan.raw)
        # Keep the previous code-span symbols when this turn introduced none.
        active_code_span_symbols = raw_code_symbols or list(self.active_code_span_symbols)
        # Filters may be any of the three filter shapes; not all carry path_scope.
        path_scope = list(getattr(result.retrieval_spec.filters, "path_scope", []) or [])
        active_domains = list(result.retrieval_spec.domains or [])
        # Domain focus updates only when the turn resolved to exactly one domain.
        active_domain = active_domains[0] if len(active_domains) == 1 else self.active_domain
        return ConversationState(
            active_intent=result.intent,
            active_domain=active_domain,
            active_anchors=list(user_anchors),
            active_symbol=active_symbol,
            active_path_scope=path_scope or list(self.active_path_scope),
            active_code_span_symbols=active_code_span_symbols,
            last_query=result.query_plan.raw,
            turn_index=self.turn_index + 1,
        )
|
||||
|
||||
|
||||
class RepoContext(BaseModel):
    """Static facts about the indexed repository used to constrain routing."""

    model_config = ConfigDict(extra="forbid")

    languages: list[str] = Field(default_factory=list)
    # Domains/layers that actually exist in this repo's index.
    available_domains: list[str] = Field(default_factory=lambda: ["CODE", "DOCS"])
    available_layers: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class IntentDecision(BaseModel):
    """Raw classifier output: an intent plus how confident the classifier is and why."""

    model_config = ConfigDict(extra="forbid")

    intent: IntentType
    confidence: float = 0.0
    reason: str = ""

    @field_validator("confidence")
    @classmethod
    def clamp_confidence(cls, value: float) -> float:
        # Force confidence into [0.0, 1.0] regardless of what the producer sent.
        return max(0.0, min(1.0, float(value)))
|
||||
|
||||
|
||||
def _extract_code_symbols(raw: str) -> list[str]:
    """Collect unique identifier-like tokens found inside inline code spans of *raw*."""
    found: dict[str, None] = {}
    for span_match in _INLINE_CODE_RE.finditer(raw or ""):
        inline = span_match.group(1)
        for candidate in _CODE_SYMBOL_RE.findall(inline):
            # dict insertion order gives first-seen ordering with O(1) dedupe.
            found.setdefault(candidate, None)
    # Cap the result so a pathological query cannot flood downstream consumers.
    return list(found)[:8]
|
||||
17
app/modules/rag/intent_router_v2/negation_detector.py
Normal file
17
app/modules/rag/intent_router_v2/negation_detector.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
_TEST_NEG_RE = re.compile(
|
||||
r"(?:не\s+про\s+тест|без\s+тест|кроме\s+тест|про\s+прод\s+код|только\s+прод|production\s+code)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
class NegationDetector:
|
||||
def detect(self, text: str) -> set[str]:
|
||||
lowered = (text or "").lower()
|
||||
negations: set[str] = set()
|
||||
if _TEST_NEG_RE.search(lowered):
|
||||
negations.add("tests")
|
||||
return negations
|
||||
57
app/modules/rag/intent_router_v2/normalization.py
Normal file
57
app/modules/rag/intent_router_v2/normalization.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from __future__ import annotations

import re

# Inline Markdown code span, e.g. `foo.bar`; protected from normalization.
CODE_SPAN_RE = re.compile(r"`[^`]*`")
# Relative or absolute file path ending in a known source/config/doc extension.
FILE_PATH_RE = re.compile(
    r"(?<!\w)(?:/|\./|\.\./)?(?:[\w.-]+/)*[\w.-]+\.(?:py|md|txt|rst|yaml|yml|json|toml|ini)(?!\w)",
    re.IGNORECASE,
)
# Dotted identifier such as package.module.attr.
DOTTED_IDENT_RE = re.compile(r"(?<!\w)(?:[A-Za-z_][A-Za-z0-9_]*\.)+[A-Za-z_][A-Za-z0-9_]*(?!\w)")
# CamelCase identifier with at least two humps.
CAMEL_RE = re.compile(r"(?<!\w)[A-Z][a-z0-9]+(?:[A-Z][a-z0-9]+)+(?!\w)")
# snake_case identifier with at least one underscore-joined part.
SNAKE_RE = re.compile(r"(?<!\w)[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?!\w)")
# Whitespace immediately before punctuation ("word ,") — to be removed.
SPACE_BEFORE_PUNCT_RE = re.compile(r"\s+([,.:;?!])")
# Punctuation glued to a following quote/bracket/letter — a space gets inserted after it.
SPACE_AFTER_PUNCT_RE = re.compile(r"([,.:;?!])(?=(?:[\"'(\[A-Za-zА-ЯЁа-яё]))")
# Any whitespace run, collapsed to a single space.
WS_RE = re.compile(r"\s+")
# Maps typographic («», "", '') quotes to plain ASCII quotes.
QUOTE_TRANSLATION = str.maketrans({"«": '"', "»": '"', "“": '"', "”": '"', "‘": "'", "’": "'"})
|
||||
|
||||
|
||||
class QueryNormalizer:
    """Builds a deterministic canonical form from raw text without enrichment."""

    def normalize(self, raw: str) -> str:
        """Return *raw* with collapsed whitespace, ASCII quotes and tidy punctuation spacing.

        Code spans, file paths and identifiers are shielded first so the
        punctuation/whitespace rewrites cannot corrupt them.
        """
        vault = _ProtectedText()
        canonical = self._shield(raw or "", vault)
        canonical = self._squash_spaces(canonical)
        canonical = canonical.translate(QUOTE_TRANSLATION)
        canonical = SPACE_BEFORE_PUNCT_RE.sub(r"\1", canonical)
        canonical = SPACE_AFTER_PUNCT_RE.sub(r"\1 ", canonical)
        canonical = self._squash_spaces(canonical)
        return vault.restore(canonical)

    def _shield(self, text: str, vault: "_ProtectedText") -> str:
        # Swap protected fragments for placeholders before any rewriting happens.
        for pattern in (CODE_SPAN_RE, FILE_PATH_RE, DOTTED_IDENT_RE, CAMEL_RE, SNAKE_RE):
            text = pattern.sub(vault.replace, text)
        return text

    def _squash_spaces(self, text: str) -> str:
        return WS_RE.sub(" ", text).strip()
|
||||
|
||||
|
||||
class _ProtectedText:
    """Stash for regex matches: swaps them for placeholders, later swaps them back."""

    def __init__(self) -> None:
        self._index = 0
        self._items: dict[str, str] = {}

    def replace(self, match: re.Match[str]) -> str:
        # Placeholder format is stable so restore() can locate it verbatim later.
        token = f"@@P{self._index}@@"
        self._index += 1
        self._items[token] = match.group(0)
        return token

    def restore(self, text: str) -> str:
        result = text
        for token, original in self._items.items():
            result = result.replace(token, original)
        return result
|
||||
48
app/modules/rag/intent_router_v2/normalization_terms.py
Normal file
48
app/modules/rag/intent_router_v2/normalization_terms.py
Normal file
@@ -0,0 +1,48 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class KeyTermCanonicalizer:
|
||||
_ALIASES: dict[str, set[str]] = {
|
||||
"файл": {
|
||||
"файл",
|
||||
"файла",
|
||||
"файле",
|
||||
"файлу",
|
||||
"файлом",
|
||||
"файлы",
|
||||
"файлов",
|
||||
"файлам",
|
||||
"файлами",
|
||||
},
|
||||
"класс": {"класс", "класса", "классе", "классу", "классом", "классы", "классов", "классам"},
|
||||
"функция": {"функция", "функции", "функцию", "функцией", "функциях"},
|
||||
"метод": {"метод", "метода", "методе", "методу", "методом", "методы"},
|
||||
"документация": {"документация", "документации", "документацию"},
|
||||
"тест": {"тест", "тесты", "тестов", "тестам", "тестами", "юнит-тест", "юниттест"},
|
||||
"модуль": {"модуль", "модуля"},
|
||||
"пакет": {"пакет"},
|
||||
}
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._token_to_canonical = self._build_index()
|
||||
|
||||
def canonicalize(self, token: str) -> str | None:
|
||||
return self._token_to_canonical.get((token or "").lower())
|
||||
|
||||
def aliases(self) -> set[str]:
|
||||
values: set[str] = set()
|
||||
for forms in self._ALIASES.values():
|
||||
values.update(forms)
|
||||
return values
|
||||
|
||||
def is_test_term(self, token: str) -> bool:
|
||||
canonical = self.canonicalize(token)
|
||||
return canonical == "тест"
|
||||
|
||||
def _build_index(self) -> dict[str, str]:
|
||||
index: dict[str, str] = {}
|
||||
for canonical, forms in self._ALIASES.items():
|
||||
index[canonical] = canonical
|
||||
for form in forms:
|
||||
index[form] = canonical
|
||||
return index
|
||||
7
app/modules/rag/intent_router_v2/protocols.py
Normal file
7
app/modules/rag/intent_router_v2/protocols.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Protocol
|
||||
|
||||
|
||||
class TextGenerator(Protocol):
    """Structural interface for anything that turns a named prompt plus user input into text."""

    def generate(self, prompt_name: str, user_input: str, *, log_context: str | None = None) -> str: ...
|
||||
3
app/modules/rag/intent_router_v2/query_normalizer.py
Normal file
3
app/modules/rag/intent_router_v2/query_normalizer.py
Normal file
@@ -0,0 +1,3 @@
|
||||
# Thin re-export so callers can import QueryNormalizer from this module path.
from app.modules.rag.intent_router_v2.normalization import QueryNormalizer

__all__ = ["QueryNormalizer"]
|
||||
223
app/modules/rag/intent_router_v2/query_plan_builder.py
Normal file
223
app/modules/rag/intent_router_v2/query_plan_builder.py
Normal file
@@ -0,0 +1,223 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.anchor_extractor import AnchorExtractor
|
||||
from app.modules.rag.intent_router_v2.anchor_span_validator import AnchorSpanValidator
|
||||
from app.modules.rag.intent_router_v2.conversation_anchor_builder import ConversationAnchorBuilder
|
||||
from app.modules.rag.intent_router_v2.keyword_hint_builder import KeywordHintBuilder
|
||||
from app.modules.rag.intent_router_v2.keyword_hint_sanitizer import KeywordHintSanitizer
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor, QueryPlan
|
||||
from app.modules.rag.intent_router_v2.negation_detector import NegationDetector
|
||||
from app.modules.rag.intent_router_v2.normalization import QueryNormalizer
|
||||
from app.modules.rag.intent_router_v2.sub_intent_detector import SubIntentDetector
|
||||
from app.modules.rag.intent_router_v2.test_signals import has_test_focus, is_negative_test_request, is_test_related_token
|
||||
from app.modules.rag.intent_router_v2.term_mapping import RuEnTermMapper
|
||||
|
||||
|
||||
class QueryPlanBuilder:
    """Builds the QueryPlan for one user turn: normalization, anchor extraction and
    carry-over, negation handling, sub-intent resolution, expansions and keyword hints."""

    # Phrases signalling causal questions and next-step follow-ups (EXPLAIN_LOCAL cues).
    _WHY_MARKERS = ("почему", "зачем", "откуда", "из-за чего")
    _NEXT_STEP_MARKERS = ("что дальше", "дальше что", "и что теперь", "продолжай")

    def __init__(
        self,
        normalizer: QueryNormalizer | None = None,
        extractor: AnchorExtractor | None = None,
        mapper: RuEnTermMapper | None = None,
        keyword_hints: KeywordHintBuilder | None = None,
        keyword_hint_sanitizer: KeywordHintSanitizer | None = None,
        carryover: ConversationAnchorBuilder | None = None,
        span_validator: AnchorSpanValidator | None = None,
        sub_intent_detector: SubIntentDetector | None = None,
        negation_detector: NegationDetector | None = None,
    ) -> None:
        # Every collaborator is injectable (for tests); None selects the production default.
        self._normalizer = normalizer or QueryNormalizer()
        self._extractor = extractor or AnchorExtractor()
        self._mapper = mapper or RuEnTermMapper()
        self._keyword_hints_builder = keyword_hints or KeywordHintBuilder()
        self._keyword_hint_sanitizer = keyword_hint_sanitizer or KeywordHintSanitizer()
        self._carryover = carryover or ConversationAnchorBuilder()
        self._span_validator = span_validator or AnchorSpanValidator()
        self._sub_intent_detector = sub_intent_detector or SubIntentDetector()
        self._negation_detector = negation_detector or NegationDetector()

    def build(
        self,
        user_query: str,
        conversation_state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str = "START",
        intent: str = "PROJECT_MISC",
    ) -> QueryPlan:
        """Produce the QueryPlan for *user_query* given dialogue state and resolved intent."""
        raw = user_query or ""
        normalized = self._normalizer.normalize(raw)
        # Never let normalization collapse a non-empty query to an empty string.
        if not normalized and raw.strip():
            normalized = raw
        negations = self._negation_detector.detect(normalized)
        # Anchor spans are validated against the raw text length before any use.
        user_anchors = self._span_validator.sanitize(self._extractor.extract(raw), len(raw))
        has_file_path = any(anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in user_anchors)
        sub_intent = self._sub_intent_detector.detect(raw, has_file_path=has_file_path, negations=negations)
        merged_anchors = self._merge_anchors(
            raw,
            user_anchors,
            conversation_state,
            continue_mode,
            conversation_mode=conversation_mode,
            intent=intent,
        )
        # Tests are skipped either via an explicit negation or a negative-test phrasing.
        skip_tests = "tests" in negations or is_negative_test_request(raw)
        cleaned_anchors = self._remove_negated_test_terms(skip_tests, merged_anchors)
        sub_intent = self._resolve_sub_intent(sub_intent, raw, cleaned_anchors, intent=intent, negations=negations)
        if intent == "DOCS_QA":
            # Docs questions never keep code-specific sub-intents.
            sub_intent = "EXPLAIN"
        expansions = self._expansions(normalized, cleaned_anchors, skip_tests=skip_tests)
        keyword_hints = self._keyword_hints(
            raw,
            normalized,
            cleaned_anchors,
            skip_tests=skip_tests,
            intent=intent,
            state=conversation_state,
        )
        return QueryPlan(
            raw=raw,
            normalized=normalized,
            sub_intent=sub_intent,
            negations=sorted(negations),
            expansions=expansions,
            keyword_hints=keyword_hints,
            anchors=cleaned_anchors,
        )

    def _merge_anchors(
        self,
        raw: str,
        anchors: list[QueryAnchor],
        state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str,
        intent: str,
    ) -> list[QueryAnchor]:
        """Combine user-typed anchors with anchors inherited from conversation state."""
        has_user_symbol = any(anchor.type == "SYMBOL" and anchor.source == "user_text" for anchor in anchors)
        has_user_file = any(anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in anchors)
        inherited = self._carryover.build(
            raw,
            state,
            continue_mode=continue_mode,
            has_user_symbol=has_user_symbol,
            has_user_file_path=has_user_file,
        )
        # Special case: switching into DOCS_QA with no fresh anchors keeps the
        # previously active symbol as a moderate-confidence state anchor.
        if (
            conversation_mode == "SWITCH"
            and intent == "DOCS_QA"
            and not has_user_file
            and not has_user_symbol
            and state.active_symbol
        ):
            inherited.append(
                QueryAnchor(
                    type="SYMBOL",
                    value=state.active_symbol,
                    source="conversation_state",
                    span=None,
                    confidence=0.62,
                )
            )
        return self._dedupe(anchors + inherited)

    def _expansions(self, normalized: str, anchors: list[QueryAnchor], *, skip_tests: bool) -> list[str]:
        """RU->EN expansion terms, adjusted for symbols and the test focus/negation."""
        values = self._mapper.expand(normalized)
        has_symbol = any(anchor.type == "SYMBOL" for anchor in anchors)
        if has_symbol:
            # With a concrete symbol, generic "def"/"class" expansions only add noise.
            values = [value for value in values if value.lower() not in {"def", "class"}]
        if not skip_tests and has_test_focus(normalized):
            for candidate in ("test", "unit test"):
                if candidate not in values:
                    values.append(candidate)
        for anchor in anchors:
            if anchor.type == "SYMBOL" and anchor.value not in values:
                values.append(anchor.value)
        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        # Hard cap keeps the expansion list bounded for retrieval.
        return values[:16]

    def _keyword_hints(
        self,
        raw: str,
        normalized: str,
        anchors: list[QueryAnchor],
        *,
        skip_tests: bool,
        intent: str,
        state: ConversationState,
    ) -> list[str]:
        """Build, filter and sanitize keyword hints; capped at 5 entries."""
        values = self._keyword_hints_builder.build(normalized)
        for anchor in anchors:
            if anchor.type not in {"FILE_PATH", "SYMBOL"}:
                continue
            candidate = anchor.value
            if candidate not in values:
                values.append(candidate)
        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        sanitized = self._keyword_hint_sanitizer.sanitize(raw, anchors, values)
        if intent == "DOCS_QA" and not sanitized:
            # Docs questions fall back to expansions so retrieval never runs hint-less.
            fallback = list(dict.fromkeys([*self._expansions(normalized, anchors, skip_tests=skip_tests)]))
            sanitized = fallback[:3]
            if state.active_symbol and state.active_symbol not in sanitized:
                sanitized.append(state.active_symbol)
            sanitized = sanitized[:5]
        return sanitized

    def _remove_negated_test_terms(self, skip_tests: bool, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Drop KEY_TERM/SYMBOL anchors that are test-related when tests are negated."""
        if not skip_tests:
            return anchors
        result: list[QueryAnchor] = []
        for anchor in anchors:
            if anchor.type not in {"KEY_TERM", "SYMBOL"}:
                result.append(anchor)
                continue
            if is_test_related_token(anchor.value):
                continue
            result.append(anchor)
        return result

    def _dedupe(self, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Remove duplicate anchors, keeping first occurrence order."""
        result: list[QueryAnchor] = []
        seen: set[tuple[str, str, str | None, str]] = set()
        for anchor in anchors:
            key = (anchor.type, anchor.value, anchor.subtype, anchor.source)
            if key in seen:
                continue
            seen.add(key)
            result.append(anchor)
        return result

    def _resolve_sub_intent(
        self,
        candidate: str,
        raw: str,
        anchors: list[QueryAnchor],
        *,
        intent: str,
        negations: set[str],
    ) -> str:
        """Upgrade a generic EXPLAIN to EXPLAIN_LOCAL for anchored CODE_QA follow-ups."""
        if candidate != "EXPLAIN":
            return candidate
        if intent != "CODE_QA":
            return candidate
        text = " ".join((raw or "").lower().split())
        # Only confidently-extracted anchors count toward localization.
        has_symbol = any(anchor.type == "SYMBOL" and anchor.confidence >= 0.6 for anchor in anchors)
        has_file = any(anchor.type == "FILE_PATH" and self._looks_like_file(anchor.value) and anchor.confidence >= 0.6 for anchor in anchors)
        has_user_anchor = any(anchor.source == "user_text" for anchor in anchors)
        is_why = any(marker in text for marker in self._WHY_MARKERS)
        is_next_steps = any(marker in text for marker in self._NEXT_STEP_MARKERS)
        # Short question-mark queries are treated as generic follow-ups.
        is_short_generic = len(text.split()) <= 4 and text.endswith("?")
        if (is_why and has_file and has_symbol) or ((is_next_steps or is_short_generic) and has_file):
            return "EXPLAIN_LOCAL"
        if "tests" in negations and not has_user_anchor and (has_file or has_symbol):
            return "EXPLAIN_LOCAL"
        return candidate

    def _looks_like_file(self, value: str) -> bool:
        # A path whose last segment has an extension-style dot.
        tail = (value or "").rsplit("/", 1)[-1]
        return "." in tail
|
||||
111
app/modules/rag/intent_router_v2/retrieval_filter_builder.py
Normal file
111
app/modules/rag/intent_router_v2/retrieval_filter_builder.py
Normal file
@@ -0,0 +1,111 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import (
|
||||
CodeRetrievalFilters,
|
||||
ConversationState,
|
||||
DocsRetrievalFilters,
|
||||
HybridRetrievalFilters,
|
||||
QueryAnchor,
|
||||
RepoContext,
|
||||
)
|
||||
from app.modules.rag.intent_router_v2.test_signals import has_test_focus, is_negative_test_request, is_test_related_token
|
||||
|
||||
|
||||
class RetrievalFilterBuilder:
    """Derives per-domain retrieval filters: test policy, path scope and doc kinds."""

    def build(
        self,
        domains: list[str],
        anchors: list[QueryAnchor],
        repo_context: RepoContext,
        *,
        raw_query: str,
        conversation_state: ConversationState | None,
        conversation_mode: str,
        sub_intent: str = "EXPLAIN",
    ) -> CodeRetrievalFilters | DocsRetrievalFilters | HybridRetrievalFilters:
        """Return the filter model matching *domains*: DOCS-only, CODE-only, else hybrid."""
        path_scope = self._path_scope(
            anchors,
            conversation_state=conversation_state,
            conversation_mode=conversation_mode,
            raw_query=raw_query,
            sub_intent=sub_intent,
        )
        if domains == ["DOCS"]:
            return DocsRetrievalFilters(
                path_scope=path_scope,
                doc_kinds=self._doc_kinds(anchors, raw_query),
                doc_language=[],
            )
        if domains == ["CODE"]:
            return CodeRetrievalFilters(
                test_policy=self._test_policy(raw_query, anchors),
                path_scope=path_scope,
                language=list(repo_context.languages),
            )
        # Any other domain combination gets the combined code+docs filter set.
        return HybridRetrievalFilters(
            test_policy=self._test_policy(raw_query, anchors),
            path_scope=path_scope,
            language=list(repo_context.languages),
            doc_kinds=self._doc_kinds(anchors, raw_query),
            doc_language=[],
        )

    def _test_policy(self, raw_query: str, anchors: list[QueryAnchor]) -> str:
        """INCLUDE tests only when the query or a key-term anchor asks for them."""
        # An explicit negation ("без тестов") always wins over other signals.
        if is_negative_test_request(raw_query):
            return "EXCLUDE"
        if has_test_focus(raw_query):
            return "INCLUDE"
        has_test_keyterm = any(anchor.type == "KEY_TERM" and is_test_related_token(anchor.value) for anchor in anchors)
        return "INCLUDE" if has_test_keyterm else "EXCLUDE"

    def _path_scope(
        self,
        anchors: list[QueryAnchor],
        *,
        conversation_state: ConversationState | None,
        conversation_mode: str,
        raw_query: str,
        sub_intent: str,
    ) -> list[str]:
        """Collect path restrictions from FILE_PATH anchors; inherit on CONTINUE turns.

        Returns at most 6 entries in all branches.
        """
        values: list[str] = []
        has_user_file_anchor = False
        file_values: list[str] = []
        for anchor in anchors:
            if anchor.type != "FILE_PATH":
                continue
            if anchor.source == "user_text":
                has_user_file_anchor = True
            if anchor.value not in values:
                values.append(anchor.value)
            if self._looks_like_file_path(anchor.value) and anchor.value not in file_values:
                file_values.append(anchor.value)
            # Also scope to the file's parent directory for nearby context.
            parent = anchor.value.rsplit("/", 1)[0] if "/" in anchor.value and self._looks_like_file_path(anchor.value) else ""
            if parent and parent not in values:
                values.append(parent)
        # File-focused sub-intents narrow the scope to the exact files, unless the
        # user explicitly asked about a directory.
        if sub_intent in {"OPEN_FILE", "EXPLAIN_LOCAL"} and file_values and not self._is_explicit_directory_scope(raw_query):
            return file_values[:6]
        if has_user_file_anchor or conversation_mode != "CONTINUE":
            return values[:6]
        if values:
            return values[:6]
        # CONTINUE turn with no fresh paths: inherit the previous turn's scope.
        inherited = list((conversation_state.active_path_scope if conversation_state else []) or [])
        return inherited[:6]

    def _doc_kinds(self, anchors: list[QueryAnchor], raw_query: str) -> list[str]:
        """Currently only recognizes README-focused queries/anchors."""
        text = (raw_query or "").lower()
        kinds: list[str] = []
        has_readme = "readme" in text or any(
            anchor.type in {"DOC_REF", "FILE_PATH"} and anchor.value.lower().endswith("readme.md")
            for anchor in anchors
        )
        if has_readme:
            kinds.append("README")
        return kinds

    def _looks_like_file_path(self, value: str) -> bool:
        # A path whose last segment has an extension-style dot.
        filename = value.rsplit("/", 1)[-1]
        return "." in filename

    def _is_explicit_directory_scope(self, raw_query: str) -> bool:
        # Russian phrasings for "in the folder/directory".
        text = (raw_query or "").lower()
        return any(marker in text for marker in ("в папке", "в директории", "в каталоге"))
|
||||
118
app/modules/rag/intent_router_v2/retrieval_spec_factory.py
Normal file
118
app/modules/rag/intent_router_v2/retrieval_spec_factory.py
Normal file
@@ -0,0 +1,118 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.contracts.enums import RagLayer
|
||||
from app.modules.rag.intent_router_v2.layer_query_builder import LayerQueryBuilder
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor, RepoContext, RetrievalSpec
|
||||
from app.modules.rag.intent_router_v2.retrieval_filter_builder import RetrievalFilterBuilder
|
||||
|
||||
|
||||
class RetrievalSpecFactory:
    """Builds a RetrievalSpec (domains, layer queries, filters, rerank profile) per intent."""

    # Default layer set and per-layer top_k for each intent.
    _LAYERS = {
        "CODE_QA": [
            (RagLayer.CODE_ENTRYPOINTS, 6),
            (RagLayer.CODE_SYMBOL_CATALOG, 8),
            (RagLayer.CODE_DEPENDENCY_GRAPH, 6),
            (RagLayer.CODE_SOURCE_CHUNKS, 8),
        ],
        "DOCS_QA": [
            (RagLayer.DOCS_MODULE_CATALOG, 5),
            (RagLayer.DOCS_FACT_INDEX, 8),
            (RagLayer.DOCS_SECTION_INDEX, 8),
            (RagLayer.DOCS_POLICY_INDEX, 4),
        ],
        "GENERATE_DOCS_FROM_CODE": [
            (RagLayer.CODE_SYMBOL_CATALOG, 12),
            (RagLayer.CODE_DEPENDENCY_GRAPH, 8),
            (RagLayer.CODE_SOURCE_CHUNKS, 12),
            (RagLayer.CODE_ENTRYPOINTS, 6),
        ],
        "PROJECT_MISC": [
            (RagLayer.DOCS_MODULE_CATALOG, 4),
            (RagLayer.DOCS_SECTION_INDEX, 6),
            (RagLayer.CODE_SYMBOL_CATALOG, 4),
            (RagLayer.CODE_SOURCE_CHUNKS, 4),
        ],
    }
    # Which retrieval domains each intent queries.
    _DOMAINS = {
        "CODE_QA": ["CODE"],
        "DOCS_QA": ["DOCS"],
        "GENERATE_DOCS_FROM_CODE": ["CODE"],
        "PROJECT_MISC": ["CODE", "DOCS"],
    }
    # Rerank profile name per intent.
    _RERANK = {
        "CODE_QA": "code",
        "DOCS_QA": "docs",
        "GENERATE_DOCS_FROM_CODE": "generate",
        "PROJECT_MISC": "project",
    }
    # Layer overrides for the file-focused CODE_QA sub-intents.
    _OPEN_FILE_LAYERS = [
        (RagLayer.CODE_SOURCE_CHUNKS, 12),
    ]
    _OPEN_FILE_WITH_SYMBOL_LAYERS = [
        (RagLayer.CODE_SOURCE_CHUNKS, 12),
        (RagLayer.CODE_SYMBOL_CATALOG, 6),
    ]
    _EXPLAIN_LOCAL_LAYERS = [
        (RagLayer.CODE_SOURCE_CHUNKS, 12),
        (RagLayer.CODE_SYMBOL_CATALOG, 8),
        (RagLayer.CODE_DEPENDENCY_GRAPH, 4),
    ]

    def __init__(
        self,
        layer_builder: LayerQueryBuilder | None = None,
        filter_builder: RetrievalFilterBuilder | None = None,
    ) -> None:
        # Collaborators are injectable; None selects the production default.
        self._layer_builder = layer_builder or LayerQueryBuilder()
        self._filter_builder = filter_builder or RetrievalFilterBuilder()

    def build(
        self,
        intent: str,
        anchors: list[QueryAnchor],
        repo_context: RepoContext,
        *,
        raw_query: str = "",
        conversation_state: ConversationState | None = None,
        conversation_mode: str = "START",
        sub_intent: str = "EXPLAIN",
    ) -> RetrievalSpec:
        """Assemble the RetrievalSpec for *intent* using the repo's available domains."""
        domains = self._domains(intent, repo_context)
        # Sub-intent may swap CODE_QA's layer set before queries are built.
        layers_map = self._with_sub_intent_layers(intent, sub_intent, anchors)
        layer_queries = self._layer_builder.build(intent, repo_context, domains=domains, layers_map=layers_map)
        filters = self._filter_builder.build(
            domains,
            anchors,
            repo_context,
            raw_query=raw_query,
            conversation_state=conversation_state,
            conversation_mode=conversation_mode,
            sub_intent=sub_intent,
        )
        return RetrievalSpec(
            domains=domains,
            layer_queries=layer_queries,
            filters=filters,
            rerank_profile=self._RERANK[intent],
        )

    def _domains(self, intent: str, repo_context: RepoContext) -> list[str]:
        """Intersect the intent's domains with what the repo actually has indexed."""
        available = set(repo_context.available_domains or ["CODE", "DOCS"])
        result = [domain for domain in self._DOMAINS[intent] if domain in available]
        # Fall back to the intent's full domain list rather than returning nothing.
        return result or list(self._DOMAINS[intent])

    def _with_sub_intent_layers(
        self,
        intent: str,
        sub_intent: str,
        anchors: list[QueryAnchor],
    ) -> dict[str, list[tuple[str, int]]]:
        """Return the layer map, with CODE_QA layers overridden for file-focused sub-intents."""
        if intent != "CODE_QA":
            return self._LAYERS
        layers_map = dict(self._LAYERS)
        if sub_intent == "OPEN_FILE":
            # Include the symbol catalog only when the user named a symbol too.
            has_symbol = any(anchor.type == "SYMBOL" and anchor.source == "user_text" for anchor in anchors)
            layers_map["CODE_QA"] = list(self._OPEN_FILE_WITH_SYMBOL_LAYERS if has_symbol else self._OPEN_FILE_LAYERS)
        elif sub_intent == "EXPLAIN_LOCAL":
            layers_map["CODE_QA"] = list(self._EXPLAIN_LOCAL_LAYERS)
        return layers_map
|
||||
72
app/modules/rag/intent_router_v2/router.py
Normal file
72
app/modules/rag/intent_router_v2/router.py
Normal file
@@ -0,0 +1,72 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.classifier import IntentClassifierV2
|
||||
from app.modules.rag.intent_router_v2.conversation_policy import ConversationPolicy
|
||||
from app.modules.rag.intent_router_v2.evidence_policy_factory import EvidencePolicyFactory
|
||||
from app.modules.rag.intent_router_v2.graph_id_resolver import GraphIdResolver
|
||||
from app.modules.rag.intent_router_v2.logger import IntentRouterLogger
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, IntentRouterResult, RepoContext
|
||||
from app.modules.rag.intent_router_v2.query_plan_builder import QueryPlanBuilder
|
||||
from app.modules.rag.intent_router_v2.retrieval_spec_factory import RetrievalSpecFactory
|
||||
|
||||
|
||||
class IntentRouterV2:
    """Facade for one routing turn: classify intent, build the query plan,
    retrieval spec and evidence policy, and log request/result."""

    def __init__(
        self,
        classifier: IntentClassifierV2 | None = None,
        conversation_policy: ConversationPolicy | None = None,
        query_plan_builder: QueryPlanBuilder | None = None,
        retrieval_factory: RetrievalSpecFactory | None = None,
        evidence_factory: EvidencePolicyFactory | None = None,
        graph_resolver: GraphIdResolver | None = None,
        logger: IntentRouterLogger | None = None,
    ) -> None:
        # All collaborators are injectable; None selects the production default.
        self._classifier = classifier or IntentClassifierV2()
        self._conversation_policy = conversation_policy or ConversationPolicy()
        self._query_plan_builder = query_plan_builder or QueryPlanBuilder()
        self._retrieval_factory = retrieval_factory or RetrievalSpecFactory()
        self._evidence_factory = evidence_factory or EvidencePolicyFactory()
        self._graph_resolver = graph_resolver or GraphIdResolver()
        self._logger = logger or IntentRouterLogger()

    def route(
        self,
        user_query: str,
        conversation_state: ConversationState | None = None,
        repo_context: RepoContext | None = None,
    ) -> IntentRouterResult:
        """Route *user_query* to an intent and return the full routing decision.

        Missing state/context default to empty model instances.
        """
        state = conversation_state or ConversationState()
        context = repo_context or RepoContext()
        self._logger.log_request(user_query, state, context)
        decision = self._classifier.classify(user_query, state)
        # The conversation policy may override the classifier's intent and decides
        # whether this turn is START / CONTINUE / SWITCH.
        intent, conversation_mode = self._conversation_policy.resolve(decision, user_query, state)
        query_plan = self._query_plan_builder.build(
            user_query,
            state,
            continue_mode=conversation_mode == "CONTINUE",
            conversation_mode=conversation_mode,
            intent=intent,
        )
        result = IntentRouterResult(
            intent=intent,
            graph_id=self._graph_resolver.resolve(intent),
            conversation_mode=conversation_mode,
            query_plan=query_plan,
            retrieval_spec=self._retrieval_factory.build(
                intent,
                query_plan.anchors,
                context,
                raw_query=query_plan.raw,
                conversation_state=state,
                conversation_mode=conversation_mode,
                sub_intent=query_plan.sub_intent,
            ),
            evidence_policy=self._evidence_factory.build(
                intent,
                sub_intent=query_plan.sub_intent,
                negations=query_plan.negations,
                has_user_anchor=any(anchor.source == "user_text" for anchor in query_plan.anchors),
            ),
        )
        self._logger.log_result(result)
        return result
|
||||
23
app/modules/rag/intent_router_v2/sub_intent_detector.py
Normal file
23
app/modules/rag/intent_router_v2/sub_intent_detector.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class SubIntentDetector:
    """Heuristically classifies a query as OPEN_FILE, FIND_TESTS or EXPLAIN."""

    # Verbs that signal the user wants a file opened/inspected.
    _OPEN_VERBS = ("открой", "посмотри", "проверь", "уточни")
    # Markers of explanation-style questions, which override the open verbs.
    _EXPLAIN_MARKERS = ("объясни", "как работает", "почему", "что делает", "зачем", "логика", "флоу", "flow")
    _TEST_MARKERS = ("тест", "pytest", "unit test", "юнит")

    def detect(self, raw: str, *, has_file_path: bool, negations: set[str]) -> str:
        """Return the sub-intent label for *raw*; defaults to "EXPLAIN"."""
        normalized = " ".join((raw or "").lower().split())
        if not normalized:
            return "EXPLAIN"
        wants_open = (
            has_file_path
            and self._mentions_open_verb(normalized)
            and not self._mentions_explain(normalized)
        )
        if wants_open:
            return "OPEN_FILE"
        tests_allowed = "tests" not in negations
        if tests_allowed and any(marker in normalized for marker in self._TEST_MARKERS):
            return "FIND_TESTS"
        return "EXPLAIN"

    def _mentions_open_verb(self, text: str) -> bool:
        # Match a verb at the start or as a standalone word anywhere in the query.
        padded = f" {text} "
        return any(text.startswith(verb) or f" {verb} " in padded for verb in self._OPEN_VERBS)

    def _mentions_explain(self, text: str) -> bool:
        return any(marker in text for marker in self._EXPLAIN_MARKERS)
|
||||
51
app/modules/rag/intent_router_v2/symbol_rules.py
Normal file
51
app/modules/rag/intent_router_v2/symbol_rules.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from __future__ import annotations
|
||||
|
||||
# Lowercased Python keywords (note: lowercase spellings of True/False/None are
# included because tokens are lowercased before being checked against this set).
PY_KEYWORDS: set[str] = set(
    "and as assert async await break class continue def del elif else except "
    "false finally for from global if import in is lambda none nonlocal not "
    "or pass raise return true try while with yield".split()
)

# Generic directory names that carry no signal when mined as path anchors.
COMMON_PATH_SEGMENTS: set[str] = set(
    "app src docs tests module modules core pkg lib".split()
)
|
||||
67
app/modules/rag/intent_router_v2/term_mapping.py
Normal file
67
app/modules/rag/intent_router_v2/term_mapping.py
Normal file
@@ -0,0 +1,67 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from app.modules.rag.intent_router_v2.normalization_terms import KeyTermCanonicalizer
|
||||
|
||||
# Tokenizer for term mapping: runs of Latin/Cyrillic letters, hyphens allowed.
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё-]+")


class RuEnTermMapper:
    """Expand Russian/English code-vocabulary terms into English search keywords."""

    # Canonical Russian term -> English expansions emitted for it.
    _CANONICAL_MAP = {
        "класс": ["class"],
        "метод": ["method"],
        "функция": ["function", "def"],
        "модуль": ["module"],
        "пакет": ["package"],
        "файл": ["file"],
        "тест": ["test", "unit test"],
        "документация": ["documentation", "docs"],
        "readme": ["readme"],
    }
    # English surface forms -> expansions (mirror of the canonical map).
    _ENGLISH_SOURCES = {
        "class": ["class"],
        "method": ["method"],
        "function": ["function", "def"],
        "module": ["module"],
        "package": ["package"],
        "file": ["file"],
        "test": ["test", "unit test"],
        "tests": ["test", "unit test"],
        "documentation": ["documentation", "docs"],
        "docs": ["documentation", "docs"],
        "readme": ["readme"],
        "def": ["def"],
    }

    def __init__(self, canonicalizer: KeyTermCanonicalizer | None = None) -> None:
        # Use the injected canonicalizer when provided, otherwise a default one.
        self._canonicalizer = canonicalizer or KeyTermCanonicalizer()

    def expand(self, text: str) -> list[str]:
        """Return deduplicated English expansions for every known term in *text*.

        Order of first occurrence is preserved.
        """
        lowered = (text or "").lower()
        found: list[str] = []
        for token in _WORD_RE.findall(lowered):
            # Map the token to its canonical Russian form when one exists.
            canonical = self._canonicalizer.canonicalize(token) or token
            self._extend(found, self._CANONICAL_MAP.get(canonical, []))
            self._extend(found, self._ENGLISH_SOURCES.get(token, []))
        # "unit test(s)" is a two-word phrase the tokenizer splits apart.
        if "unit test" in lowered or "unit tests" in lowered:
            self._extend(found, self._ENGLISH_SOURCES["test"])
        return found

    def key_terms(self) -> tuple[str, ...]:
        """Canonical Russian terms, in declaration order."""
        return tuple(self._CANONICAL_MAP.keys())

    def all_literal_terms(self) -> tuple[str, ...]:
        """Every literal term the mapper knows, lowercased and sorted."""
        collected = set(self._canonicalizer.aliases())
        collected.update(self._CANONICAL_MAP.keys())
        collected.update(self._ENGLISH_SOURCES.keys())
        for expansion in self._CANONICAL_MAP.values():
            collected.update(term.lower() for term in expansion)
        for expansion in self._ENGLISH_SOURCES.values():
            collected.update(term.lower() for term in expansion)
        return tuple(sorted(collected))

    def _extend(self, result: list[str], values: list[str]) -> None:
        # Append while keeping first-seen order and uniqueness.
        for value in values:
            if value not in result:
                result.append(value)
||||
40
app/modules/rag/intent_router_v2/test_signals.py
Normal file
40
app/modules/rag/intent_router_v2/test_signals.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
# Matches "не/без/кроме ... тест" within one sentence (at most 28 chars apart).
_NEGATIVE_TEST_RE = re.compile(r"\b(?:не|без|кроме)\b[^.?!]{0,28}\bтест", re.IGNORECASE)
# Literal phrasings that also mean "not about tests".
_NEGATIVE_TEST_MARKERS = ("не про тест", "без тест", "кроме тест", "про прод код", "только прод", "production code")
# Substrings that signal the query is about tests.
_POSITIVE_TEST_MARKERS = (
    "тест",
    "tests",
    "pytest",
    "unit test",
    "unit tests",
    "тестиру",
)
# Exact tokens considered test-related.
_TEST_TERMS = {"тест", "тесты", "test", "tests", "pytest", "unit", "unit test", "юнит-тест", "юниттест"}


def is_negative_test_request(text: str) -> bool:
    """Return True when the query explicitly excludes tests ("не про тесты", ...)."""
    lowered = (text or "").lower()
    if _NEGATIVE_TEST_RE.search(lowered) is not None:
        return True
    return any(marker in lowered for marker in _NEGATIVE_TEST_MARKERS)


def has_test_focus(text: str) -> bool:
    """Return True when the query is about tests and does not exclude them."""
    lowered = (text or "").lower()
    # A negative request vetoes any positive marker hit.
    return not is_negative_test_request(lowered) and any(
        marker in lowered for marker in _POSITIVE_TEST_MARKERS
    )


def is_test_related_token(value: str) -> bool:
    """Return True for single tokens that look test-related (ru or en)."""
    token = (value or "").lower().strip()
    if not token:
        return False
    # Known terms, plus any "test*"/"тест*" prefixed token.
    return token in _TEST_TERMS or token.startswith(("test", "тест"))
|
||||
Reference in New Issue
Block a user