Фиксация изменений

This commit is contained in:
2026-03-05 11:03:17 +03:00
parent 1ef0b4d68c
commit 417b8b6f72
261 changed files with 8215 additions and 332 deletions

View File

@@ -90,6 +90,41 @@ sequenceDiagram
Rag-->>Agent: items
```
### Retrieval + project/qa reasoning
Назначение: `RAG` вызывается не в начале runtime, а внутри отдельного graph-шага `context_retrieval` для `project/qa`.
```mermaid
sequenceDiagram
participant Agent as GraphAgentRuntime
participant Orch as OrchestratorService
participant G1 as conversation_understanding
participant G2 as question_classification
participant G3 as context_retrieval
participant Rag as RagService
participant G4 as context_analysis
participant G5 as answer_composition
Agent->>Orch: run(task)
Orch->>G1: execute
G1-->>Orch: resolved_request
Orch->>G2: execute
G2-->>Orch: question_profile
Orch->>G3: execute
G3->>Rag: retrieve(query)
Rag-->>G3: rag_items
G3-->>Orch: source_bundle
Orch->>G4: execute
G4-->>Orch: analysis_brief
Orch->>G5: execute
G5-->>Orch: final_answer
Orch-->>Agent: final_answer
```
Для `project/qa` это означает:
- ранний глобальный retrieval больше не нужен;
- `RAG` возвращает записи только для конкретного шага `context_retrieval`;
- оркестратор управляет цепочкой graph-шагов;
- пользовательский ответ собирается после анализа, а не напрямую из сырого retrieval.
## 5. Слои, фиксируемые в RAG
### 5.1. Слои DOCS

View File

@@ -0,0 +1,36 @@
from __future__ import annotations
from importlib import import_module
__all__ = [
    "CodeExcerpt",
    "CodeExplainRetrieverV2",
    "CodeGraphRepository",
    "EvidenceItem",
    "ExplainIntent",
    "ExplainIntentBuilder",
    "ExplainPack",
    "LayeredRetrievalGateway",
    "PromptBudgeter",
    "TracePath",
]

# Public name -> defining module. Hoisted to module level so the mapping is
# built once at import time instead of on every __getattr__ call.
_MODULE_MAP = {
    "CodeExcerpt": "app.modules.rag.explain.models",
    "EvidenceItem": "app.modules.rag.explain.models",
    "ExplainIntent": "app.modules.rag.explain.models",
    "ExplainPack": "app.modules.rag.explain.models",
    "TracePath": "app.modules.rag.explain.models",
    "ExplainIntentBuilder": "app.modules.rag.explain.intent_builder",
    "PromptBudgeter": "app.modules.rag.explain.budgeter",
    "LayeredRetrievalGateway": "app.modules.rag.explain.layered_gateway",
    "CodeGraphRepository": "app.modules.rag.explain.graph_repository",
    "CodeExplainRetrieverV2": "app.modules.rag.explain.retriever_v2",
}


def __getattr__(name: str):
    """Lazily resolve public names on first access (PEP 562).

    Keeps importing this package cheap: the submodule that defines ``name``
    is only imported when the attribute is actually requested.

    Raises:
        AttributeError: if ``name`` is not one of the public names above.
    """
    module_name = _MODULE_MAP.get(name)
    if module_name is None:
        raise AttributeError(name)
    module = import_module(module_name)
    return getattr(module, name)

View File

@@ -0,0 +1,62 @@
from __future__ import annotations
import json
from app.modules.rag.explain.models import ExplainPack
class PromptBudgeter:
    """Trim an ``ExplainPack`` down to a bounded JSON prompt payload.

    Hard caps are enforced on trace paths, symbol ids, excerpt count and the
    total number of excerpt characters so the rendered prompt stays within
    the model's budget.
    """

    def __init__(
        self,
        *,
        max_paths: int = 3,
        max_symbols: int = 25,
        max_excerpts: int = 40,
        max_chars: int = 30000,
    ) -> None:
        self._max_paths = max_paths
        self._max_symbols = max_symbols
        self._max_excerpts = max_excerpts
        self._max_chars = max_chars

    def build_prompt_input(self, question: str, pack: ExplainPack) -> str:
        """Serialize *question* plus the budget-trimmed pack as indented JSON."""
        allowed = self._allowed_symbol_ids(pack)
        payload = {
            "question": question,
            "intent": pack.intent.model_dump(mode="json"),
            "selected_entrypoints": [entry.model_dump(mode="json") for entry in pack.selected_entrypoints[:5]],
            "seed_symbols": [seed.model_dump(mode="json") for seed in pack.seed_symbols[: self._max_symbols]],
            "trace_paths": [trace.model_dump(mode="json") for trace in pack.trace_paths[: self._max_paths]],
            "evidence_index": {eid: item.model_dump(mode="json") for eid, item in pack.evidence_index.items()},
            "code_excerpts": self._budgeted_excerpts(pack, allowed),
            "missing": pack.missing,
            "conflicts": pack.conflicts,
        }
        return json.dumps(payload, ensure_ascii=False, indent=2)

    def _allowed_symbol_ids(self, pack: ExplainPack) -> list[str]:
        # First-seen symbol ids across the first max_paths traces, capped at
        # max_symbols; used to restrict which excerpts survive budgeting.
        collected: list[str] = []
        for trace in pack.trace_paths[: self._max_paths]:
            for sid in trace.symbol_ids:
                if sid and sid not in collected and len(collected) < self._max_symbols:
                    collected.append(sid)
        return collected

    def _budgeted_excerpts(self, pack: ExplainPack, allowed: list[str]) -> list[dict]:
        # Keep excerpts tied to allowed symbols, stop at the count/char budget,
        # truncating the last excerpt body when it overruns the char budget.
        rendered: list[dict] = []
        used_chars = 0
        for excerpt in pack.code_excerpts:
            if allowed and excerpt.symbol_id and excerpt.symbol_id not in allowed:
                continue
            body = excerpt.content.strip()
            budget_left = self._max_chars - used_chars
            if budget_left <= 0 or len(rendered) >= self._max_excerpts:
                break
            if len(body) > budget_left:
                body = body[:budget_left].rstrip() + "...[truncated]"
            rendered.append(
                {
                    "evidence_id": excerpt.evidence_id,
                    "title": excerpt.title,
                    "path": excerpt.path,
                    "start_line": excerpt.start_line,
                    "end_line": excerpt.end_line,
                    "focus": excerpt.focus,
                    "content": body,
                }
            )
            used_chars += len(body)
        return rendered

View File

@@ -0,0 +1,59 @@
from __future__ import annotations
from app.modules.rag.explain.models import CodeExcerpt, LayeredRetrievalItem
class ExcerptPlanner:
    """Derive one or two code excerpts (overview plus optional focus) per chunk."""

    # A line containing any of these substrings marks an interesting region.
    _FOCUS_TOKENS = ("raise", "except", "db", "select", "insert", "update", "delete", "http", "publish", "emit")

    def plan(self, chunk: LayeredRetrievalItem, *, evidence_id: str, symbol_id: str | None) -> list[CodeExcerpt]:
        """Return an overview excerpt for *chunk*, plus a focus excerpt when found.

        Chunks without a location yield no excerpts at all.
        """
        loc = chunk.location
        if loc is None:
            return []
        planned = [
            CodeExcerpt(
                evidence_id=evidence_id,
                symbol_id=symbol_id,
                title=chunk.title,
                path=loc.path,
                start_line=loc.start_line,
                end_line=loc.end_line,
                content=chunk.content.strip(),
                focus="overview",
            )
        ]
        focused = self._focus_excerpt(chunk, evidence_id=evidence_id, symbol_id=symbol_id)
        if focused is not None:
            planned.append(focused)
        return planned

    def _focus_excerpt(
        self,
        chunk: LayeredRetrievalItem,
        *,
        evidence_id: str,
        symbol_id: str | None,
    ) -> CodeExcerpt | None:
        """Return a small window around the first focus-token line, or None."""
        loc = chunk.location
        if loc is None:
            return None
        body_lines = chunk.content.splitlines()
        for position, text in enumerate(body_lines):
            haystack = text.lower()
            if not any(token in haystack for token in self._FOCUS_TOKENS):
                continue
            window_start = max(0, position - 2)
            window_end = min(len(body_lines), position + 3)
            # A window covering the whole chunk adds nothing over the overview.
            if window_end - window_start >= len(body_lines):
                return None
            base_line = loc.start_line or 1
            return CodeExcerpt(
                evidence_id=evidence_id,
                symbol_id=symbol_id,
                title=f"{chunk.title}:focus",
                path=loc.path,
                start_line=base_line + window_start,
                end_line=base_line + window_end - 1,
                content="\n".join(body_lines[window_start:window_end]).strip(),
                focus="focus",
            )
        return None

View File

@@ -0,0 +1,216 @@
from __future__ import annotations
import json
from sqlalchemy import text
from app.modules.rag.explain.models import CodeLocation, LayeredRetrievalItem
from app.modules.shared.db import get_engine
class CodeGraphRepository:
    """Read-only queries over the per-session code graph stored in ``rag_chunks``.

    Three layers are consulted:
      * ``C2_DEPENDENCY_GRAPH`` — symbol-to-symbol edges kept in ``metadata_json``,
      * ``C1_SYMBOL_CATALOG``  — one row per catalogued symbol,
      * ``C0_SOURCE_CHUNKS``   — raw source chunks with line spans.
    The SQL relies on PostgreSQL features (``jsonb`` casts and ``ANY`` array
    binds), so this repository is not portable to other backends as written.
    """

    def get_out_edges(
        self,
        rag_session_id: str,
        src_symbol_ids: list[str],
        edge_types: list[str],
        limit_per_src: int,
    ) -> list[LayeredRetrievalItem]:
        """Return edges leaving ``src_symbol_ids`` whose type is in ``edge_types``.

        At most ``limit_per_src`` edges are kept per source symbol; the cap is
        applied in Python after the ordered fetch, so excess rows are skipped.
        """
        if not src_symbol_ids:
            return []
        sql = """
            SELECT path, content, layer, title, metadata_json, span_start, span_end
            FROM rag_chunks
            WHERE rag_session_id = :sid
              AND layer = 'C2_DEPENDENCY_GRAPH'
              AND CAST(metadata_json AS jsonb)->>'src_symbol_id' = ANY(:src_ids)
              AND CAST(metadata_json AS jsonb)->>'edge_type' = ANY(:edge_types)
            ORDER BY path, span_start
        """
        with get_engine().connect() as conn:
            rows = conn.execute(
                text(sql),
                {"sid": rag_session_id, "src_ids": src_symbol_ids, "edge_types": edge_types},
            ).mappings().fetchall()
        # Per-source counter enforcing limit_per_src.
        grouped: dict[str, int] = {}
        items: list[LayeredRetrievalItem] = []
        for row in rows:
            metadata = self._loads(row.get("metadata_json"))
            src_symbol_id = str(metadata.get("src_symbol_id") or "")
            grouped[src_symbol_id] = grouped.get(src_symbol_id, 0) + 1
            if grouped[src_symbol_id] > limit_per_src:
                continue
            items.append(self._to_item(row, metadata))
        return items

    def get_in_edges(
        self,
        rag_session_id: str,
        dst_symbol_ids: list[str],
        edge_types: list[str],
        limit_per_dst: int,
    ) -> list[LayeredRetrievalItem]:
        """Return edges arriving at ``dst_symbol_ids``; mirror of ``get_out_edges``.

        At most ``limit_per_dst`` edges are kept per destination symbol.
        """
        if not dst_symbol_ids:
            return []
        sql = """
            SELECT path, content, layer, title, metadata_json, span_start, span_end
            FROM rag_chunks
            WHERE rag_session_id = :sid
              AND layer = 'C2_DEPENDENCY_GRAPH'
              AND CAST(metadata_json AS jsonb)->>'dst_symbol_id' = ANY(:dst_ids)
              AND CAST(metadata_json AS jsonb)->>'edge_type' = ANY(:edge_types)
            ORDER BY path, span_start
        """
        with get_engine().connect() as conn:
            rows = conn.execute(
                text(sql),
                {"sid": rag_session_id, "dst_ids": dst_symbol_ids, "edge_types": edge_types},
            ).mappings().fetchall()
        # Per-destination counter enforcing limit_per_dst.
        grouped: dict[str, int] = {}
        items: list[LayeredRetrievalItem] = []
        for row in rows:
            metadata = self._loads(row.get("metadata_json"))
            dst_symbol_id = str(metadata.get("dst_symbol_id") or "")
            grouped[dst_symbol_id] = grouped.get(dst_symbol_id, 0) + 1
            if grouped[dst_symbol_id] > limit_per_dst:
                continue
            items.append(self._to_item(row, metadata))
        return items

    def resolve_symbol_by_ref(
        self,
        rag_session_id: str,
        dst_ref: str,
        package_hint: str | None = None,
    ) -> LayeredRetrievalItem | None:
        """Resolve a textual symbol reference to the best-matching catalog row.

        Matches by exact ``qname``, exact ``title``, or ``qname`` suffix, then
        ranks the (up to 12) candidates: exact qname +3, exact title +2,
        package prefix match on ``package_hint`` +3, hint in path +1.
        Returns ``None`` for an empty ref or when nothing matches.
        """
        ref = (dst_ref or "").strip()
        if not ref:
            return None
        with get_engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT path, content, layer, title, metadata_json, span_start, span_end, qname
                    FROM rag_chunks
                    WHERE rag_session_id = :sid
                      AND layer = 'C1_SYMBOL_CATALOG'
                      AND (qname = :ref OR title = :ref OR qname LIKE :tail)
                    ORDER BY path
                    LIMIT 12
                    """
                ),
                {"sid": rag_session_id, "ref": ref, "tail": f"%{ref}"},
            ).mappings().fetchall()
        best: LayeredRetrievalItem | None = None
        best_score = -1
        for row in rows:
            metadata = self._loads(row.get("metadata_json"))
            package = str(metadata.get("package_or_module") or "")
            score = 0
            if str(row.get("qname") or "") == ref:
                score += 3
            if str(row.get("title") or "") == ref:
                score += 2
            if package_hint and package.startswith(package_hint):
                score += 3
            if package_hint and package_hint in str(row.get("path") or ""):
                score += 1
            # Strict '>' keeps the first row on ties (rows are path-ordered).
            if score > best_score:
                best = self._to_item(row, metadata)
                best_score = score
        return best

    def get_symbols_by_ids(self, rag_session_id: str, symbol_ids: list[str]) -> list[LayeredRetrievalItem]:
        """Fetch catalog rows for the given symbol ids (empty input → empty list)."""
        if not symbol_ids:
            return []
        with get_engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT path, content, layer, title, metadata_json, span_start, span_end
                    FROM rag_chunks
                    WHERE rag_session_id = :sid
                      AND layer = 'C1_SYMBOL_CATALOG'
                      AND symbol_id = ANY(:symbol_ids)
                    ORDER BY path, span_start
                    """
                ),
                {"sid": rag_session_id, "symbol_ids": symbol_ids},
            ).mappings().fetchall()
        return [self._to_item(row, self._loads(row.get("metadata_json"))) for row in rows]

    def get_chunks_by_symbol_ids(
        self,
        rag_session_id: str,
        symbol_ids: list[str],
        prefer_chunk_type: str = "symbol_block",
    ) -> list[LayeredRetrievalItem]:
        """Map symbol ids to their best-overlapping C0 source chunk.

        Symbols without a location, or without any overlapping chunk, are
        silently dropped from the result.
        """
        symbols = self.get_symbols_by_ids(rag_session_id, symbol_ids)
        chunks: list[LayeredRetrievalItem] = []
        for symbol in symbols:
            location = symbol.location
            if location is None:
                continue
            chunk = self._chunk_for_symbol(rag_session_id, symbol, prefer_chunk_type=prefer_chunk_type)
            if chunk is not None:
                chunks.append(chunk)
        return chunks

    def _chunk_for_symbol(
        self,
        rag_session_id: str,
        symbol: LayeredRetrievalItem,
        *,
        prefer_chunk_type: str,
    ) -> LayeredRetrievalItem | None:
        """Pick the single C0 chunk that overlaps the symbol's line span.

        Preference order: chunks of ``prefer_chunk_type`` first, then the
        chunk whose start line is closest to the symbol's start line.
        """
        location = symbol.location
        if location is None:
            return None
        with get_engine().connect() as conn:
            rows = conn.execute(
                text(
                    """
                    SELECT path, content, layer, title, metadata_json, span_start, span_end
                    FROM rag_chunks
                    WHERE rag_session_id = :sid
                      AND layer = 'C0_SOURCE_CHUNKS'
                      AND path = :path
                      AND COALESCE(span_start, 0) <= :end_line
                      AND COALESCE(span_end, 999999) >= :start_line
                    ORDER BY
                      CASE WHEN CAST(metadata_json AS jsonb)->>'chunk_type' = :prefer_chunk_type THEN 0 ELSE 1 END,
                      ABS(COALESCE(span_start, 0) - :start_line)
                    LIMIT 1
                    """
                ),
                {
                    "sid": rag_session_id,
                    "path": location.path,
                    "start_line": location.start_line or 0,
                    "end_line": location.end_line or 999999,
                    "prefer_chunk_type": prefer_chunk_type,
                },
            ).mappings().fetchall()
        if not rows:
            return None
        row = rows[0]
        return self._to_item(row, self._loads(row.get("metadata_json")))

    def _to_item(self, row, metadata: dict) -> LayeredRetrievalItem:
        """Convert a DB row mapping plus parsed metadata into a retrieval item."""
        return LayeredRetrievalItem(
            source=str(row.get("path") or ""),
            content=str(row.get("content") or ""),
            layer=str(row.get("layer") or ""),
            title=str(row.get("title") or ""),
            metadata=metadata,
            location=CodeLocation(
                path=str(row.get("path") or ""),
                start_line=row.get("span_start"),
                end_line=row.get("span_end"),
            ),
        )

    def _loads(self, value) -> dict:
        """Parse ``metadata_json`` into a dict; falsy input yields {}.

        NOTE(review): assumes the column holds JSON text — a non-string value
        is stringified before parsing, which would fail for e.g. a dict repr;
        confirm the column type.
        """
        if not value:
            return {}
        return json.loads(str(value))

View File

@@ -0,0 +1,102 @@
from __future__ import annotations
import re
from app.modules.rag.explain.models import ExplainHints, ExplainIntent
from app.modules.rag.retrieval.query_terms import extract_query_terms
class ExplainIntentBuilder:
    """Turn a raw user question into a structured ``ExplainIntent``.

    Extraction is purely lexical: regexes pull file paths, routes, symbols
    and backtick-quoted commands out of the text, while keyword lists drive
    flags such as ``include_tests``, expected entrypoint types and depth.
    """

    _ROUTE_RE = re.compile(r"(/[A-Za-z0-9_./{}:-]+)")
    _FILE_RE = re.compile(r"([A-Za-z0-9_./-]+\.py)")
    _SYMBOL_RE = re.compile(r"\b([A-Z][A-Za-z0-9_]*\.[A-Za-z_][A-Za-z0-9_]*|[A-Z][A-Za-z0-9_]{2,}|[a-z_][A-Za-z0-9_]{2,})\b")
    _COMMAND_RE = re.compile(r"`([A-Za-z0-9:_-]+)`")
    # Substrings (Russian and English) whose presence means the user asks
    # about test coverage.
    _TEST_KEYWORDS = (
        "тест",
        "tests",
        "test ",
        "unit-test",
        "unit test",
        "юнит-тест",
        "pytest",
        "spec",
        "как покрыто тестами",
        "как проверяется",
        "how is it tested",
        "how it's tested",
    )

    def build(self, user_query: str) -> ExplainIntent:
        """Build the intent: normalized text, keywords, hints and flags."""
        collapsed = " ".join((user_query or "").split())
        folded = collapsed.lower()
        hints = ExplainHints(
            paths=self._dedupe(self._FILE_RE.findall(collapsed)),
            symbols=self._symbols(collapsed),
            endpoints=self._dedupe(self._ROUTE_RE.findall(collapsed)),
            commands=self._commands(collapsed, folded),
        )
        return ExplainIntent(
            raw_query=user_query,
            normalized_query=collapsed,
            keywords=self._keywords(collapsed)[:12],
            hints=hints,
            include_tests=self._include_tests(folded),
            expected_entry_types=self._entry_types(folded, hints),
            depth=self._depth(folded),
        )

    def _keywords(self, text: str) -> list[str]:
        # Base terms from the shared extractor, enriched with symbols and routes.
        terms = extract_query_terms(text)
        for extra in [*self._symbols(text), *self._ROUTE_RE.findall(text)]:
            if extra not in terms:
                terms.append(extra)
        return self._dedupe(terms)

    def _symbols(self, text: str) -> list[str]:
        # Identifier-looking tokens, minus short ones and python file names.
        candidates = [match.strip() for match in self._SYMBOL_RE.findall(text)]
        return self._dedupe([c for c in candidates if len(c) >= 3 and not c.endswith(".py")])

    def _commands(self, text: str, lowered: str) -> list[str]:
        # Backtick-quoted names, plus tokens following "command"/"cli" words.
        found = list(self._COMMAND_RE.findall(text))
        padded = f" {lowered} "
        if " command " in padded:
            found.extend(re.findall(r"command\s+([A-Za-z0-9:_-]+)", lowered))
        if " cli " in padded:
            found.extend(re.findall(r"cli\s+([A-Za-z0-9:_-]+)", lowered))
        return self._dedupe(found)

    def _entry_types(self, lowered: str, hints: ExplainHints) -> list[str]:
        # HTTP wins over CLI when both kinds of markers are present.
        if hints.endpoints or any(marker in lowered for marker in ("endpoint", "route", "handler", "http", "api")):
            return ["http"]
        if hints.commands or any(marker in lowered for marker in ("cli", "command", "click", "typer")):
            return ["cli"]
        return ["http", "cli"]

    def _depth(self, lowered: str) -> str:
        if any(marker in lowered for marker in ("deep", "подроб", "деталь", "full flow", "trace")):
            return "deep"
        if any(marker in lowered for marker in ("high level", "overview", "кратко", "summary")):
            return "high"
        return "medium"

    def _include_tests(self, lowered: str) -> bool:
        padded = f" {lowered} "
        return any(marker in padded for marker in self._TEST_KEYWORDS)

    def _dedupe(self, values: list[str]) -> list[str]:
        # Order-preserving dedupe of stripped, non-empty values.
        unique: list[str] = []
        for raw in values:
            cleaned = raw.strip()
            if cleaned and cleaned not in unique:
                unique.append(cleaned)
        return unique

View File

@@ -0,0 +1,289 @@
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Callable
from app.modules.rag.explain.models import CodeLocation, LayeredRetrievalItem
from app.modules.rag.retrieval.test_filter import build_test_filters, debug_disable_test_filter
LOGGER = logging.getLogger(__name__)
if TYPE_CHECKING:
from app.modules.rag.persistence.repository import RagRepository
from app.modules.rag_session.embedding.gigachat_embedder import GigaChatEmbedder
@dataclass(slots=True)
class LayerRetrievalResult:
    """Outcome of one layer retrieval: matched items plus failure diagnostics."""

    items: list[LayeredRetrievalItem]
    # Markers describing what could not be retrieved (e.g. "layer:X retrieval_failed:...").
    missing: list[str] = field(default_factory=list)
class LayeredRetrievalGateway:
    """Facade over repository retrieval with test-file filtering and retry.

    Applies path filters that exclude test files, and when a filtered query
    raises, retries once with the test filter disabled so callers still get
    best-effort results. Failures are never raised to callers: they are
    reported through ``LayerRetrievalResult.missing`` markers instead.
    """

    def __init__(self, repository: RagRepository, embedder: GigaChatEmbedder) -> None:
        self._repository = repository
        self._embedder = embedder

    def retrieve_layer(
        self,
        rag_session_id: str,
        query: str,
        layer: str,
        *,
        limit: int,
        path_prefixes: list[str] | None = None,
        exclude_tests: bool = True,
        prefer_non_tests: bool = False,
        include_spans: bool = False,
    ) -> LayerRetrievalResult:
        """Embed ``query`` and retrieve up to ``limit`` chunks from one layer.

        If retrieval raises after a successful embedding, one retry without
        the test filter is attempted; if the embedding itself failed there is
        nothing to retry with, so an empty result is returned immediately.
        """
        # A debug flag can globally force the test filter off.
        effective_exclude_tests = exclude_tests and not debug_disable_test_filter()
        filter_args = self._filter_args(effective_exclude_tests)
        query_embedding: list[float] | None = None
        try:
            query_embedding = self._embedder.embed([query])[0]
            rows = self._repository.retrieve(
                rag_session_id,
                query_embedding,
                query_text=query,
                limit=limit,
                layers=[layer],
                path_prefixes=path_prefixes,
                exclude_path_prefixes=filter_args["exclude_path_prefixes"],
                exclude_like_patterns=filter_args["exclude_like_patterns"],
                prefer_non_tests=prefer_non_tests or not effective_exclude_tests,
            )
            return self._success_result(
                rows,
                rag_session_id=rag_session_id,
                label="layered retrieval",
                include_spans=include_spans,
                layer=layer,
                exclude_tests=effective_exclude_tests,
                path_prefixes=path_prefixes,
            )
        except Exception as exc:
            # query_embedding is None only when embed() itself raised.
            if query_embedding is None:
                self._log_failure(
                    label="layered retrieval",
                    rag_session_id=rag_session_id,
                    layer=layer,
                    exclude_tests=effective_exclude_tests,
                    path_prefixes=path_prefixes,
                    exc=exc,
                )
                return LayerRetrievalResult(items=[], missing=[self._failure_missing(f"layer:{layer} retrieval_failed", exc)])
            # Retry reuses the already-computed embedding, without test filters.
            retry_result = self._retry_without_test_filter(
                operation=lambda: self._repository.retrieve(
                    rag_session_id,
                    query_embedding,
                    query_text=query,
                    limit=limit,
                    layers=[layer],
                    path_prefixes=path_prefixes,
                    exclude_path_prefixes=None,
                    exclude_like_patterns=None,
                    prefer_non_tests=True,
                ),
                label="layered retrieval",
                rag_session_id=rag_session_id,
                include_spans=include_spans,
                layer=layer,
                exclude_tests=effective_exclude_tests,
                path_prefixes=path_prefixes,
                exc=exc,
                missing_prefix=f"layer:{layer} retrieval_failed",
            )
            if retry_result is not None:
                return retry_result
            return LayerRetrievalResult(items=[], missing=[self._failure_missing(f"layer:{layer} retrieval_failed", exc)])

    def retrieve_lexical_code(
        self,
        rag_session_id: str,
        query: str,
        *,
        limit: int,
        path_prefixes: list[str] | None = None,
        exclude_tests: bool = True,
        include_spans: bool = False,
    ) -> LayerRetrievalResult:
        """Lexical (non-embedding) code retrieval with the same filter/retry policy."""
        effective_exclude_tests = exclude_tests and not debug_disable_test_filter()
        filter_args = self._filter_args(effective_exclude_tests)
        try:
            rows = self._repository.retrieve_lexical_code(
                rag_session_id,
                query_text=query,
                limit=limit,
                path_prefixes=path_prefixes,
                exclude_path_prefixes=filter_args["exclude_path_prefixes"],
                exclude_like_patterns=filter_args["exclude_like_patterns"],
                prefer_non_tests=not effective_exclude_tests,
            )
            return self._success_result(
                rows,
                rag_session_id=rag_session_id,
                label="lexical retrieval",
                include_spans=include_spans,
                exclude_tests=effective_exclude_tests,
                path_prefixes=path_prefixes,
            )
        except Exception as exc:
            retry_result = self._retry_without_test_filter(
                operation=lambda: self._repository.retrieve_lexical_code(
                    rag_session_id,
                    query_text=query,
                    limit=limit,
                    path_prefixes=path_prefixes,
                    exclude_path_prefixes=None,
                    exclude_like_patterns=None,
                    prefer_non_tests=True,
                ),
                label="lexical retrieval",
                rag_session_id=rag_session_id,
                include_spans=include_spans,
                exclude_tests=effective_exclude_tests,
                path_prefixes=path_prefixes,
                exc=exc,
                missing_prefix="layer:C0 lexical_retrieval_failed",
            )
            if retry_result is not None:
                return retry_result
            return LayerRetrievalResult(items=[], missing=[self._failure_missing("layer:C0 lexical_retrieval_failed", exc)])

    def _retry_without_test_filter(
        self,
        *,
        operation: Callable[[], list[dict]],
        label: str,
        rag_session_id: str,
        include_spans: bool,
        exclude_tests: bool,
        path_prefixes: list[str] | None,
        exc: Exception,
        missing_prefix: str,
        layer: str | None = None,
    ) -> LayerRetrievalResult | None:
        """Run ``operation`` once with test filters off; None means "give up".

        Returns ``None`` when the original call already ran unfiltered (no
        point retrying) or when the retry itself fails. A successful retry's
        result carries a ``missing`` marker recording the degraded mode.
        """
        if not exclude_tests:
            self._log_failure(
                label=label,
                rag_session_id=rag_session_id,
                layer=layer,
                exclude_tests=exclude_tests,
                path_prefixes=path_prefixes,
                exc=exc,
            )
            return None
        self._log_failure(
            label=label,
            rag_session_id=rag_session_id,
            layer=layer,
            exclude_tests=exclude_tests,
            path_prefixes=path_prefixes,
            exc=exc,
            retried_without_test_filter=True,
        )
        try:
            rows = operation()
        except Exception as retry_exc:
            self._log_failure(
                label=f"{label} retry",
                rag_session_id=rag_session_id,
                layer=layer,
                exclude_tests=False,
                path_prefixes=path_prefixes,
                exc=retry_exc,
            )
            return None
        result = self._success_result(
            rows,
            rag_session_id=rag_session_id,
            label=f"{label} retry",
            include_spans=include_spans,
            layer=layer,
            exclude_tests=False,
            path_prefixes=path_prefixes,
        )
        result.missing.append(f"{missing_prefix}:retried_without_test_filter")
        return result

    def _success_result(
        self,
        rows: list[dict],
        *,
        rag_session_id: str,
        label: str,
        include_spans: bool,
        exclude_tests: bool,
        path_prefixes: list[str] | None,
        layer: str | None = None,
    ) -> LayerRetrievalResult:
        """Convert raw rows to items and log a retrieval summary.

        NOTE(review): success is logged at WARNING level — presumably to make
        retrieval diagnostics visible in production logs; confirm intent.
        """
        items = [self._to_item(row, include_spans=include_spans) for row in rows]
        LOGGER.warning(
            "%s: rag_session_id=%s layer=%s exclude_tests=%s path_prefixes=%s returned_count=%s top_paths=%s",
            label,
            rag_session_id,
            layer,
            exclude_tests,
            path_prefixes or [],
            len(items),
            [item.source for item in items[:3]],
        )
        return LayerRetrievalResult(items=items)

    def _log_failure(
        self,
        *,
        label: str,
        rag_session_id: str,
        exclude_tests: bool,
        path_prefixes: list[str] | None,
        exc: Exception,
        layer: str | None = None,
        retried_without_test_filter: bool = False,
    ) -> None:
        """Log a retrieval failure with full context and traceback."""
        LOGGER.warning(
            "%s failed: rag_session_id=%s layer=%s exclude_tests=%s path_prefixes=%s retried_without_test_filter=%s error=%s",
            label,
            rag_session_id,
            layer,
            exclude_tests,
            path_prefixes or [],
            retried_without_test_filter,
            self._exception_summary(exc),
            exc_info=True,
        )

    def _filter_args(self, exclude_tests: bool) -> dict[str, list[str] | None]:
        """Build repository filter kwargs; both values are None when disabled."""
        test_filters = build_test_filters() if exclude_tests else None
        return {
            "exclude_path_prefixes": test_filters.exclude_path_prefixes if test_filters else None,
            "exclude_like_patterns": test_filters.exclude_like_patterns if test_filters else None,
        }

    def _failure_missing(self, prefix: str, exc: Exception) -> str:
        """Format a ``missing`` marker combining the prefix and the exception."""
        return f"{prefix}:{self._exception_summary(exc)}"

    def _exception_summary(self, exc: Exception) -> str:
        """One-line 'Type:message' summary, whitespace-collapsed, max ~180 chars."""
        message = " ".join(str(exc).split())
        if len(message) > 180:
            message = message[:177] + "..."
        return f"{type(exc).__name__}:{message or 'no_message'}"

    def _to_item(self, row: dict, *, include_spans: bool) -> LayeredRetrievalItem:
        """Map a repository row to an item; location only when spans are requested."""
        location = None
        if include_spans:
            location = CodeLocation(
                path=str(row.get("path") or ""),
                start_line=row.get("span_start"),
                end_line=row.get("span_end"),
            )
        return LayeredRetrievalItem(
            source=str(row.get("path") or ""),
            content=str(row.get("content") or ""),
            layer=str(row.get("layer") or ""),
            title=str(row.get("title") or ""),
            metadata=dict(row.get("metadata", {}) or {}),
            score=row.get("distance"),
            location=location,
        )

View File

@@ -0,0 +1,91 @@
from __future__ import annotations
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field
class ExplainHints(BaseModel):
    """Concrete artifacts lexically extracted from the user's question."""

    model_config = ConfigDict(extra="forbid")
    paths: list[str] = Field(default_factory=list)      # *.py file paths mentioned in the query
    symbols: list[str] = Field(default_factory=list)    # candidate class/function identifiers
    endpoints: list[str] = Field(default_factory=list)  # URL-style route fragments
    commands: list[str] = Field(default_factory=list)   # CLI command names
class ExplainIntent(BaseModel):
    """Structured interpretation of a user's code-explanation question."""

    model_config = ConfigDict(extra="forbid")
    raw_query: str        # the question exactly as the user typed it
    normalized_query: str # whitespace-collapsed form of raw_query
    keywords: list[str] = Field(default_factory=list)  # search terms for retrieval
    hints: ExplainHints = Field(default_factory=ExplainHints)
    include_tests: bool = False  # True when the question is about test coverage
    expected_entry_types: list[Literal["http", "cli"]] = Field(default_factory=list)
    depth: Literal["high", "medium", "deep"] = "medium"  # requested explanation detail
class CodeLocation(BaseModel):
    """A file path with an optional inclusive line range."""

    model_config = ConfigDict(extra="forbid")
    path: str
    start_line: int | None = None
    end_line: int | None = None
class LayeredRetrievalItem(BaseModel):
    """One retrieved chunk from a RAG layer, with optional source location."""

    model_config = ConfigDict(extra="forbid")
    source: str  # originating file path
    content: str # chunk text
    layer: str   # RAG layer identifier (e.g. C0/C1/C2 layer names)
    title: str
    metadata: dict[str, Any] = Field(default_factory=dict)
    score: float | None = None  # retrieval distance when provided by the backend
    location: CodeLocation | None = None
class TracePath(BaseModel):
    """An ordered chain of symbol ids traced through the dependency graph."""

    model_config = ConfigDict(extra="forbid")
    symbol_ids: list[str] = Field(default_factory=list)  # symbols in traversal order
    score: float = 0.0
    entrypoint_id: str | None = None  # id of the entrypoint the trace started from
    notes: list[str] = Field(default_factory=list)
class EvidenceItem(BaseModel):
    """A citable piece of evidence backing part of an explanation."""

    model_config = ConfigDict(extra="forbid")
    evidence_id: str  # stable key used to reference this item from the prompt
    kind: Literal["entrypoint", "symbol", "edge", "excerpt"]
    summary: str
    location: CodeLocation | None = None
    supports: list[str] = Field(default_factory=list)  # symbol ids this evidence supports
class CodeExcerpt(BaseModel):
    """A snippet of source code selected for inclusion in the prompt."""

    model_config = ConfigDict(extra="forbid")
    evidence_id: str
    symbol_id: str | None = None
    title: str
    path: str
    start_line: int | None = None
    end_line: int | None = None
    content: str
    focus: str = "overview"  # "overview" for whole chunk, "focus" for a highlighted window
class ExplainPack(BaseModel):
    """Aggregated retrieval result handed to prompt building for code explanation."""

    model_config = ConfigDict(extra="forbid")
    intent: ExplainIntent
    selected_entrypoints: list[LayeredRetrievalItem] = Field(default_factory=list)
    seed_symbols: list[LayeredRetrievalItem] = Field(default_factory=list)
    trace_paths: list[TracePath] = Field(default_factory=list)
    evidence_index: dict[str, EvidenceItem] = Field(default_factory=dict)
    code_excerpts: list[CodeExcerpt] = Field(default_factory=list)
    missing: list[str] = Field(default_factory=list)    # markers for data that could not be retrieved
    conflicts: list[str] = Field(default_factory=list)  # markers for contradictory evidence

View File

@@ -0,0 +1,328 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
from app.modules.rag.contracts.enums import RagLayer
from app.modules.rag.explain.intent_builder import ExplainIntentBuilder
from app.modules.rag.explain.layered_gateway import LayerRetrievalResult, LayeredRetrievalGateway
from app.modules.rag.explain.models import CodeExcerpt, EvidenceItem, ExplainPack, LayeredRetrievalItem
from app.modules.rag.explain.source_excerpt_fetcher import SourceExcerptFetcher
from app.modules.rag.explain.trace_builder import TraceBuilder
from app.modules.rag.retrieval.test_filter import exclude_tests_default, is_test_path
LOGGER = logging.getLogger(__name__)
_MIN_EXCERPTS = 2
if TYPE_CHECKING:
from app.modules.rag.explain.graph_repository import CodeGraphRepository
from app.modules.rag.explain.models import ExplainIntent
class CodeExplainRetrieverV2:
def __init__(
self,
gateway: LayeredRetrievalGateway,
graph_repository: CodeGraphRepository,
intent_builder: ExplainIntentBuilder | None = None,
trace_builder: TraceBuilder | None = None,
excerpt_fetcher: SourceExcerptFetcher | None = None,
) -> None:
self._gateway = gateway
self._graph = graph_repository
self._intent_builder = intent_builder or ExplainIntentBuilder()
self._trace_builder = trace_builder or TraceBuilder(graph_repository)
self._excerpt_fetcher = excerpt_fetcher or SourceExcerptFetcher(graph_repository)
def build_pack(
self,
rag_session_id: str,
user_query: str,
*,
file_candidates: list[dict] | None = None,
) -> ExplainPack:
intent = self._intent_builder.build(user_query)
path_prefixes = _path_prefixes(intent, file_candidates or [])
exclude_tests = exclude_tests_default() and not intent.include_tests
pack = self._run_pass(rag_session_id, intent, path_prefixes, exclude_tests=exclude_tests)
if exclude_tests and len(pack.code_excerpts) < _MIN_EXCERPTS:
self._merge_test_fallback(pack, rag_session_id, intent, path_prefixes)
self._log_pack(rag_session_id, pack)
return pack
def _run_pass(
self,
rag_session_id: str,
intent: ExplainIntent,
path_prefixes: list[str],
*,
exclude_tests: bool,
) -> ExplainPack:
missing: list[str] = []
entrypoints_result = self._entrypoints(rag_session_id, intent, path_prefixes, exclude_tests=exclude_tests)
missing.extend(entrypoints_result.missing)
selected_entrypoints = self._filter_entrypoints(intent, entrypoints_result.items)
if not selected_entrypoints:
missing.append("layer:C3 empty")
seed_result = self._seed_symbols(rag_session_id, intent, path_prefixes, selected_entrypoints, exclude_tests=exclude_tests)
missing.extend(seed_result.missing)
seed_symbols = seed_result.items
if not seed_symbols:
missing.append("layer:C1 empty")
depth = 4 if intent.depth == "deep" else 3 if intent.depth == "medium" else 2
trace_paths = self._trace_builder.build_paths(rag_session_id, seed_symbols, max_depth=depth) if seed_symbols else []
excerpts, excerpt_evidence = self._excerpt_fetcher.fetch(rag_session_id, trace_paths) if trace_paths else ([], {})
if not excerpts:
lexical_result = self._gateway.retrieve_lexical_code(
rag_session_id,
intent.normalized_query,
limit=6,
path_prefixes=path_prefixes or None,
exclude_tests=exclude_tests,
include_spans=True,
)
missing.extend(lexical_result.missing)
excerpts, excerpt_evidence = _lexical_excerpts(lexical_result.items)
if not excerpts:
missing.append("layer:C0 empty")
evidence_index = _evidence_index(selected_entrypoints, seed_symbols)
evidence_index.update(excerpt_evidence)
missing.extend(_missing(selected_entrypoints, seed_symbols, trace_paths, excerpts))
return ExplainPack(
intent=intent,
selected_entrypoints=selected_entrypoints,
seed_symbols=seed_symbols,
trace_paths=trace_paths,
evidence_index=evidence_index,
code_excerpts=excerpts,
missing=_cleanup_missing(_dedupe(missing), has_excerpts=bool(excerpts)),
conflicts=[],
)
def _merge_test_fallback(
self,
pack: ExplainPack,
rag_session_id: str,
intent: ExplainIntent,
path_prefixes: list[str],
) -> None:
lexical_result = self._gateway.retrieve_lexical_code(
rag_session_id,
intent.normalized_query,
limit=6,
path_prefixes=path_prefixes or None,
exclude_tests=False,
include_spans=True,
)
excerpt_offset = len([key for key in pack.evidence_index if key.startswith("excerpt_")])
excerpts, evidence = _lexical_excerpts(
lexical_result.items,
start_index=excerpt_offset,
is_test_fallback=True,
)
if not excerpts:
pack.missing = _dedupe(pack.missing + lexical_result.missing)
return
seen = {(item.path, item.start_line, item.end_line, item.content) for item in pack.code_excerpts}
for excerpt in excerpts:
key = (excerpt.path, excerpt.start_line, excerpt.end_line, excerpt.content)
if key in seen:
continue
pack.code_excerpts.append(excerpt)
seen.add(key)
pack.evidence_index.update(evidence)
pack.missing = _cleanup_missing(_dedupe(pack.missing + lexical_result.missing), has_excerpts=bool(pack.code_excerpts))
def _entrypoints(
self,
rag_session_id: str,
intent: ExplainIntent,
path_prefixes: list[str],
*,
exclude_tests: bool,
) -> LayerRetrievalResult:
return self._gateway.retrieve_layer(
rag_session_id,
intent.normalized_query,
RagLayer.CODE_ENTRYPOINTS,
limit=6,
path_prefixes=path_prefixes or None,
exclude_tests=exclude_tests,
prefer_non_tests=True,
include_spans=True,
)
def _filter_entrypoints(self, intent: ExplainIntent, items: list[LayeredRetrievalItem]) -> list[LayeredRetrievalItem]:
if not intent.expected_entry_types:
return items[:3]
filtered = [item for item in items if str(item.metadata.get("entry_type") or "") in intent.expected_entry_types]
return filtered[:3] or items[:3]
def _seed_symbols(
self,
rag_session_id: str,
intent: ExplainIntent,
path_prefixes: list[str],
entrypoints: list[LayeredRetrievalItem],
*,
exclude_tests: bool,
) -> LayerRetrievalResult:
symbol_result = self._gateway.retrieve_layer(
rag_session_id,
intent.normalized_query,
RagLayer.CODE_SYMBOL_CATALOG,
limit=12,
path_prefixes=path_prefixes or None,
exclude_tests=exclude_tests,
prefer_non_tests=True,
include_spans=True,
)
handlers: list[LayeredRetrievalItem] = []
handler_ids = [str(item.metadata.get("handler_symbol_id") or "") for item in entrypoints]
if handler_ids:
handlers = self._graph.get_symbols_by_ids(rag_session_id, [item for item in handler_ids if item])
seeds: list[LayeredRetrievalItem] = []
seen: set[str] = set()
for item in handlers + symbol_result.items:
symbol_id = str(item.metadata.get("symbol_id") or "")
if not symbol_id or symbol_id in seen:
continue
seen.add(symbol_id)
seeds.append(item)
if len(seeds) >= 8:
break
return LayerRetrievalResult(items=seeds, missing=list(symbol_result.missing))
def _log_pack(self, rag_session_id: str, pack: ExplainPack) -> None:
    """Emit a one-line summary of the assembled explain pack."""
    prod_excerpt_count = len([excerpt for excerpt in pack.code_excerpts if not _is_test_excerpt(excerpt)])
    test_excerpt_count = len(pack.code_excerpts) - prod_excerpt_count
    # NOTE(review): logged at warning level although this looks like a routine
    # summary — presumably to surface it under default log config; confirm intent.
    LOGGER.warning(
        "code explain pack: rag_session_id=%s entrypoints=%s seeds=%s paths=%s excerpts=%s prod_excerpt_count=%s test_excerpt_count=%s missing=%s",
        rag_session_id,
        len(pack.selected_entrypoints),
        len(pack.seed_symbols),
        len(pack.trace_paths),
        len(pack.code_excerpts),
        prod_excerpt_count,
        test_excerpt_count,
        pack.missing,
    )
def _evidence_index(
    entrypoints: list[LayeredRetrievalItem],
    seed_symbols: list[LayeredRetrievalItem],
) -> dict[str, EvidenceItem]:
    """Build the evidence map for selected entrypoints and seed symbols.

    Evidence ids are ``entrypoint_N`` / ``symbol_N`` (1-based). When an item
    carries no symbol id in its metadata, ``supports`` is left empty instead
    of recording a bogus ``""`` entry — consistent with how excerpt evidence
    is recorded elsewhere in this module (``supports=[symbol_id] if symbol_id
    else []``).
    """
    result: dict[str, EvidenceItem] = {}
    for index, item in enumerate(entrypoints, start=1):
        evidence_id = f"entrypoint_{index}"
        handler_symbol_id = str(item.metadata.get("handler_symbol_id") or "")
        result[evidence_id] = EvidenceItem(
            evidence_id=evidence_id,
            kind="entrypoint",
            summary=item.title,
            location=item.location,
            # Skip empty handler ids so consumers never see a "" support.
            supports=[handler_symbol_id] if handler_symbol_id else [],
        )
    for index, item in enumerate(seed_symbols, start=1):
        evidence_id = f"symbol_{index}"
        symbol_id = str(item.metadata.get("symbol_id") or "")
        result[evidence_id] = EvidenceItem(
            evidence_id=evidence_id,
            kind="symbol",
            summary=item.title,
            location=item.location,
            supports=[symbol_id] if symbol_id else [],
        )
    return result
def _missing(
entrypoints: list[LayeredRetrievalItem],
seed_symbols: list[LayeredRetrievalItem],
trace_paths,
excerpts,
) -> list[str]:
missing: list[str] = []
if not entrypoints:
missing.append("entrypoints")
if not seed_symbols:
missing.append("seed_symbols")
if not trace_paths:
missing.append("trace_paths")
if not excerpts:
missing.append("code_excerpts")
return missing
def _lexical_excerpts(
    items: list[LayeredRetrievalItem],
    *,
    start_index: int = 0,
    is_test_fallback: bool = False,
) -> tuple[list[CodeExcerpt], dict[str, EvidenceItem]]:
    """Convert lexical retrieval items into excerpts plus evidence entries.

    Evidence ids continue the ``excerpt_N`` sequence starting after
    ``start_index``. Items from test code get the ``test:lexical`` focus.
    (``is_test_fallback`` currently does not change the focus label.)
    """
    excerpts: list[CodeExcerpt] = []
    evidence_index: dict[str, EvidenceItem] = {}
    for offset, item in enumerate(items, start=1):
        evidence_id = f"excerpt_{start_index + offset}"
        location = item.location
        evidence_index[evidence_id] = EvidenceItem(
            evidence_id=evidence_id,
            kind="excerpt",
            summary=item.title or item.source,
            location=location,
            supports=[],
        )
        focus = "test:lexical" if _item_is_test(item) else "lexical"
        excerpts.append(
            CodeExcerpt(
                evidence_id=evidence_id,
                symbol_id=str(item.metadata.get("symbol_id") or "") or None,
                title=item.title or item.source,
                path=item.source,
                start_line=location.start_line if location else None,
                end_line=location.end_line if location else None,
                content=item.content,
                focus=focus,
            )
        )
    return excerpts, evidence_index
def _item_is_test(item: LayeredRetrievalItem) -> bool:
    """True when the item is marked as test code or lives under a test path."""
    if item.metadata.get("is_test"):
        return True
    return is_test_path(item.source)
def _is_test_excerpt(excerpt: CodeExcerpt) -> bool:
    """True when the excerpt's focus is test-flagged or its path is a test path."""
    if excerpt.focus.startswith("test:"):
        return True
    return is_test_path(excerpt.path)
def _path_prefixes(intent: ExplainIntent, file_candidates: list[dict]) -> list[str]:
values: list[str] = []
for path in intent.hints.paths:
prefix = path.rsplit("/", 1)[0] if "/" in path else path
if prefix and prefix not in values:
values.append(prefix)
for item in file_candidates[:6]:
path = str(item.get("path") or "")
prefix = path.rsplit("/", 1)[0] if "/" in path else ""
if prefix and prefix not in values:
values.append(prefix)
return values
def _cleanup_missing(values: list[str], *, has_excerpts: bool) -> list[str]:
if not has_excerpts:
return values
return [value for value in values if value not in {"code_excerpts", "layer:C0 empty"}]
def _dedupe(values: list[str]) -> list[str]:
result: list[str] = []
for value in values:
item = value.strip()
if item and item not in result:
result.append(item)
return result

View File

@@ -0,0 +1,53 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from app.modules.rag.explain.excerpt_planner import ExcerptPlanner
from app.modules.rag.explain.models import CodeExcerpt, EvidenceItem, TracePath
from app.modules.rag.retrieval.test_filter import is_test_path
if TYPE_CHECKING:
from app.modules.rag.explain.graph_repository import CodeGraphRepository
class SourceExcerptFetcher:
    """Fetches planned code excerpts for the symbols referenced by trace paths."""

    def __init__(self, graph_repository: CodeGraphRepository, planner: ExcerptPlanner | None = None) -> None:
        # The graph repository resolves symbol ids to stored code chunks.
        self._graph = graph_repository
        self._planner = planner or ExcerptPlanner()

    def fetch(
        self,
        rag_session_id: str,
        trace_paths: list[TracePath],
        *,
        max_excerpts: int = 40,
    ) -> tuple[list[CodeExcerpt], dict[str, EvidenceItem]]:
        """Return planned excerpts plus an evidence index for *trace_paths*.

        Symbols are visited in trace-path order (first occurrence wins) and
        the total number of excerpts is capped at ``max_excerpts``. Excerpts
        originating from test code get a ``test:`` focus prefix.
        """
        # Collect unique symbol ids, preserving their order across all paths.
        ordered_symbol_ids: list[str] = []
        for path in trace_paths:
            for symbol_id in path.symbol_ids:
                if symbol_id and symbol_id not in ordered_symbol_ids:
                    ordered_symbol_ids.append(symbol_id)
        chunks = self._graph.get_chunks_by_symbol_ids(rag_session_id, ordered_symbol_ids)
        excerpts: list[CodeExcerpt] = []
        evidence_index: dict[str, EvidenceItem] = {}
        for chunk in chunks:
            symbol_id = str(chunk.metadata.get("symbol_id") or "")
            # One evidence entry per chunk; ids are sequential per call.
            evidence_id = f"excerpt_{len(evidence_index) + 1}"
            location = chunk.location
            evidence_index[evidence_id] = EvidenceItem(
                evidence_id=evidence_id,
                kind="excerpt",
                summary=chunk.title,
                location=location,
                supports=[symbol_id] if symbol_id else [],
            )
            is_test_chunk = bool(chunk.metadata.get("is_test")) or is_test_path(location.path if location else chunk.source)
            for excerpt in self._planner.plan(chunk, evidence_id=evidence_id, symbol_id=symbol_id):
                if len(excerpts) >= max_excerpts:
                    break
                # Mark test-code excerpts so downstream ranking can demote them.
                if is_test_chunk and not excerpt.focus.startswith("test:"):
                    excerpt.focus = f"test:{excerpt.focus}"
                excerpts.append(excerpt)
            if len(excerpts) >= max_excerpts:
                break
        return excerpts, evidence_index

View File

@@ -0,0 +1,102 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from app.modules.rag.explain.models import LayeredRetrievalItem, TracePath
if TYPE_CHECKING:
from app.modules.rag.explain.graph_repository import CodeGraphRepository
class TraceBuilder:
    """Builds scored traces through the code graph starting from seed symbols."""

    def __init__(self, graph_repository: CodeGraphRepository) -> None:
        self._graph = graph_repository

    def build_paths(
        self,
        rag_session_id: str,
        seed_symbols: list[LayeredRetrievalItem],
        *,
        max_depth: int,
        max_paths: int = 3,
        edge_types: list[str] | None = None,
    ) -> list[TracePath]:
        """Breadth-first expand each seed along graph edges; return best paths.

        Paths are scored hop-by-hop (resolved edges and same-file hops score
        higher, test-file edges are penalized), deduplicated, and truncated to
        the ``max_paths`` highest-scoring unique paths. Falls back to a single
        zero-score path built from the first seed when no path was produced.
        """
        edges_filter = edge_types or ["calls", "imports", "inherits"]
        symbol_map = self._symbol_map(seed_symbols)
        paths: list[TracePath] = []
        for seed in seed_symbols:
            seed_id = str(seed.metadata.get("symbol_id") or "")
            if not seed_id:
                continue
            # Queue entries: (path so far, accumulated score, notes).
            queue: list[tuple[list[str], float, list[str]]] = [([seed_id], 0.0, [])]
            # Explore at most 3x the requested path budget per seed.
            while queue and len(paths) < max_paths * 3:
                current_path, score, notes = queue.pop(0)
                src_symbol_id = current_path[-1]
                out_edges = self._graph.get_out_edges(rag_session_id, [src_symbol_id], edges_filter, limit_per_src=4)
                if not out_edges or len(current_path) >= max_depth:
                    # Dead end or depth limit reached: record as a terminal path.
                    paths.append(TracePath(symbol_ids=current_path, score=score, notes=notes))
                    continue
                for edge in out_edges:
                    metadata = edge.metadata
                    dst_symbol_id = str(metadata.get("dst_symbol_id") or "")
                    next_notes = list(notes)
                    next_score = score + self._edge_score(edge, symbol_map.get(src_symbol_id))
                    if not dst_symbol_id:
                        # Unresolved edge: try resolving the textual reference,
                        # rewarding a successful resolution with a score bonus.
                        dst_ref = str(metadata.get("dst_ref") or "")
                        package_hint = self._package_hint(symbol_map.get(src_symbol_id))
                        resolved = self._graph.resolve_symbol_by_ref(rag_session_id, dst_ref, package_hint=package_hint)
                        if resolved is not None:
                            dst_symbol_id = str(resolved.metadata.get("symbol_id") or "")
                            symbol_map[dst_symbol_id] = resolved
                            next_score += 2.0
                            next_notes.append(f"resolved:{dst_ref}")
                    if not dst_symbol_id or dst_symbol_id in current_path:
                        # Unresolvable target or a cycle: terminate the path here.
                        paths.append(TracePath(symbol_ids=current_path, score=next_score, notes=next_notes))
                        continue
                    if dst_symbol_id not in symbol_map:
                        symbols = self._graph.get_symbols_by_ids(rag_session_id, [dst_symbol_id])
                        if symbols:
                            symbol_map[dst_symbol_id] = symbols[0]
                    queue.append((current_path + [dst_symbol_id], next_score, next_notes))
        unique = self._unique_paths(paths)
        unique.sort(key=lambda item: item.score, reverse=True)
        return unique[:max_paths] or [TracePath(symbol_ids=[seed.metadata.get("symbol_id", "")], score=0.0) for seed in seed_symbols[:1]]

    def _edge_score(self, edge: LayeredRetrievalItem, source_symbol: LayeredRetrievalItem | None) -> float:
        """Score a single hop: resolved and same-file edges up, test files down."""
        metadata = edge.metadata
        score = 1.0
        if str(metadata.get("resolution") or "") == "resolved":
            score += 2.0
        source_path = source_symbol.source if source_symbol is not None else ""
        if source_path and edge.source == source_path:
            score += 1.0
        if "tests/" in edge.source or "/tests/" in edge.source:
            score -= 3.0
        return score

    def _package_hint(self, symbol: LayeredRetrievalItem | None) -> str | None:
        """Parent package of the symbol's module, used to resolve relative refs."""
        if symbol is None:
            return None
        package = str(symbol.metadata.get("package_or_module") or "")
        if not package:
            return None
        # Fall back to the full package when it has no parent segment.
        return ".".join(package.split(".")[:-1]) or package

    def _symbol_map(self, items: list[LayeredRetrievalItem]) -> dict[str, LayeredRetrievalItem]:
        """Index retrieval items by symbol_id, skipping items without one."""
        result: dict[str, LayeredRetrievalItem] = {}
        for item in items:
            symbol_id = str(item.metadata.get("symbol_id") or "")
            if symbol_id:
                result[symbol_id] = item
        return result

    def _unique_paths(self, items: list[TracePath]) -> list[TracePath]:
        """Deduplicate by the non-empty symbol id sequence, keeping first wins."""
        result: list[TracePath] = []
        seen: set[tuple[str, ...]] = set()
        for item in items:
            key = tuple(symbol_id for symbol_id in item.symbol_ids if symbol_id)
            if not key or key in seen:
                continue
            seen.add(key)
            result.append(item)
        return result

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.code_text.chunker import CodeChunk
from app.modules.rag.retrieval.test_filter import is_test_path
class CodeTextDocumentBuilder:
@@ -17,6 +18,7 @@ class CodeTextDocumentBuilder:
"chunk_index": chunk_index,
"chunk_type": chunk.chunk_type,
"module_or_unit": source.path.replace("/", ".").removesuffix(".py"),
"is_test": is_test_path(source.path),
"artifact_type": "CODE",
},
)

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.edges.extractor import PyEdge
from app.modules.rag.retrieval.test_filter import is_test_path
class EdgeDocumentBuilder:
@@ -22,6 +23,7 @@ class EdgeDocumentBuilder:
"dst_symbol_id": edge.dst_symbol_id,
"dst_ref": edge.dst_ref,
"resolution": edge.resolution,
"is_test": is_test_path(source.path),
"lang_payload": edge.metadata,
"artifact_type": "CODE",
},

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
from app.modules.rag.retrieval.test_filter import is_test_path
class EntrypointDocumentBuilder:
@@ -19,6 +20,7 @@ class EntrypointDocumentBuilder:
"framework": entrypoint.framework,
"route_or_command": entrypoint.route_or_command,
"handler_symbol_id": entrypoint.handler_symbol_id,
"is_test": is_test_path(source.path),
"lang_payload": entrypoint.metadata,
"artifact_type": "CODE",
},

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
from app.modules.rag.indexing.code.symbols.extractor import PySymbol
from app.modules.rag.retrieval.test_filter import is_test_path
class SymbolDocumentBuilder:
@@ -26,6 +27,7 @@ class SymbolDocumentBuilder:
"parent_symbol_id": symbol.parent_symbol_id,
"package_or_module": source.path.replace("/", ".").removesuffix(".py"),
"is_entry_candidate": bool(symbol.decorators),
"is_test": is_test_path(source.path),
"lang_payload": symbol.lang_payload,
"artifact_type": "CODE",
},

View File

@@ -0,0 +1,201 @@
# Intent Router Specification (MVP) — v1.1
Version: 1.1
Scope: Routing + query normalization + anchor extraction for layered RAG (CODE + DOCS)
---
## 1) Цель
Intent Router принимает:
- `user_query: string`
- `conversation_state: object`
- `repo_context: object` (язык/структура репо/доступные слои)
И возвращает:
- `intent`
- `graph_id`
- `conversation_mode`
- `query_plan` (нормализация + якоря)
- `retrieval_spec` (запрос по слоям RAG)
- `evidence_policy`
Router **не делает** retrieval и **не генерирует** ответ.
---
## 2) MVP интенты (строго 4)
- `CODE_QA` — объяснение/поиск по коду
- `DOCS_QA` — объяснение/поиск по документации
- `GENERATE_DOCS_FROM_CODE` — генерация документации по коду
- `PROJECT_MISC` — прочие вопросы по проекту
---
## 3) Диалоговый режим (контекст темы)
### 3.1 Политика
Router обязан сохранять intent в рамках темы.
- Если `conversation_state.active_intent` задан
- и нет явного сигнала смены темы
- то `intent = conversation_state.active_intent` и `conversation_mode = CONTINUE`
Смена intent допускается только если:
- есть явный сигнал смены домена/задачи, или
- новый запрос явно не соответствует текущему intent (жёсткое несоответствие)
---
## 4) Обязательная нормализация запроса и извлечение якорей
Router обязан выполнять:
### 4.1 Query normalization
Выход должен содержать:
- `raw` — исходный запрос
- `normalized` — каноническая, детерминированная и meaning-preserving форма `raw`
- `expansions[]` — добавочные токены для retrieval/rerank
- `keyword_hints[]` — компактные ключевые токены (символы/пути/доменные термины)
Требования:
- `raw` хранит исходную строку пользователя без изменений
- `normalized` строится **только** из `raw` и безопасных правил форматирования
- `normalized` не должен включать appended expansions, синонимы и догаданные keywords
- все enrichment должны жить только в `expansions[]`, `keyword_hints[]`, `anchors[]`
### 4.2 RU→EN mapping (минимальный словарь)
Router обязан поддерживать RU→EN mapping терминов только как `expansions`:
- `класс``class`
- `метод``method`
- `функция``function`, `def`
- `модуль``module`
- `пакет``package`
- `файл``file`
- `тест`, `юнит-тест``test`, `unit test`
Словарь должен быть расширяемым, но эти ключи обязательны.
### 4.3 Anchor extraction (якоря)
Router обязан извлекать **явные якоря** из user_query и conversation_state:
Типы якорей:
- `FILE_PATH` — путь/часть пути (`src/...`, `package/module.py`, `README.md`)
- `SYMBOL` — идентификатор (CamelCase, snake_case, dotted path)
- `DOC_REF` — ссылка на doc file/section (если есть явные маркеры)
- `KEY_TERM` — важные термины, влияющие на retrieval (класс/метод/функция и т.п.)
Каждый якорь должен возвращаться структурировано.
---
## 5) Контракт выхода Router
Top-level:
```json
{
"schema_version": "1.1",
"intent": "CODE_QA",
"graph_id": "CodeQAGraph",
"conversation_mode": "CONTINUE",
"query_plan": {
"raw": "",
"normalized": "",
"expansions": [],
"keyword_hints": [],
"anchors": []
},
"retrieval_spec": {
"domains": [],
"layer_queries": [],
"filters": {},
"rerank_profile": ""
},
"evidence_policy": {
"require_def": false,
"require_flow": false,
"require_spec": false,
"allow_answer_without_evidence": false
}
}
```

## 6) query_plan.anchors контракт
{
"type": "FILE_PATH | SYMBOL | DOC_REF | KEY_TERM",
"value": "string",
"subtype": "optional string",
"span": { "start": 0, "end": 0 },
"confidence": 0.0
}
Требования:
- FILE_PATH.value хранит путь как в запросе (без попытки “исправить”)
- SYMBOL.value хранит символ как в запросе (с сохранением регистра)
- KEY_TERM используется для выставления expected evidence и выбора слоёв
- anchors может быть пустым, но router должен пытаться извлечь их всегда
## 7) retrieval_spec контракт (слои + фильтры)
### 7.1 Структура
{
"domains": ["CODE", "DOCS"],
"layer_queries": [
{ "layer_id": "C1", "top_k": 30 },
{ "layer_id": "C3", "top_k": 15 }
],
"filters": {
"test_policy": "EXCLUDE",
"path_scope": [],
"language": []
},
"rerank_profile": "code"
}
### 7.2 Требования по intent
- CODE_QA → domains = ["CODE"], rerank_profile="code"
- DOCS_QA → domains = ["DOCS"], rerank_profile="docs"
- GENERATE_DOCS_FROM_CODE → domains = ["CODE"], rerank_profile="generate"
- PROJECT_MISC → domains = ["CODE","DOCS"], rerank_profile="project"
### 7.3 Требования по якорям
- Если найден FILE_PATH → router обязан добавить filters.path_scope (минимум: этот путь/директория)
- Если найден SYMBOL → router обязан добавить SYMBOL в query_plan.keyword_hints и query_plan.expansions (при необходимости)
- Если найден KEY_TERM (например "класс") → router обязан добавить RU→EN expansions
## 8) evidence_policy (минимальные требования)
{
"require_def": true,
"require_flow": true,
"require_spec": false,
"allow_answer_without_evidence": false
}
Требования:
- CODE_QA: require_def=true; require_flow=true
- DOCS_QA: require_spec=true
- GENERATE_DOCS_FROM_CODE: require_def=true
- PROJECT_MISC: allow_answer_without_evidence=true
## 9) Минимально обязательные поля (строго)
Router обязан всегда возвращать:
- intent
- graph_id
- conversation_mode
- query_plan.raw
- query_plan.normalized
- query_plan.expansions
- query_plan.anchors
- retrieval_spec.domains
- retrieval_spec.layer_queries
- retrieval_spec.filters.test_policy
- retrieval_spec.rerank_profile
- evidence_policy.*

View File

@@ -0,0 +1,23 @@
from app.modules.rag.intent_router_v2.factory import GigaChatIntentRouterFactory
from app.modules.rag.intent_router_v2.local_runner import IntentRouterScenarioRunner
from app.modules.rag.intent_router_v2.models import (
ConversationState,
IntentDecision,
IntentRouterResult,
QueryAnchor,
QueryPlan,
RepoContext,
)
from app.modules.rag.intent_router_v2.router import IntentRouterV2
__all__ = [
"ConversationState",
"GigaChatIntentRouterFactory",
"IntentDecision",
"IntentRouterResult",
"IntentRouterScenarioRunner",
"IntentRouterV2",
"QueryAnchor",
"QueryPlan",
"RepoContext",
]

View File

@@ -0,0 +1,144 @@
from __future__ import annotations
import re
from app.modules.rag.intent_router_v2.models import AnchorSpan, QueryAnchor
from app.modules.rag.intent_router_v2.normalization_terms import KeyTermCanonicalizer
from app.modules.rag.intent_router_v2.symbol_rules import COMMON_PATH_SEGMENTS, PY_KEYWORDS
from app.modules.rag.intent_router_v2.term_mapping import RuEnTermMapper
_FILE_PATTERN = re.compile(r"(?P<value>\b(?:[\w.-]+/)*[\w.-]+\.(?:py|md|rst|txt|yaml|yml|json|toml|ini|cfg)\b)")
_PATH_HINT_PATTERN = re.compile(r"(?P<value>\b(?:src|app|docs|tests)/[\w./-]*[\w-]\b)")
_SYMBOL_PATTERN = re.compile(
r"\b(?P<value>[A-Z][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)+|[A-Z][A-Za-z0-9_]{2,}|[a-z_][A-Za-z0-9_]{2,})\b"
)
_DOC_SECTION_PATTERN = re.compile(r"(?:section|раздел)\s+[\"'`#]?(?P<value>[A-Za-zА-Яа-я0-9_ ./:-]{2,})", re.IGNORECASE)
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё-]+")
class AnchorExtractor:
    """Extracts FILE_PATH / SYMBOL / DOC_REF / KEY_TERM anchors from a query."""

    def __init__(
        self,
        mapper: RuEnTermMapper | None = None,
        canonicalizer: KeyTermCanonicalizer | None = None,
    ) -> None:
        self._mapper = mapper or RuEnTermMapper()
        self._canonicalizer = canonicalizer or KeyTermCanonicalizer()

    def extract(self, text: str) -> list[QueryAnchor]:
        """Return deduplicated anchors of all supported types found in *text*."""
        anchors = self._file_anchors(text)
        # File anchors are passed along so path fragments are not re-extracted
        # as standalone symbols.
        anchors.extend(self._symbol_anchors(text, file_anchors=anchors))
        anchors.extend(self._doc_ref_anchors(text))
        anchors.extend(self._key_term_anchors(text))
        return self._dedupe(anchors)

    def _file_anchors(self, text: str) -> list[QueryAnchor]:
        """Explicit file names score higher than bare directory-style hints."""
        anchors = self._anchors_from_matches(_FILE_PATTERN.finditer(text), anchor_type="FILE_PATH", confidence=0.95)
        anchors.extend(self._anchors_from_matches(_PATH_HINT_PATTERN.finditer(text), anchor_type="FILE_PATH", confidence=0.8))
        return anchors

    def _symbol_anchors(self, text: str, *, file_anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Identifier-like tokens that are not keywords or parts of known paths."""
        anchors: list[QueryAnchor] = []
        path_ranges = [(anchor.span.start, anchor.span.end) for anchor in file_anchors if anchor.span is not None]
        path_segments = self._path_segments(file_anchors)
        for match in _SYMBOL_PATTERN.finditer(text):
            value = match.group("value")
            # File-like tokens are handled by the FILE_PATH extractor instead.
            if value.endswith((".py", ".md")) or "/" in value:
                continue
            if self._is_inside_path(match.start("value"), match.end("value"), path_ranges):
                continue
            if self._is_keyword(value):
                continue
            if file_anchors and value.lower() in path_segments:
                continue
            anchors.append(self._anchor("SYMBOL", value, match.start("value"), match.end("value"), 0.88, source="user_text"))
        return anchors

    def _doc_ref_anchors(self, text: str) -> list[QueryAnchor]:
        """Doc references: explicit section mentions plus doc-file names."""
        anchors = self._anchors_from_matches(_DOC_SECTION_PATTERN.finditer(text), anchor_type="DOC_REF", confidence=0.75, subtype="section")
        for match in _FILE_PATTERN.finditer(text):
            value = match.group("value")
            # Only documentation-style extensions become DOC_REF anchors.
            if not value.lower().endswith((".md", ".rst", ".txt")):
                continue
            anchors.append(self._anchor("DOC_REF", value, match.start("value"), match.end("value"), 0.92, subtype="file", source="user_text"))
        return anchors

    def _key_term_anchors(self, text: str) -> list[QueryAnchor]:
        """Domain key terms; canonical form is preferred over the raw token."""
        literals = set(self._mapper.all_literal_terms())
        anchors: list[QueryAnchor] = []
        for token in _WORD_RE.finditer(text):
            value = token.group(0)
            normalized = value.lower()
            canonical = self._canonicalizer.canonicalize(value)
            # Keep the token only when it canonicalizes or is a known literal.
            if canonical is None and normalized not in literals:
                continue
            anchors.append(
                self._anchor(
                    "KEY_TERM",
                    canonical or value,
                    token.start(),
                    token.end(),
                    0.9,
                    source="user_text",
                )
            )
        return anchors

    def _anchors_from_matches(
        self,
        matches,
        *,
        anchor_type: str,
        confidence: float,
        subtype: str | None = None,
    ) -> list[QueryAnchor]:
        """Turn regex matches (with a ``value`` group) into anchors."""
        return [
            self._anchor(anchor_type, match.group("value"), match.start("value"), match.end("value"), confidence, subtype=subtype)
            for match in matches
        ]

    def _anchor(
        self,
        anchor_type: str,
        value: str,
        start: int,
        end: int,
        confidence: float,
        subtype: str | None = None,
        source: str = "user_text",
    ) -> QueryAnchor:
        """Construct a QueryAnchor with its span in the original text."""
        return QueryAnchor(
            type=anchor_type,
            value=value,
            subtype=subtype,
            source=source,
            span=AnchorSpan(start=start, end=end),
            confidence=confidence,
        )

    def _dedupe(self, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Deduplicate by (type, value, subtype, source), keeping first wins."""
        result: list[QueryAnchor] = []
        seen: set[tuple[str, str, str | None, str]] = set()
        for anchor in anchors:
            key = (anchor.type, anchor.value, anchor.subtype, anchor.source)
            if key in seen:
                continue
            seen.add(key)
            result.append(anchor)
        return result

    def _is_inside_path(self, start: int, end: int, ranges: list[tuple[int, int]]) -> bool:
        """True when [start, end) lies fully inside any matched path range."""
        return any(start >= left and end <= right for left, right in ranges)

    def _is_keyword(self, token: str) -> bool:
        """True for Python keywords (case-insensitive)."""
        return token.lower() in PY_KEYWORDS

    def _path_segments(self, anchors: list[QueryAnchor]) -> set[str]:
        """Lowercase segments of all path anchors plus common path segments."""
        values: set[str] = set()
        for anchor in anchors:
            parts = re.split(r"[/.]+", anchor.value.lower())
            for part in parts:
                if not part:
                    continue
                values.add(part)
        return values | COMMON_PATH_SEGMENTS

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.models import QueryAnchor
class AnchorSpanValidator:
    """Drops or repairs anchor spans that do not fit the raw query text."""

    def sanitize(self, anchors: list[QueryAnchor], raw_len: int) -> list[QueryAnchor]:
        """Return anchors with invalid spans cleared.

        Non-user anchors never keep a span; user anchors with out-of-bounds
        spans lose the span and have their confidence halved.
        """
        sanitized: list[QueryAnchor] = []
        for anchor in anchors:
            if anchor.source != "user_text":
                # Carried-over anchors have no position in the current query.
                sanitized.append(anchor.model_copy(update={"span": None}))
            elif anchor.span is None:
                sanitized.append(anchor)
            else:
                start, end = int(anchor.span.start), int(anchor.span.end)
                if 0 <= start < end <= raw_len:
                    sanitized.append(anchor)
                else:
                    # Out-of-bounds span: drop it and penalize confidence.
                    sanitized.append(
                        anchor.model_copy(update={"span": None, "confidence": max(anchor.confidence * 0.5, 0.0)})
                    )
        return sanitized

View File

@@ -0,0 +1,113 @@
from __future__ import annotations
import json
import re
from app.modules.rag.intent_router_v2.models import ConversationState, IntentDecision
from app.modules.rag.intent_router_v2.protocols import TextGenerator
from app.modules.rag.intent_router_v2.test_signals import has_test_focus
_CODE_FILE_PATH_RE = re.compile(
r"\b(?:[\w.-]+/)*[\w.-]+\.(?:py|js|jsx|ts|tsx|java|kt|go|rb|php|c|cc|cpp|h|hpp|cs|swift|rs)(?!\w)\b",
re.IGNORECASE,
)
class IntentClassifierV2:
    """Maps a user query to one of the four MVP intents.

    Precedence: deterministic marker rules, then an optional LLM classifier,
    then the PROJECT_MISC fallback.
    """

    _GENERATE_DOCS_MARKERS = (
        "сгенерируй документац",
        "подготовь документац",
        "создай документац",
        "генерац",
        "generate documentation",
        "write documentation",
    )
    _DOCS_MARKERS = ("документац", "readme", "docs/", ".md", "spec", "runbook", "markdown")
    _CODE_MARKERS = ("по коду", "код", "класс", "метод", "функц", "модул", "пакет", "файл", "block", "блок", "handler", "endpoint")

    def __init__(self, llm: TextGenerator | None = None) -> None:
        # Without an LLM only the deterministic rules apply.
        self._llm = llm

    def classify(self, user_query: str, conversation_state: ConversationState) -> IntentDecision:
        """Return an intent decision; never raises on LLM failure."""
        deterministic = self._deterministic(user_query)
        if deterministic:
            return deterministic
        llm_decision = self._classify_with_llm(user_query, conversation_state)
        if llm_decision:
            return llm_decision
        return IntentDecision(intent="PROJECT_MISC", confidence=0.55, reason="fallback_project_misc")

    def _deterministic(self, user_query: str) -> IntentDecision | None:
        """Marker-based rules; doc-generation markers have top priority."""
        text = " ".join((user_query or "").lower().split())
        if any(marker in text for marker in self._GENERATE_DOCS_MARKERS):
            return IntentDecision(intent="GENERATE_DOCS_FROM_CODE", confidence=0.97, reason="deterministic_generate_docs")
        if self._looks_like_docs_question(text):
            return IntentDecision(intent="DOCS_QA", confidence=0.9, reason="deterministic_docs")
        if self._looks_like_code_question(user_query, text):
            return IntentDecision(intent="CODE_QA", confidence=0.9, reason="deterministic_code")
        return None

    def _classify_with_llm(self, user_query: str, conversation_state: ConversationState) -> IntentDecision | None:
        """Ask the LLM for an intent; any failure degrades to None."""
        if self._llm is None:
            return None
        payload = json.dumps(
            {
                "message": user_query,
                "active_intent": conversation_state.active_intent,
                "last_query": conversation_state.last_query,
                "allowed_intents": ["CODE_QA", "DOCS_QA", "GENERATE_DOCS_FROM_CODE", "PROJECT_MISC"],
            },
            ensure_ascii=False,
        )
        try:
            raw = self._llm.generate("rag_intent_router_v2", payload, log_context="rag.intent_router_v2.classify").strip()
        except Exception:
            # Deliberate best-effort: LLM errors must not break routing.
            return None
        parsed = self._parse(raw)
        if parsed is None:
            return None
        return parsed

    def _parse(self, raw: str) -> IntentDecision | None:
        """Parse a (possibly code-fenced) JSON reply; None when invalid."""
        candidate = self._strip_code_fence(raw)
        try:
            payload = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        intent = str(payload.get("intent") or "").strip().upper()
        # Reject anything outside the closed set of MVP intents.
        if intent not in {"CODE_QA", "DOCS_QA", "GENERATE_DOCS_FROM_CODE", "PROJECT_MISC"}:
            return None
        return IntentDecision(
            intent=intent,
            confidence=float(payload.get("confidence") or 0.7),
            reason=str(payload.get("reason") or "llm").strip() or "llm",
        )

    def _strip_code_fence(self, text: str) -> str:
        """Remove a surrounding ``` fence when present; otherwise return as-is."""
        if not text.startswith("```"):
            return text
        lines = text.splitlines()
        if len(lines) < 3 or lines[-1].strip() != "```":
            return text
        return "\n".join(lines[1:-1]).strip()

    def _looks_like_docs_question(self, text: str) -> bool:
        """Docs markers count only when no code file path is mentioned."""
        if self._has_code_file_path(text):
            return False
        return any(marker in text for marker in self._DOCS_MARKERS)

    def _looks_like_code_question(self, raw_text: str, lowered: str) -> bool:
        """Code heuristics: file paths, test focus, markers, identifiers, calls."""
        if self._has_code_file_path(raw_text):
            return True
        if has_test_focus(lowered):
            return True
        # Docs markers without any code markers means "not a code question".
        if any(marker in lowered for marker in self._DOCS_MARKERS) and not any(marker in lowered for marker in self._CODE_MARKERS):
            return False
        if any(marker in lowered for marker in self._CODE_MARKERS):
            return True
        # CamelCase / dotted identifiers suggest a code symbol.
        if re.search(r"\b[A-Z][A-Za-z0-9_]{2,}(?:\.[A-Za-z_][A-Za-z0-9_]*)*\b", raw_text or ""):
            return True
        # A snake_case token followed by "(" looks like a function call.
        return bool(re.search(r"\b[a-z_][A-Za-z0-9_]{2,}\(", raw_text or ""))

    def _has_code_file_path(self, text: str) -> bool:
        """True when the text mentions a source-code file path."""
        return bool(_CODE_FILE_PATH_RE.search(text or ""))

View File

@@ -0,0 +1,61 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.followup_detector import FollowUpDetector
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor
class ConversationAnchorBuilder:
    """Carries anchors over from conversation state for follow-up turns."""

    def __init__(self, followup: FollowUpDetector | None = None) -> None:
        self._followup = followup or FollowUpDetector()

    def build(
        self,
        raw: str,
        state: ConversationState,
        *,
        continue_mode: bool,
        has_user_symbol: bool,
        has_user_file_path: bool,
    ) -> list[QueryAnchor]:
        """Return carry-over anchors; explicit user anchors always win.

        Path scope is carried only when the user gave no explicit file path;
        the active symbol is carried only for follow-up-looking queries that
        mention no explicit symbol.
        """
        if not continue_mode:
            return []
        anchors: list[QueryAnchor] = []
        if has_user_file_path:
            # The user named a file themselves — nothing to carry over.
            return anchors
        for path in self._paths_for_carryover(state.active_path_scope):
            anchors.append(
                QueryAnchor(
                    type="FILE_PATH",
                    value=path,
                    source="conversation_state",
                    span=None,
                    confidence=0.6,
                )
            )
        if has_user_symbol:
            return anchors
        if not self._followup.is_follow_up(raw):
            return anchors
        # Prefer the explicitly active symbol over symbols from the last span.
        symbol = state.active_symbol or (state.active_code_span_symbols[0] if state.active_code_span_symbols else None)
        if symbol:
            anchors.append(
                QueryAnchor(
                    type="SYMBOL",
                    value=symbol,
                    source="conversation_state",
                    span=None,
                    confidence=0.64,
                )
            )
        return anchors

    def _paths_for_carryover(self, active_path_scope: list[str]) -> list[str]:
        """Prefer a single concrete file path; otherwise the first scope entry."""
        paths = list(active_path_scope or [])
        file_paths = [path for path in paths if self._looks_like_file(path)]
        if file_paths:
            return file_paths[:1]
        return paths[:1]

    def _looks_like_file(self, value: str) -> bool:
        """Heuristic: the last path segment contains a dot (an extension)."""
        tail = (value or "").rsplit("/", 1)[-1]
        return "." in tail

View File

@@ -0,0 +1,45 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.models import ConversationState, IntentDecision
class ConversationPolicy:
    """Decides whether to keep the active intent or switch to the new one."""

    _SWITCH_MARKERS = (
        "теперь",
        "а теперь",
        "давай теперь",
        "переключ",
        "new task",
        "switch to",
        "instead",
    )
    _DOCS_SIGNALS = ("документац", "readme", "docs/", ".md")
    _CODE_SIGNALS = ("по коду", "класс", "метод", "файл", "блок кода", "function", "class")

    def resolve(self, decision: IntentDecision, user_query: str, conversation_state: ConversationState) -> tuple[str, str]:
        """Return ``(intent, conversation_mode)`` given the classifier decision."""
        active = conversation_state.active_intent
        if active is None:
            # No ongoing topic: accept the classifier verdict.
            return decision.intent, "START"
        if active == decision.intent:
            return active, "CONTINUE"
        wants_switch = self._has_explicit_switch(user_query) or self._is_hard_mismatch(
            active, decision.intent, user_query
        )
        if wants_switch:
            return decision.intent, "SWITCH"
        # Default policy: stay on the current topic.
        return active, "CONTINUE"

    def _has_explicit_switch(self, user_query: str) -> bool:
        """True when the user explicitly signals a topic change."""
        normalized = " ".join((user_query or "").lower().split())
        return any(marker in normalized for marker in self._SWITCH_MARKERS)

    def _is_hard_mismatch(self, active_intent: str, candidate_intent: str, user_query: str) -> bool:
        """True when the new query clearly contradicts the active intent."""
        if active_intent == candidate_intent:
            return False
        normalized = " ".join((user_query or "").lower().split())
        if candidate_intent == "GENERATE_DOCS_FROM_CODE":
            # Generation requests always override the current topic.
            return True
        if candidate_intent == "DOCS_QA":
            return any(signal in normalized for signal in self._DOCS_SIGNALS)
        if candidate_intent == "CODE_QA" and active_intent == "DOCS_QA":
            return any(signal in normalized for signal in self._CODE_SIGNALS)
        return False

View File

@@ -0,0 +1,28 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.models import EvidencePolicy
class EvidencePolicyFactory:
    """Builds the evidence requirements for each intent/sub-intent pair."""

    def build(
        self,
        intent: str,
        *,
        sub_intent: str = "EXPLAIN",
        negations: list[str] | None = None,
        has_user_anchor: bool = True,
    ) -> EvidencePolicy:
        """Return the evidence policy for *intent*; unknown intents are lenient."""
        if intent == "CODE_QA":
            return self._code_policy(sub_intent, set(negations or []), has_user_anchor)
        if intent == "DOCS_QA":
            return EvidencePolicy(require_def=False, require_flow=False, require_spec=True, allow_answer_without_evidence=False)
        if intent == "GENERATE_DOCS_FROM_CODE":
            return EvidencePolicy(require_def=True, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        # PROJECT_MISC (and anything unrecognized) may answer without evidence.
        return EvidencePolicy(require_def=False, require_flow=False, require_spec=False, allow_answer_without_evidence=True)

    def _code_policy(self, sub_intent: str, negations: set[str], has_user_anchor: bool) -> EvidencePolicy:
        """CODE_QA policy varies by sub-intent and explicit test negations."""
        if sub_intent == "OPEN_FILE":
            return EvidencePolicy(require_def=False, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        if sub_intent == "EXPLAIN_LOCAL":
            return EvidencePolicy(require_def=True, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        if "tests" in negations and not has_user_anchor:
            # "Not about tests" without an anchor: do not demand a flow trace.
            return EvidencePolicy(require_def=True, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        return EvidencePolicy(require_def=True, require_flow=True, require_spec=False, allow_answer_without_evidence=False)

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
from app.modules.agent.llm import AgentLlmService
from app.modules.agent.prompt_loader import PromptLoader
from app.modules.rag.intent_router_v2.classifier import IntentClassifierV2
from app.modules.rag.intent_router_v2.router import IntentRouterV2
from app.modules.shared.env_loader import load_workspace_env
from app.modules.shared.gigachat.client import GigaChatClient
from app.modules.shared.gigachat.settings import GigaChatSettings
from app.modules.shared.gigachat.token_provider import GigaChatTokenProvider
class GigaChatIntentRouterFactory:
    """Wires an IntentRouterV2 on top of the GigaChat client stack."""

    def build(self) -> IntentRouterV2:
        """Assemble a router from env-driven GigaChat settings and a prompt-backed LLM."""
        # Make workspace .env values visible before settings are read from env.
        load_workspace_env()
        settings = GigaChatSettings.from_env()
        token_provider = GigaChatTokenProvider(settings)
        client = GigaChatClient(settings, token_provider)
        prompt_loader = PromptLoader()
        llm = AgentLlmService(client=client, prompts=prompt_loader)
        classifier = IntentClassifierV2(llm=llm)
        # Router defaults fill in all other collaborators.
        return IntentRouterV2(classifier=classifier)

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
class FollowUpDetector:
    """Heuristically decides whether a message continues the previous topic."""

    _MARKERS = (
        "что дальше",
        "почему",
        "зачем",
        "а что",
        "уточни",
        "подробнее",
        "как именно",
        "покажи подробнее",
    )

    def is_follow_up(self, raw: str) -> bool:
        """True for short messages (<= 4 words) or ones containing a follow-up marker."""
        words = (raw or "").lower().split()
        if not words:
            return False
        if len(words) <= 4:
            # Very short queries are assumed to lean on prior context.
            return True
        normalized = " ".join(words)
        return any(marker in normalized for marker in self._MARKERS)

View File

@@ -0,0 +1,13 @@
from __future__ import annotations
class GraphIdResolver:
    """Maps a routed intent to the identifier of the graph that handles it."""

    _GRAPH_MAP = {
        "CODE_QA": "CodeQAGraph",
        "DOCS_QA": "DocsQAGraph",
        "GENERATE_DOCS_FROM_CODE": "GenerateDocsFromCodeGraph",
        "PROJECT_MISC": "ProjectMiscGraph",
    }

    def resolve(self, intent: str) -> str:
        """Return the graph id for *intent*; raises KeyError for unknown intents."""
        graph_id = self._GRAPH_MAP.get(intent)
        if graph_id is None:
            raise KeyError(intent)
        return graph_id

View File

@@ -0,0 +1,34 @@
from __future__ import annotations
import re
from app.modules.rag.intent_router_v2.normalization import FILE_PATH_RE
from app.modules.rag.intent_router_v2.symbol_rules import COMMON_PATH_SEGMENTS, PY_KEYWORDS
_IDENTIFIER_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]{2,}")


class KeywordHintBuilder:
    """Derives keyword hints (identifiers and file paths) from query text."""

    def build(self, text: str) -> list[str]:
        """Return up to 12 unique hint tokens extracted from *text*."""
        source = text or ""
        segments = self._path_segments(source)
        hints: list[str] = []
        for token in _IDENTIFIER_RE.findall(source):
            lowered = token.lower()
            # Skip language keywords and tokens that are just path pieces.
            if lowered in PY_KEYWORDS or lowered in segments:
                continue
            if token not in hints:
                hints.append(token)
        for match in FILE_PATH_RE.finditer(source):
            path_token = match.group(0).lower()
            if path_token not in hints:
                hints.append(path_token)
        return hints[:12]

    def _path_segments(self, text: str) -> set[str]:
        """Lowercase path segments seen in *text*, plus the common segment list."""
        segments: set[str] = set(COMMON_PATH_SEGMENTS)
        for match in FILE_PATH_RE.finditer(text or ""):
            for segment in re.split(r"[/.]+", match.group(0).lower()):
                if segment:
                    segments.add(segment)
        return segments

View File

@@ -0,0 +1,50 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.models import QueryAnchor
class KeywordHintSanitizer:
    """Filters anchors and base hints down to at most eight high-signal keywords."""

    _GENERIC_KEY_TERMS = {"файл", "класс", "метод", "функция", "документация"}
    _DOMAIN_ALLOWLIST = {"RAG", "API", "HTTP", "SQL"}
    _DIR_SCOPE_MARKERS = ("в папке", "в директории", "в каталоге")

    def sanitize(self, raw: str, anchors: list[QueryAnchor], base_hints: list[str]) -> list[str]:
        """Keep file/symbol anchors and non-generic hints, dropping directory noise."""
        lowered_raw = (raw or "").lower()
        dirs_allowed = any(marker in lowered_raw for marker in self._DIR_SCOPE_MARKERS)
        file_paths = [
            anchor.value
            for anchor in anchors
            if anchor.type == "FILE_PATH" and self._looks_like_file(anchor.value)
        ]
        known_dirs = {path.rsplit("/", 1)[0] for path in file_paths if "/" in path}
        has_file_context = bool(known_dirs or file_paths)
        selected: list[str] = []
        for anchor in anchors:
            if anchor.type == "FILE_PATH":
                # Bare directories are dropped when concrete files already scope the query.
                if self._looks_like_directory(anchor.value) and not dirs_allowed and has_file_context:
                    continue
                self._append(selected, anchor.value)
            elif anchor.type == "SYMBOL":
                self._append(selected, anchor.value)
        for token in base_hints:
            if token in self._DOMAIN_ALLOWLIST:
                self._append(selected, token)
                continue
            if token.lower() in self._GENERIC_KEY_TERMS:
                continue
            if not dirs_allowed:
                if token in known_dirs:
                    continue
                if "/" in token and "." not in token and file_paths:
                    continue
            self._append(selected, token)
        return selected[:8]

    def _append(self, values: list[str], candidate: str) -> None:
        """Append *candidate* unless empty or already present."""
        if not candidate or candidate in values:
            return
        values.append(candidate)

    def _looks_like_file(self, value: str) -> bool:
        """A value whose last path segment contains a dot is treated as a file."""
        last_segment = (value or "").rsplit("/", 1)[-1]
        return "." in last_segment

    def _looks_like_directory(self, value: str) -> bool:
        """A slash-containing value without a file-like tail is a directory."""
        return "/" in (value or "") and not self._looks_like_file(value)

View File

@@ -0,0 +1,29 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.models import LayerQuery, RepoContext
class LayerQueryBuilder:
    """Builds per-layer retrieval queries, honoring domain and availability limits."""

    def build(self, intent: str, repo_context: RepoContext, *, domains: list[str], layers_map: dict[str, list[tuple[str, int]]]) -> list[LayerQuery]:
        """Return layer queries for *intent*, narrowed by domains and repo availability."""
        configured = layers_map[intent]
        available = set(repo_context.available_layers or [])
        domain_matched = [
            (layer_id, top_k)
            for layer_id, top_k in configured
            if self._layer_matches_domains(layer_id, domains)
        ]
        narrowed = [
            LayerQuery(layer_id=layer_id, top_k=top_k)
            for layer_id, top_k in domain_matched
            if not available or layer_id in available
        ]
        if narrowed:
            return narrowed
        # Availability filtering emptied the set: fall back to the domain-matched layers.
        return [LayerQuery(layer_id=layer_id, top_k=top_k) for layer_id, top_k in domain_matched]

    def _layer_matches_domains(self, layer_id: str, domains: list[str]) -> bool:
        """Layer ids starting with "C" are code layers, "D" are docs layers."""
        if domains == ["CODE"]:
            return layer_id.startswith("C")
        if domains == ["DOCS"]:
            return layer_id.startswith("D")
        return layer_id.startswith(("C", "D"))

View File

@@ -0,0 +1,25 @@
from __future__ import annotations
import logging
from app.modules.rag.intent_router_v2.models import ConversationState, IntentRouterResult, RepoContext
from app.modules.rag.intent_router_v2.router import IntentRouterV2
LOGGER = logging.getLogger(__name__)


class IntentRouterScenarioRunner:
    """Replays a scripted sequence of user queries through the router for local runs."""

    def __init__(self, router: IntentRouterV2) -> None:
        self._router = router

    def run(self, queries: list[str], repo_context: RepoContext | None = None) -> list[IntentRouterResult]:
        """Route each query in order, advancing conversation state between turns.

        Returns one router result per query. Logging is at WARNING level so the
        trace is visible under the default logging configuration.
        """
        state = ConversationState()
        context = repo_context or RepoContext()
        results: list[IntentRouterResult] = []
        for index, user_query in enumerate(queries, start=1):
            LOGGER.warning("intent router local input: turn=%s user_query=%s", index, user_query)
            result = self._router.route(user_query, state, context)
            # Fix: pydantic v2 model_dump_json() has no ensure_ascii parameter
            # (passing it raises TypeError); v2 emits non-ASCII text unescaped
            # by default, which is what ensure_ascii=False intended.
            LOGGER.warning("intent router local output: turn=%s result=%s", index, result.model_dump_json())
            results.append(result)
            state = state.advance(result)
        return results

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
import logging
from app.modules.rag.intent_router_v2.models import ConversationState, IntentRouterResult, RepoContext
LOGGER = logging.getLogger(__name__)


class IntentRouterLogger:
    """Structured WARNING-level logging of router inputs and outputs."""

    def log_request(self, user_query: str, conversation_state: ConversationState, repo_context: RepoContext) -> None:
        """Log the incoming query together with conversation and repo context."""
        LOGGER.warning(
            "intent router v2 request: turn=%s active_intent=%s user_query=%s languages=%s domains=%s",
            conversation_state.turn_index + 1,
            conversation_state.active_intent,
            " ".join((user_query or "").split()),
            repo_context.languages,
            repo_context.available_domains,
        )

    def log_result(self, result: IntentRouterResult) -> None:
        """Log the routing decision as a single JSON line."""
        # Fix: pydantic v2 model_dump_json() accepts no ensure_ascii keyword
        # (passing it raises TypeError); v2 keeps non-ASCII characters
        # unescaped by default, matching the original intent.
        LOGGER.warning("intent router v2 result: %s", result.model_dump_json())

View File

@@ -0,0 +1,182 @@
from __future__ import annotations
import re
from typing import Literal
from pydantic import BaseModel, ConfigDict, Field, field_validator
# Closed vocabularies shared by the router's pydantic models.
IntentType = Literal["CODE_QA", "DOCS_QA", "GENERATE_DOCS_FROM_CODE", "PROJECT_MISC"]
ConversationMode = Literal["START", "CONTINUE", "SWITCH"]
AnchorType = Literal["FILE_PATH", "SYMBOL", "DOC_REF", "KEY_TERM"]
AnchorSource = Literal["user_text", "conversation_state", "heuristic"]
# Inline `code` spans and identifier-like tokens (3+ chars) found inside them.
_INLINE_CODE_RE = re.compile(r"`([^`]*)`")
_CODE_SYMBOL_RE = re.compile(r"\b([A-Za-z_][A-Za-z0-9_]{2,})\b")
class AnchorSpan(BaseModel):
    """Character span of an anchor inside the raw query text.

    start/end are presumably offsets into the raw user string — confirm
    against AnchorExtractor; defaults (0, 0) mark an unknown span.
    """

    model_config = ConfigDict(extra="forbid")
    start: int = 0
    end: int = 0
class QueryAnchor(BaseModel):
    """A single extracted reference: file path, symbol, doc ref or key term."""

    model_config = ConfigDict(extra="forbid")
    type: AnchorType
    value: str
    # Where the anchor came from: user text, carried-over state, or a heuristic.
    source: AnchorSource = "user_text"
    subtype: str | None = None
    span: AnchorSpan | None = None
    confidence: float = 0.0

    @field_validator("confidence")
    @classmethod
    def clamp_confidence(cls, value: float) -> float:
        """Clamp confidence into [0.0, 1.0] instead of rejecting out-of-range input."""
        return max(0.0, min(1.0, float(value)))
class QueryPlan(BaseModel):
    """Per-turn derived query artifacts handed to retrieval and graph execution."""

    model_config = ConfigDict(extra="forbid")
    raw: str
    normalized: str
    sub_intent: str = "EXPLAIN"
    # Topics the user explicitly excluded (e.g. "tests").
    negations: list[str] = Field(default_factory=list)
    # Term expansions (RU->EN mappings and symbol names) to widen retrieval.
    expansions: list[str] = Field(default_factory=list)
    keyword_hints: list[str] = Field(default_factory=list)
    anchors: list[QueryAnchor] = Field(default_factory=list)
class LayerQuery(BaseModel):
    """One retrieval request: a RAG layer id plus its top-k result budget."""

    model_config = ConfigDict(extra="forbid")
    layer_id: str
    top_k: int
class CodeRetrievalFilters(BaseModel):
    """Filters applied when retrieval targets code layers only."""

    model_config = ConfigDict(extra="forbid")
    # "EXCLUDE" or "INCLUDE" test files (see RetrievalFilterBuilder._test_policy).
    test_policy: str = "EXCLUDE"
    path_scope: list[str] = Field(default_factory=list)
    language: list[str] = Field(default_factory=list)
class DocsRetrievalFilters(BaseModel):
    """Filters applied when retrieval targets documentation layers only."""

    model_config = ConfigDict(extra="forbid")
    path_scope: list[str] = Field(default_factory=list)
    doc_kinds: list[str] = Field(default_factory=list)
    doc_language: list[str] = Field(default_factory=list)
class HybridRetrievalFilters(BaseModel):
    """Union of code and docs filters, used when both domains are queried."""

    model_config = ConfigDict(extra="forbid")
    test_policy: str = "EXCLUDE"
    path_scope: list[str] = Field(default_factory=list)
    language: list[str] = Field(default_factory=list)
    doc_kinds: list[str] = Field(default_factory=list)
    doc_language: list[str] = Field(default_factory=list)
class RetrievalSpec(BaseModel):
    """Complete retrieval instruction: domains, layer queries, filters, reranking."""

    model_config = ConfigDict(extra="forbid")
    domains: list[str] = Field(default_factory=list)
    layer_queries: list[LayerQuery] = Field(default_factory=list)
    filters: CodeRetrievalFilters | DocsRetrievalFilters | HybridRetrievalFilters = Field(default_factory=CodeRetrievalFilters)
    rerank_profile: str = ""
class EvidencePolicy(BaseModel):
    """Which evidence kinds an answer must cite before it may be produced."""

    model_config = ConfigDict(extra="forbid")
    require_def: bool = False
    require_flow: bool = False
    require_spec: bool = False
    # When True the graph may answer even if no evidence was retrieved.
    allow_answer_without_evidence: bool = False
class IntentRouterResult(BaseModel):
    """Full routing decision produced for a single user turn."""

    model_config = ConfigDict(extra="forbid")
    schema_version: str = "1.1"
    intent: IntentType
    graph_id: str
    conversation_mode: ConversationMode
    query_plan: QueryPlan
    retrieval_spec: RetrievalSpec
    evidence_policy: EvidencePolicy
class ConversationState(BaseModel):
    """Rolling multi-turn state the router carries between user queries."""

    model_config = ConfigDict(extra="forbid")
    # Intent/domain resolved on the previous turn (None before the first turn).
    active_intent: IntentType | None = None
    active_domain: str | None = None
    # Anchors taken from the user's own text on the previous turn.
    active_anchors: list[QueryAnchor] = Field(default_factory=list)
    active_symbol: str | None = None
    active_path_scope: list[str] = Field(default_factory=list)
    active_code_span_symbols: list[str] = Field(default_factory=list)
    last_query: str = ""
    turn_index: int = 0

    def advance(self, result: IntentRouterResult) -> "ConversationState":
        """Return the next-turn state derived from *result*; self is not mutated."""
        user_anchors = [anchor for anchor in result.query_plan.anchors if anchor.source == "user_text"]
        symbol_candidates = [anchor.value for anchor in user_anchors if anchor.type == "SYMBOL"]
        has_user_file_anchor = any(anchor.type == "FILE_PATH" for anchor in user_anchors)
        # The most recent user-provided symbol wins; a file mention without a
        # symbol clears the active symbol; otherwise the old one is kept.
        if symbol_candidates:
            active_symbol = symbol_candidates[-1]
        elif has_user_file_anchor:
            active_symbol = None
        else:
            active_symbol = self.active_symbol
        raw_code_symbols = _extract_code_symbols(result.query_plan.raw)
        active_code_span_symbols = raw_code_symbols or list(self.active_code_span_symbols)
        path_scope = list(getattr(result.retrieval_spec.filters, "path_scope", []) or [])
        active_domains = list(result.retrieval_spec.domains or [])
        # Only a single unambiguous domain replaces the previous one.
        active_domain = active_domains[0] if len(active_domains) == 1 else self.active_domain
        return ConversationState(
            active_intent=result.intent,
            active_domain=active_domain,
            active_anchors=list(user_anchors),
            active_symbol=active_symbol,
            active_path_scope=path_scope or list(self.active_path_scope),
            active_code_span_symbols=active_code_span_symbols,
            last_query=result.query_plan.raw,
            turn_index=self.turn_index + 1,
        )
class RepoContext(BaseModel):
    """Static facts about the indexed repository used to scope retrieval."""

    model_config = ConfigDict(extra="forbid")
    languages: list[str] = Field(default_factory=list)
    available_domains: list[str] = Field(default_factory=lambda: ["CODE", "DOCS"])
    # Empty list means "no availability information" (all layers allowed).
    available_layers: list[str] = Field(default_factory=list)
class IntentDecision(BaseModel):
    """Raw classifier output before conversation-policy resolution."""

    model_config = ConfigDict(extra="forbid")
    intent: IntentType
    confidence: float = 0.0
    reason: str = ""

    @field_validator("confidence")
    @classmethod
    def clamp_confidence(cls, value: float) -> float:
        """Clamp confidence into [0.0, 1.0] instead of rejecting out-of-range input."""
        return max(0.0, min(1.0, float(value)))
def _extract_code_symbols(raw: str) -> list[str]:
symbols: list[str] = []
for match in _INLINE_CODE_RE.finditer(raw or ""):
snippet = match.group(1)
for token in _CODE_SYMBOL_RE.findall(snippet):
if token not in symbols:
symbols.append(token)
return symbols[:8]

View File

@@ -0,0 +1,17 @@
from __future__ import annotations
import re
_TEST_NEG_RE = re.compile(
    r"(?:не\s+про\s+тест|без\s+тест|кроме\s+тест|про\s+прод\s+код|только\s+прод|production\s+code)",
    re.IGNORECASE,
)


class NegationDetector:
    """Detects negated topics (currently only tests) in user text."""

    def detect(self, text: str) -> set[str]:
        """Return the set of negated topic names mentioned in *text*."""
        found: set[str] = set()
        if _TEST_NEG_RE.search((text or "").lower()):
            found.add("tests")
        return found

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
import re
# Spans that must survive normalization untouched: code, paths, identifiers.
CODE_SPAN_RE = re.compile(r"`[^`]*`")
FILE_PATH_RE = re.compile(
    r"(?<!\w)(?:/|\./|\.\./)?(?:[\w.-]+/)*[\w.-]+\.(?:py|md|txt|rst|yaml|yml|json|toml|ini)(?!\w)",
    re.IGNORECASE,
)
DOTTED_IDENT_RE = re.compile(r"(?<!\w)(?:[A-Za-z_][A-Za-z0-9_]*\.)+[A-Za-z_][A-Za-z0-9_]*(?!\w)")
CAMEL_RE = re.compile(r"(?<!\w)[A-Z][a-z0-9]+(?:[A-Z][a-z0-9]+)+(?!\w)")
SNAKE_RE = re.compile(r"(?<!\w)[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?!\w)")
# Punctuation / whitespace fix-ups applied to the unprotected remainder.
SPACE_BEFORE_PUNCT_RE = re.compile(r"\s+([,.:;?!])")
SPACE_AFTER_PUNCT_RE = re.compile(r"([,.:;?!])(?=(?:[\"'(\[A-Za-zА-ЯЁа-яё]))")
WS_RE = re.compile(r"\s+")
# Fix: the mapping previously contained empty-string keys (mangled typographic
# quotes), and str.maketrans raises ValueError for keys that are not length-1
# characters. Restore the intended curly-quote-to-ASCII mapping.
QUOTE_TRANSLATION = str.maketrans({"«": '"', "»": '"', "\u201c": '"', "\u201d": '"', "\u2018": "'", "\u2019": "'"})
class QueryNormalizer:
    """Builds a deterministic canonical form from raw text without enrichment."""

    def normalize(self, raw: str) -> str:
        """Collapse whitespace, unify quotes and re-space punctuation.

        Code spans, file paths and identifiers are swapped for placeholders
        first so normalization never alters them, then restored at the end.
        """
        guard = _ProtectedText()
        text = self._protect(raw or "", guard)
        text = self._collapse_whitespace(text)
        text = text.translate(QUOTE_TRANSLATION)
        text = SPACE_BEFORE_PUNCT_RE.sub(r"\1", text)
        text = SPACE_AFTER_PUNCT_RE.sub(r"\1 ", text)
        return guard.restore(self._collapse_whitespace(text))

    def _protect(self, text: str, protected: "_ProtectedText") -> str:
        """Replace every protected span with a placeholder token."""
        patterns = (CODE_SPAN_RE, FILE_PATH_RE, DOTTED_IDENT_RE, CAMEL_RE, SNAKE_RE)
        for pattern in patterns:
            text = pattern.sub(protected.replace, text)
        return text

    def _collapse_whitespace(self, text: str) -> str:
        """Squeeze runs of whitespace to single spaces and trim the ends."""
        return WS_RE.sub(" ", text).strip()
class _ProtectedText:
def __init__(self) -> None:
self._items: dict[str, str] = {}
self._index = 0
def replace(self, match: re.Match[str]) -> str:
placeholder = f"@@P{self._index}@@"
self._items[placeholder] = match.group(0)
self._index += 1
return placeholder
def restore(self, text: str) -> str:
restored = text
for placeholder, value in self._items.items():
restored = restored.replace(placeholder, value)
return restored

View File

@@ -0,0 +1,48 @@
from __future__ import annotations
class KeyTermCanonicalizer:
    """Normalizes inflected Russian key terms to a canonical dictionary form."""

    _ALIASES: dict[str, set[str]] = {
        "файл": {
            "файл",
            "файла",
            "файле",
            "файлу",
            "файлом",
            "файлы",
            "файлов",
            "файлам",
            "файлами",
        },
        "класс": {"класс", "класса", "классе", "классу", "классом", "классы", "классов", "классам"},
        "функция": {"функция", "функции", "функцию", "функцией", "функциях"},
        "метод": {"метод", "метода", "методе", "методу", "методом", "методы"},
        "документация": {"документация", "документации", "документацию"},
        "тест": {"тест", "тесты", "тестов", "тестам", "тестами", "юнит-тест", "юниттест"},
        "модуль": {"модуль", "модуля"},
        "пакет": {"пакет"},
    }

    def __init__(self) -> None:
        # Flat lookup table: every alias (and canonical form) maps to its canonical term.
        self._token_to_canonical = self._build_index()

    def canonicalize(self, token: str) -> str | None:
        """Return the canonical form of *token*, or None if it is not a known term."""
        lowered = (token or "").lower()
        return self._token_to_canonical.get(lowered)

    def aliases(self) -> set[str]:
        """Return the union of all known alias forms."""
        result: set[str] = set()
        for alias_forms in self._ALIASES.values():
            result |= alias_forms
        return result

    def is_test_term(self, token: str) -> bool:
        """True when *token* canonicalizes to the test term."""
        return self.canonicalize(token) == "тест"

    def _build_index(self) -> dict[str, str]:
        """Invert the alias table into an alias-to-canonical dictionary."""
        index: dict[str, str] = {}
        for canonical, alias_forms in self._ALIASES.items():
            index[canonical] = canonical
            for alias in alias_forms:
                index[alias] = canonical
        return index

View File

@@ -0,0 +1,7 @@
from __future__ import annotations
from typing import Protocol
class TextGenerator(Protocol):
    """Structural interface for prompt-based text generation backends."""

    def generate(self, prompt_name: str, user_input: str, *, log_context: str | None = None) -> str: ...

View File

@@ -0,0 +1,3 @@
from app.modules.rag.intent_router_v2.normalization import QueryNormalizer
__all__ = ["QueryNormalizer"]

View File

@@ -0,0 +1,223 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.anchor_extractor import AnchorExtractor
from app.modules.rag.intent_router_v2.anchor_span_validator import AnchorSpanValidator
from app.modules.rag.intent_router_v2.conversation_anchor_builder import ConversationAnchorBuilder
from app.modules.rag.intent_router_v2.keyword_hint_builder import KeywordHintBuilder
from app.modules.rag.intent_router_v2.keyword_hint_sanitizer import KeywordHintSanitizer
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor, QueryPlan
from app.modules.rag.intent_router_v2.negation_detector import NegationDetector
from app.modules.rag.intent_router_v2.normalization import QueryNormalizer
from app.modules.rag.intent_router_v2.sub_intent_detector import SubIntentDetector
from app.modules.rag.intent_router_v2.test_signals import has_test_focus, is_negative_test_request, is_test_related_token
from app.modules.rag.intent_router_v2.term_mapping import RuEnTermMapper
class QueryPlanBuilder:
    """Assembles a QueryPlan (normalization, anchors, hints, sub-intent) for one turn."""

    # Markers used to refine the sub-intent for "why" / "next step" follow-ups.
    _WHY_MARKERS = ("почему", "зачем", "откуда", "из-за чего")
    _NEXT_STEP_MARKERS = ("что дальше", "дальше что", "и что теперь", "продолжай")
    def __init__(
        self,
        normalizer: QueryNormalizer | None = None,
        extractor: AnchorExtractor | None = None,
        mapper: RuEnTermMapper | None = None,
        keyword_hints: KeywordHintBuilder | None = None,
        keyword_hint_sanitizer: KeywordHintSanitizer | None = None,
        carryover: ConversationAnchorBuilder | None = None,
        span_validator: AnchorSpanValidator | None = None,
        sub_intent_detector: SubIntentDetector | None = None,
        negation_detector: NegationDetector | None = None,
    ) -> None:
        """All collaborators are optional; defaults are constructed when omitted."""
        self._normalizer = normalizer or QueryNormalizer()
        self._extractor = extractor or AnchorExtractor()
        self._mapper = mapper or RuEnTermMapper()
        self._keyword_hints_builder = keyword_hints or KeywordHintBuilder()
        self._keyword_hint_sanitizer = keyword_hint_sanitizer or KeywordHintSanitizer()
        self._carryover = carryover or ConversationAnchorBuilder()
        self._span_validator = span_validator or AnchorSpanValidator()
        self._sub_intent_detector = sub_intent_detector or SubIntentDetector()
        self._negation_detector = negation_detector or NegationDetector()
    def build(
        self,
        user_query: str,
        conversation_state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str = "START",
        intent: str = "PROJECT_MISC",
    ) -> QueryPlan:
        """Build the per-turn QueryPlan: normalize, extract/merge anchors, derive hints."""
        raw = user_query or ""
        normalized = self._normalizer.normalize(raw)
        # Fall back to the raw text if normalization emptied a non-blank query.
        if not normalized and raw.strip():
            normalized = raw
        negations = self._negation_detector.detect(normalized)
        user_anchors = self._span_validator.sanitize(self._extractor.extract(raw), len(raw))
        has_file_path = any(anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in user_anchors)
        sub_intent = self._sub_intent_detector.detect(raw, has_file_path=has_file_path, negations=negations)
        merged_anchors = self._merge_anchors(
            raw,
            user_anchors,
            conversation_state,
            continue_mode,
            conversation_mode=conversation_mode,
            intent=intent,
        )
        skip_tests = "tests" in negations or is_negative_test_request(raw)
        cleaned_anchors = self._remove_negated_test_terms(skip_tests, merged_anchors)
        sub_intent = self._resolve_sub_intent(sub_intent, raw, cleaned_anchors, intent=intent, negations=negations)
        # Docs questions never carry a code-oriented sub-intent.
        if intent == "DOCS_QA":
            sub_intent = "EXPLAIN"
        expansions = self._expansions(normalized, cleaned_anchors, skip_tests=skip_tests)
        keyword_hints = self._keyword_hints(
            raw,
            normalized,
            cleaned_anchors,
            skip_tests=skip_tests,
            intent=intent,
            state=conversation_state,
        )
        return QueryPlan(
            raw=raw,
            normalized=normalized,
            sub_intent=sub_intent,
            negations=sorted(negations),
            expansions=expansions,
            keyword_hints=keyword_hints,
            anchors=cleaned_anchors,
        )
    def _merge_anchors(
        self,
        raw: str,
        anchors: list[QueryAnchor],
        state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str,
        intent: str,
    ) -> list[QueryAnchor]:
        """Merge user anchors with carried-over conversation anchors, deduplicated."""
        has_user_symbol = any(anchor.type == "SYMBOL" and anchor.source == "user_text" for anchor in anchors)
        has_user_file = any(anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in anchors)
        inherited = self._carryover.build(
            raw,
            state,
            continue_mode=continue_mode,
            has_user_symbol=has_user_symbol,
            has_user_file_path=has_user_file,
        )
        # Anchor-less SWITCH into DOCS_QA keeps the previous symbol as context.
        if (
            conversation_mode == "SWITCH"
            and intent == "DOCS_QA"
            and not has_user_file
            and not has_user_symbol
            and state.active_symbol
        ):
            inherited.append(
                QueryAnchor(
                    type="SYMBOL",
                    value=state.active_symbol,
                    source="conversation_state",
                    span=None,
                    confidence=0.62,
                )
            )
        return self._dedupe(anchors + inherited)
    def _expansions(self, normalized: str, anchors: list[QueryAnchor], *, skip_tests: bool) -> list[str]:
        """Return up to 16 expansion terms from the RU->EN mapper plus symbol anchors."""
        values = self._mapper.expand(normalized)
        has_symbol = any(anchor.type == "SYMBOL" for anchor in anchors)
        # With a concrete symbol present, generic "def"/"class" expansions add noise.
        if has_symbol:
            values = [value for value in values if value.lower() not in {"def", "class"}]
        if not skip_tests and has_test_focus(normalized):
            for candidate in ("test", "unit test"):
                if candidate not in values:
                    values.append(candidate)
        for anchor in anchors:
            if anchor.type == "SYMBOL" and anchor.value not in values:
                values.append(anchor.value)
        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        return values[:16]
    def _keyword_hints(
        self,
        raw: str,
        normalized: str,
        anchors: list[QueryAnchor],
        *,
        skip_tests: bool,
        intent: str,
        state: ConversationState,
    ) -> list[str]:
        """Build, filter and sanitize keyword hints; capped at 5 entries."""
        values = self._keyword_hints_builder.build(normalized)
        for anchor in anchors:
            if anchor.type not in {"FILE_PATH", "SYMBOL"}:
                continue
            candidate = anchor.value
            if candidate not in values:
                values.append(candidate)
        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        sanitized = self._keyword_hint_sanitizer.sanitize(raw, anchors, values)
        # Docs questions without surviving hints fall back to expansion terms
        # and the previously active symbol.
        if intent == "DOCS_QA" and not sanitized:
            fallback = list(dict.fromkeys([*self._expansions(normalized, anchors, skip_tests=skip_tests)]))
            sanitized = fallback[:3]
            if state.active_symbol and state.active_symbol not in sanitized:
                sanitized.append(state.active_symbol)
            sanitized = sanitized[:5]
        return sanitized
    def _remove_negated_test_terms(self, skip_tests: bool, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Drop test-related KEY_TERM/SYMBOL anchors when the user excluded tests."""
        if not skip_tests:
            return anchors
        result: list[QueryAnchor] = []
        for anchor in anchors:
            if anchor.type not in {"KEY_TERM", "SYMBOL"}:
                result.append(anchor)
                continue
            if is_test_related_token(anchor.value):
                continue
            result.append(anchor)
        return result
    def _dedupe(self, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Remove duplicates by (type, value, subtype, source), preserving order."""
        result: list[QueryAnchor] = []
        seen: set[tuple[str, str, str | None, str]] = set()
        for anchor in anchors:
            key = (anchor.type, anchor.value, anchor.subtype, anchor.source)
            if key in seen:
                continue
            seen.add(key)
            result.append(anchor)
        return result
    def _resolve_sub_intent(
        self,
        candidate: str,
        raw: str,
        anchors: list[QueryAnchor],
        *,
        intent: str,
        negations: set[str],
    ) -> str:
        """Upgrade a generic EXPLAIN to EXPLAIN_LOCAL for narrowly scoped code questions."""
        if candidate != "EXPLAIN":
            return candidate
        if intent != "CODE_QA":
            return candidate
        text = " ".join((raw or "").lower().split())
        has_symbol = any(anchor.type == "SYMBOL" and anchor.confidence >= 0.6 for anchor in anchors)
        has_file = any(anchor.type == "FILE_PATH" and self._looks_like_file(anchor.value) and anchor.confidence >= 0.6 for anchor in anchors)
        has_user_anchor = any(anchor.source == "user_text" for anchor in anchors)
        is_why = any(marker in text for marker in self._WHY_MARKERS)
        is_next_steps = any(marker in text for marker in self._NEXT_STEP_MARKERS)
        is_short_generic = len(text.split()) <= 4 and text.endswith("?")
        if (is_why and has_file and has_symbol) or ((is_next_steps or is_short_generic) and has_file):
            return "EXPLAIN_LOCAL"
        if "tests" in negations and not has_user_anchor and (has_file or has_symbol):
            return "EXPLAIN_LOCAL"
        return candidate
    def _looks_like_file(self, value: str) -> bool:
        """A path whose last segment contains a dot is treated as a file."""
        tail = (value or "").rsplit("/", 1)[-1]
        return "." in tail

View File

@@ -0,0 +1,111 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.models import (
CodeRetrievalFilters,
ConversationState,
DocsRetrievalFilters,
HybridRetrievalFilters,
QueryAnchor,
RepoContext,
)
from app.modules.rag.intent_router_v2.test_signals import has_test_focus, is_negative_test_request, is_test_related_token
class RetrievalFilterBuilder:
    """Builds domain-specific retrieval filters from anchors and conversation context."""

    def build(
        self,
        domains: list[str],
        anchors: list[QueryAnchor],
        repo_context: RepoContext,
        *,
        raw_query: str,
        conversation_state: ConversationState | None,
        conversation_mode: str,
        sub_intent: str = "EXPLAIN",
    ) -> CodeRetrievalFilters | DocsRetrievalFilters | HybridRetrievalFilters:
        """Return DOCS, CODE or HYBRID filters depending on *domains*."""
        path_scope = self._path_scope(
            anchors,
            conversation_state=conversation_state,
            conversation_mode=conversation_mode,
            raw_query=raw_query,
            sub_intent=sub_intent,
        )
        if domains == ["DOCS"]:
            return DocsRetrievalFilters(
                path_scope=path_scope,
                doc_kinds=self._doc_kinds(anchors, raw_query),
                doc_language=[],
            )
        if domains == ["CODE"]:
            return CodeRetrievalFilters(
                test_policy=self._test_policy(raw_query, anchors),
                path_scope=path_scope,
                language=list(repo_context.languages),
            )
        # Mixed (or empty) domain lists get hybrid filters covering both sides.
        return HybridRetrievalFilters(
            test_policy=self._test_policy(raw_query, anchors),
            path_scope=path_scope,
            language=list(repo_context.languages),
            doc_kinds=self._doc_kinds(anchors, raw_query),
            doc_language=[],
        )
    def _test_policy(self, raw_query: str, anchors: list[QueryAnchor]) -> str:
        """EXCLUDE tests by default; INCLUDE only on explicit test focus."""
        if is_negative_test_request(raw_query):
            return "EXCLUDE"
        if has_test_focus(raw_query):
            return "INCLUDE"
        has_test_keyterm = any(anchor.type == "KEY_TERM" and is_test_related_token(anchor.value) for anchor in anchors)
        return "INCLUDE" if has_test_keyterm else "EXCLUDE"
    def _path_scope(
        self,
        anchors: list[QueryAnchor],
        *,
        conversation_state: ConversationState | None,
        conversation_mode: str,
        raw_query: str,
        sub_intent: str,
    ) -> list[str]:
        """Derive up to six path-scope entries from anchors, inheriting on CONTINUE."""
        values: list[str] = []
        has_user_file_anchor = False
        file_values: list[str] = []
        for anchor in anchors:
            if anchor.type != "FILE_PATH":
                continue
            if anchor.source == "user_text":
                has_user_file_anchor = True
            if anchor.value not in values:
                values.append(anchor.value)
            if self._looks_like_file_path(anchor.value) and anchor.value not in file_values:
                file_values.append(anchor.value)
            # Also scope to the file's parent directory for nearby context.
            parent = anchor.value.rsplit("/", 1)[0] if "/" in anchor.value and self._looks_like_file_path(anchor.value) else ""
            if parent and parent not in values:
                values.append(parent)
        # File-local sub-intents narrow the scope to the concrete files only,
        # unless the user explicitly asked about a directory.
        if sub_intent in {"OPEN_FILE", "EXPLAIN_LOCAL"} and file_values and not self._is_explicit_directory_scope(raw_query):
            return file_values[:6]
        if has_user_file_anchor or conversation_mode != "CONTINUE":
            return values[:6]
        if values:
            return values[:6]
        # CONTINUE without any anchors: inherit the previous turn's scope.
        inherited = list((conversation_state.active_path_scope if conversation_state else []) or [])
        return inherited[:6]
    def _doc_kinds(self, anchors: list[QueryAnchor], raw_query: str) -> list[str]:
        """Currently only detects README requests (by text or anchor value)."""
        text = (raw_query or "").lower()
        kinds: list[str] = []
        has_readme = "readme" in text or any(
            anchor.type in {"DOC_REF", "FILE_PATH"} and anchor.value.lower().endswith("readme.md")
            for anchor in anchors
        )
        if has_readme:
            kinds.append("README")
        return kinds
    def _looks_like_file_path(self, value: str) -> bool:
        """A path whose last segment contains a dot is treated as a file."""
        filename = value.rsplit("/", 1)[-1]
        return "." in filename
    def _is_explicit_directory_scope(self, raw_query: str) -> bool:
        """True when the user explicitly referenced a folder/directory."""
        text = (raw_query or "").lower()
        return any(marker in text for marker in ("в папке", "в директории", "в каталоге"))

View File

@@ -0,0 +1,118 @@
from __future__ import annotations
from app.modules.rag.contracts.enums import RagLayer
from app.modules.rag.intent_router_v2.layer_query_builder import LayerQueryBuilder
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor, RepoContext, RetrievalSpec
from app.modules.rag.intent_router_v2.retrieval_filter_builder import RetrievalFilterBuilder
class RetrievalSpecFactory:
    """Builds a RetrievalSpec (domains, layer queries, filters, rerank profile) per intent."""

    # Per-intent layer plans: (RagLayer, top_k) pairs in retrieval order.
    _LAYERS = {
        "CODE_QA": [
            (RagLayer.CODE_ENTRYPOINTS, 6),
            (RagLayer.CODE_SYMBOL_CATALOG, 8),
            (RagLayer.CODE_DEPENDENCY_GRAPH, 6),
            (RagLayer.CODE_SOURCE_CHUNKS, 8),
        ],
        "DOCS_QA": [
            (RagLayer.DOCS_MODULE_CATALOG, 5),
            (RagLayer.DOCS_FACT_INDEX, 8),
            (RagLayer.DOCS_SECTION_INDEX, 8),
            (RagLayer.DOCS_POLICY_INDEX, 4),
        ],
        "GENERATE_DOCS_FROM_CODE": [
            (RagLayer.CODE_SYMBOL_CATALOG, 12),
            (RagLayer.CODE_DEPENDENCY_GRAPH, 8),
            (RagLayer.CODE_SOURCE_CHUNKS, 12),
            (RagLayer.CODE_ENTRYPOINTS, 6),
        ],
        "PROJECT_MISC": [
            (RagLayer.DOCS_MODULE_CATALOG, 4),
            (RagLayer.DOCS_SECTION_INDEX, 6),
            (RagLayer.CODE_SYMBOL_CATALOG, 4),
            (RagLayer.CODE_SOURCE_CHUNKS, 4),
        ],
    }
    _DOMAINS = {
        "CODE_QA": ["CODE"],
        "DOCS_QA": ["DOCS"],
        "GENERATE_DOCS_FROM_CODE": ["CODE"],
        "PROJECT_MISC": ["CODE", "DOCS"],
    }
    _RERANK = {
        "CODE_QA": "code",
        "DOCS_QA": "docs",
        "GENERATE_DOCS_FROM_CODE": "generate",
        "PROJECT_MISC": "project",
    }
    # Narrow layer plans used when a CODE_QA sub-intent focuses on specific files.
    _OPEN_FILE_LAYERS = [
        (RagLayer.CODE_SOURCE_CHUNKS, 12),
    ]
    _OPEN_FILE_WITH_SYMBOL_LAYERS = [
        (RagLayer.CODE_SOURCE_CHUNKS, 12),
        (RagLayer.CODE_SYMBOL_CATALOG, 6),
    ]
    _EXPLAIN_LOCAL_LAYERS = [
        (RagLayer.CODE_SOURCE_CHUNKS, 12),
        (RagLayer.CODE_SYMBOL_CATALOG, 8),
        (RagLayer.CODE_DEPENDENCY_GRAPH, 4),
    ]
    def __init__(
        self,
        layer_builder: LayerQueryBuilder | None = None,
        filter_builder: RetrievalFilterBuilder | None = None,
    ) -> None:
        """Collaborators are optional; defaults are created when omitted."""
        self._layer_builder = layer_builder or LayerQueryBuilder()
        self._filter_builder = filter_builder or RetrievalFilterBuilder()
    def build(
        self,
        intent: str,
        anchors: list[QueryAnchor],
        repo_context: RepoContext,
        *,
        raw_query: str = "",
        conversation_state: ConversationState | None = None,
        conversation_mode: str = "START",
        sub_intent: str = "EXPLAIN",
    ) -> RetrievalSpec:
        """Assemble the full retrieval spec for one routed turn."""
        domains = self._domains(intent, repo_context)
        layers_map = self._with_sub_intent_layers(intent, sub_intent, anchors)
        layer_queries = self._layer_builder.build(intent, repo_context, domains=domains, layers_map=layers_map)
        filters = self._filter_builder.build(
            domains,
            anchors,
            repo_context,
            raw_query=raw_query,
            conversation_state=conversation_state,
            conversation_mode=conversation_mode,
            sub_intent=sub_intent,
        )
        return RetrievalSpec(
            domains=domains,
            layer_queries=layer_queries,
            filters=filters,
            rerank_profile=self._RERANK[intent],
        )
    def _domains(self, intent: str, repo_context: RepoContext) -> list[str]:
        """Intersect the intent's domains with what the repo offers; never empty."""
        available = set(repo_context.available_domains or ["CODE", "DOCS"])
        result = [domain for domain in self._DOMAINS[intent] if domain in available]
        return result or list(self._DOMAINS[intent])
    def _with_sub_intent_layers(
        self,
        intent: str,
        sub_intent: str,
        anchors: list[QueryAnchor],
    ) -> dict[str, list[tuple[str, int]]]:
        """Override the CODE_QA layer plan for file-focused sub-intents."""
        if intent != "CODE_QA":
            return self._LAYERS
        layers_map = dict(self._LAYERS)
        if sub_intent == "OPEN_FILE":
            has_symbol = any(anchor.type == "SYMBOL" and anchor.source == "user_text" for anchor in anchors)
            layers_map["CODE_QA"] = list(self._OPEN_FILE_WITH_SYMBOL_LAYERS if has_symbol else self._OPEN_FILE_LAYERS)
        elif sub_intent == "EXPLAIN_LOCAL":
            layers_map["CODE_QA"] = list(self._EXPLAIN_LOCAL_LAYERS)
        return layers_map

View File

@@ -0,0 +1,72 @@
from __future__ import annotations
from app.modules.rag.intent_router_v2.classifier import IntentClassifierV2
from app.modules.rag.intent_router_v2.conversation_policy import ConversationPolicy
from app.modules.rag.intent_router_v2.evidence_policy_factory import EvidencePolicyFactory
from app.modules.rag.intent_router_v2.graph_id_resolver import GraphIdResolver
from app.modules.rag.intent_router_v2.logger import IntentRouterLogger
from app.modules.rag.intent_router_v2.models import ConversationState, IntentRouterResult, RepoContext
from app.modules.rag.intent_router_v2.query_plan_builder import QueryPlanBuilder
from app.modules.rag.intent_router_v2.retrieval_spec_factory import RetrievalSpecFactory
class IntentRouterV2:
    """Top-level router: classifies a turn and produces the full routing result."""

    def __init__(
        self,
        classifier: IntentClassifierV2 | None = None,
        conversation_policy: ConversationPolicy | None = None,
        query_plan_builder: QueryPlanBuilder | None = None,
        retrieval_factory: RetrievalSpecFactory | None = None,
        evidence_factory: EvidencePolicyFactory | None = None,
        graph_resolver: GraphIdResolver | None = None,
        logger: IntentRouterLogger | None = None,
    ) -> None:
        """All collaborators are optional; defaults are constructed when omitted."""
        self._classifier = classifier or IntentClassifierV2()
        self._conversation_policy = conversation_policy or ConversationPolicy()
        self._query_plan_builder = query_plan_builder or QueryPlanBuilder()
        self._retrieval_factory = retrieval_factory or RetrievalSpecFactory()
        self._evidence_factory = evidence_factory or EvidencePolicyFactory()
        self._graph_resolver = graph_resolver or GraphIdResolver()
        self._logger = logger or IntentRouterLogger()
    def route(
        self,
        user_query: str,
        conversation_state: ConversationState | None = None,
        repo_context: RepoContext | None = None,
    ) -> IntentRouterResult:
        """Route one user turn: classify, resolve conversation mode, plan retrieval.

        Missing state/context default to empty instances. The result is logged
        on the way in and out.
        """
        state = conversation_state or ConversationState()
        context = repo_context or RepoContext()
        self._logger.log_request(user_query, state, context)
        decision = self._classifier.classify(user_query, state)
        # The policy may override the classifier's intent and sets START/CONTINUE/SWITCH.
        intent, conversation_mode = self._conversation_policy.resolve(decision, user_query, state)
        query_plan = self._query_plan_builder.build(
            user_query,
            state,
            continue_mode=conversation_mode == "CONTINUE",
            conversation_mode=conversation_mode,
            intent=intent,
        )
        result = IntentRouterResult(
            intent=intent,
            graph_id=self._graph_resolver.resolve(intent),
            conversation_mode=conversation_mode,
            query_plan=query_plan,
            retrieval_spec=self._retrieval_factory.build(
                intent,
                query_plan.anchors,
                context,
                raw_query=query_plan.raw,
                conversation_state=state,
                conversation_mode=conversation_mode,
                sub_intent=query_plan.sub_intent,
            ),
            evidence_policy=self._evidence_factory.build(
                intent,
                sub_intent=query_plan.sub_intent,
                negations=query_plan.negations,
                has_user_anchor=any(anchor.source == "user_text" for anchor in query_plan.anchors),
            ),
        )
        self._logger.log_result(result)
        return result

View File

@@ -0,0 +1,23 @@
from __future__ import annotations
class SubIntentDetector:
    """Heuristic detector of the CODE_QA sub-intent for a raw user query.

    Returns one of "OPEN_FILE", "FIND_TESTS" or "EXPLAIN" (the default)
    based on verb/marker substrings in the normalized query text.
    """

    _OPEN_VERBS = ("открой", "посмотри", "проверь", "уточни")
    _EXPLAIN_MARKERS = ("объясни", "как работает", "почему", "что делает", "зачем", "логика", "флоу", "flow")
    _TEST_MARKERS = ("тест", "pytest", "unit test", "юнит")

    def detect(self, raw: str, *, has_file_path: bool, negations: set[str]) -> str:
        """Classify *raw*; ``negations`` may carry "tests" to suppress FIND_TESTS."""
        # Collapse whitespace and lowercase once; empty input -> EXPLAIN.
        normalized = " ".join((raw or "").lower().split())
        if not normalized:
            return "EXPLAIN"
        # A file path plus an "open"-style verb means the user wants the
        # file itself — unless the query also asks for an explanation.
        if has_file_path and self._has_open_verb(normalized):
            if not self._has_explain_markers(normalized):
                return "OPEN_FILE"
        # Test markers only count when tests were not explicitly negated.
        if "tests" not in negations:
            for marker in self._TEST_MARKERS:
                if marker in normalized:
                    return "FIND_TESTS"
        return "EXPLAIN"

    def _has_open_verb(self, text: str) -> bool:
        # A verb counts at the start of the query or as a standalone word.
        padded = f" {text} "
        for verb in self._OPEN_VERBS:
            if text.startswith(verb) or f" {verb} " in padded:
                return True
        return False

    def _has_explain_markers(self, text: str) -> bool:
        return any(marker in text for marker in self._EXPLAIN_MARKERS)

View File

@@ -0,0 +1,51 @@
from __future__ import annotations
# Python keywords, stored lowercased (note "false"/"none"/"true") —
# presumably compared against already-lowered tokens; verify against callers.
PY_KEYWORDS: set[str] = set(
    "and as assert async await break class continue def del elif else "
    "except false finally for from global if import in is lambda none "
    "nonlocal not or pass raise return true try while with yield".split()
)

# Path segments too generic to serve as useful retrieval anchors.
COMMON_PATH_SEGMENTS: set[str] = set(
    "app src docs tests module modules core pkg lib".split()
)

View File

@@ -0,0 +1,67 @@
from __future__ import annotations
import re
from app.modules.rag.intent_router_v2.normalization_terms import KeyTermCanonicalizer
# Tokens are runs of Latin/Cyrillic letters; hyphen is allowed so that
# compounds like "юнит-тест" stay a single token.
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё-]+")


class RuEnTermMapper:
    """Expands RU/EN programming vocabulary into English retrieval keywords."""

    # Canonical Russian term -> English expansion targets.
    _CANONICAL_MAP = {
        "класс": ["class"],
        "метод": ["method"],
        "функция": ["function", "def"],
        "модуль": ["module"],
        "пакет": ["package"],
        "файл": ["file"],
        "тест": ["test", "unit test"],
        "документация": ["documentation", "docs"],
        "readme": ["readme"],
    }
    # English surface forms -> expansion targets (covers plural/synonym forms).
    _ENGLISH_SOURCES = {
        "class": ["class"],
        "method": ["method"],
        "function": ["function", "def"],
        "module": ["module"],
        "package": ["package"],
        "file": ["file"],
        "test": ["test", "unit test"],
        "tests": ["test", "unit test"],
        "documentation": ["documentation", "docs"],
        "docs": ["documentation", "docs"],
        "readme": ["readme"],
        "def": ["def"],
    }

    def __init__(self, canonicalizer: KeyTermCanonicalizer | None = None) -> None:
        self._canonicalizer = canonicalizer or KeyTermCanonicalizer()

    def expand(self, text: str) -> list[str]:
        """Return ordered, de-duplicated English expansions for *text*."""
        lowered = (text or "").lower()
        # A dict keeps first-occurrence order while de-duplicating.
        ordered: dict[str, None] = {}
        for token in _WORD_RE.findall(lowered):
            canonical = self._canonicalizer.canonicalize(token) or token
            for term in self._CANONICAL_MAP.get(canonical, []):
                ordered.setdefault(term, None)
            for term in self._ENGLISH_SOURCES.get(token, []):
                ordered.setdefault(term, None)
        # "unit test(s)" is two tokens, so the token loop misses it.
        if "unit test" in lowered or "unit tests" in lowered:
            for term in self._ENGLISH_SOURCES["test"]:
                ordered.setdefault(term, None)
        return list(ordered)

    def key_terms(self) -> tuple[str, ...]:
        """Canonical Russian key terms, in declaration order."""
        return tuple(self._CANONICAL_MAP.keys())

    def all_literal_terms(self) -> tuple[str, ...]:
        """Sorted union of aliases, map keys and lowered expansion targets."""
        literals = set(self._canonicalizer.aliases())
        literals.update(self._CANONICAL_MAP.keys())
        literals.update(self._ENGLISH_SOURCES.keys())
        for targets in self._CANONICAL_MAP.values():
            literals.update(target.lower() for target in targets)
        for targets in self._ENGLISH_SOURCES.values():
            literals.update(target.lower() for target in targets)
        return tuple(sorted(literals))

View File

@@ -0,0 +1,40 @@
from __future__ import annotations
import re
# Matches a negation word followed (within a short window, not crossing a
# sentence boundary) by a "тест..." stem, e.g. "не про тесты", "без тестов".
# NOTE: the original pattern wrote "\ест"; the backslash before the Cyrillic
# "е" was a no-op literal escape (re treats non-ASCII letter escapes as
# literals) and has been removed — the compiled pattern is unchanged.
_NEGATIVE_TEST_RE = re.compile(r"\b(?:не|без|кроме)\b[^.?!]{0,28}ест", re.IGNORECASE)

# Literal phrases that also signal "exclude tests / production code only".
_NEGATIVE_TEST_MARKERS = ("не про тест", "без тест", "кроме тест", "про прод код", "только прод", "production code")

# Substrings that signal the query is about tests.
_POSITIVE_TEST_MARKERS = (
    "тест",
    "tests",
    "pytest",
    "unit test",
    "unit tests",
    "тестиру",
)

# Exact token forms considered test-related.
_TEST_TERMS = {"тест", "тесты", "test", "tests", "pytest", "unit", "unit test", "юнит-тест", "юниттест"}


def is_negative_test_request(text: str) -> bool:
    """Return True when *text* explicitly excludes tests from the request.

    Matches either the negation regex or any literal negative marker.
    ``None``/empty input is treated as not negative.
    """
    lowered = (text or "").lower()
    if _NEGATIVE_TEST_RE.search(lowered):
        return True
    return any(marker in lowered for marker in _NEGATIVE_TEST_MARKERS)


def has_test_focus(text: str) -> bool:
    """Return True when *text* is about tests and does not negate them.

    A negative test request (see ``is_negative_test_request``) always wins
    over positive markers.
    """
    lowered = (text or "").lower()
    if is_negative_test_request(lowered):
        return False
    return any(marker in lowered for marker in _POSITIVE_TEST_MARKERS)


def is_test_related_token(value: str) -> bool:
    """Return True for tokens that look test-related.

    A token qualifies when it is an exact known test term or starts with
    "test"/"тест". Empty or whitespace-only input returns False.
    """
    lowered = (value or "").lower().strip()
    if not lowered:
        return False
    # startswith accepts a tuple of prefixes — one call covers both stems.
    return lowered in _TEST_TERMS or lowered.startswith(("test", "тест"))

View File

@@ -46,7 +46,6 @@ class RagDocumentRepository:
for doc in docs:
row = doc.to_record()
metadata = row["metadata"]
links = row["links"]
emb = row["embedding"] or []
emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
conn.execute(
@@ -55,15 +54,15 @@ class RagDocumentRepository:
INSERT INTO rag_chunks (
rag_session_id, path, chunk_index, content, embedding, artifact_type, section, doc_id,
doc_version, owner, system_component, last_modified, staleness_score, created_at, updated_at,
rag_doc_id, layer, lang, repo_id, commit_sha, title, metadata_json, links_json, span_start,
span_end, symbol_id, qname, kind, framework, entrypoint_type, module_id, section_path, doc_kind
layer, lang, repo_id, commit_sha, title, metadata_json, span_start, span_end, symbol_id,
qname, kind, framework, entrypoint_type, module_id, section_path, doc_kind
)
VALUES (
:sid, :path, :chunk_index, :content, CAST(:emb AS vector), :artifact_type, :section, :doc_id,
:doc_version, :owner, :system_component, :last_modified, :staleness_score, CURRENT_TIMESTAMP,
CURRENT_TIMESTAMP, :rag_doc_id, :layer, :lang, :repo_id, :commit_sha, :title, :metadata_json,
:links_json, :span_start, :span_end, :symbol_id, :qname, :kind, :framework, :entrypoint_type,
:module_id, :section_path, :doc_kind
CURRENT_TIMESTAMP, :layer, :lang, :repo_id, :commit_sha, :title, :metadata_json,
:span_start, :span_end, :symbol_id, :qname, :kind, :framework, :entrypoint_type, :module_id,
:section_path, :doc_kind
)
"""
),
@@ -81,14 +80,12 @@ class RagDocumentRepository:
"system_component": metadata.get("system_component"),
"last_modified": metadata.get("last_modified"),
"staleness_score": metadata.get("staleness_score"),
"rag_doc_id": row["doc_id"],
"layer": row["layer"],
"lang": row["lang"],
"repo_id": row["repo_id"],
"commit_sha": row["commit_sha"],
"title": row["title"],
"metadata_json": json.dumps(metadata, ensure_ascii=True),
"links_json": json.dumps(links, ensure_ascii=True),
"span_start": row["span_start"],
"span_end": row["span_end"],
"symbol_id": metadata.get("symbol_id"),

View File

@@ -4,11 +4,14 @@ import json
from sqlalchemy import text
from app.modules.rag.retrieval.query_terms import extract_query_terms
from app.modules.rag.persistence.retrieval_statement_builder import RetrievalStatementBuilder
from app.modules.shared.db import get_engine
class RagQueryRepository:
def __init__(self) -> None:
    """Initialize the repository with a default retrieval statement builder."""
    # The builder turns retrieval parameters into SQL text + bind params.
    self._builder = RetrievalStatementBuilder()
def retrieve(
self,
rag_session_id: str,
@@ -18,89 +21,47 @@ class RagQueryRepository:
limit: int = 5,
layers: list[str] | None = None,
path_prefixes: list[str] | None = None,
exclude_path_prefixes: list[str] | None = None,
exclude_like_patterns: list[str] | None = None,
prefer_non_tests: bool = False,
) -> list[dict]:
emb = "[" + ",".join(str(x) for x in query_embedding) + "]"
filters = ["rag_session_id = :sid"]
params: dict = {"sid": rag_session_id, "emb": emb, "lim": limit}
if layers:
filters.append("layer = ANY(:layers)")
params["layers"] = layers
if path_prefixes:
or_filters = []
for idx, prefix in enumerate(path_prefixes):
key = f"path_{idx}"
params[key] = f"{prefix}%"
or_filters.append(f"path LIKE :{key}")
filters.append("(" + " OR ".join(or_filters) + ")")
term_filters = []
terms = extract_query_terms(query_text)
for idx, term in enumerate(terms):
exact_key = f"term_exact_{idx}"
prefix_key = f"term_prefix_{idx}"
contains_key = f"term_contains_{idx}"
params[exact_key] = term
params[prefix_key] = f"{term}%"
params[contains_key] = f"%{term}%"
term_filters.append(
"CASE "
f"WHEN lower(COALESCE(qname, '')) = :{exact_key} THEN 0 "
f"WHEN lower(COALESCE(symbol_id, '')) = :{exact_key} THEN 1 "
f"WHEN lower(COALESCE(title, '')) = :{exact_key} THEN 2 "
f"WHEN lower(COALESCE(qname, '')) LIKE :{prefix_key} THEN 3 "
f"WHEN lower(COALESCE(title, '')) LIKE :{prefix_key} THEN 4 "
f"WHEN lower(COALESCE(path, '')) LIKE :{contains_key} THEN 5 "
f"WHEN lower(COALESCE(content, '')) LIKE :{contains_key} THEN 6 "
"ELSE 100 END"
)
lexical_sql = "LEAST(" + ", ".join(term_filters) + ")" if term_filters else "100"
test_penalty_sql = (
"CASE "
"WHEN lower(path) LIKE 'tests/%' OR lower(path) LIKE '%/tests/%' OR lower(path) LIKE 'test_%' OR lower(path) LIKE '%/test_%' "
"THEN 1 ELSE 0 END"
if prefer_non_tests
else "0"
sql, params = self._builder.build_retrieve(
rag_session_id,
query_embedding,
query_text=query_text,
limit=limit,
layers=layers,
path_prefixes=path_prefixes,
exclude_path_prefixes=exclude_path_prefixes,
exclude_like_patterns=exclude_like_patterns,
prefer_non_tests=prefer_non_tests,
)
layer_rank_sql = (
"CASE "
"WHEN layer = 'C3_ENTRYPOINTS' THEN 0 "
"WHEN layer = 'C1_SYMBOL_CATALOG' THEN 1 "
"WHEN layer = 'C2_DEPENDENCY_GRAPH' THEN 2 "
"WHEN layer = 'C0_SOURCE_CHUNKS' THEN 3 "
"WHEN layer = 'D1_MODULE_CATALOG' THEN 0 "
"WHEN layer = 'D2_FACT_INDEX' THEN 1 "
"WHEN layer = 'D3_SECTION_INDEX' THEN 2 "
"WHEN layer = 'D4_POLICY_INDEX' THEN 3 "
"ELSE 10 END"
)
sql = f"""
SELECT path, content, layer, title, metadata_json, span_start, span_end,
{lexical_sql} AS lexical_rank,
{test_penalty_sql} AS test_penalty,
{layer_rank_sql} AS layer_rank,
(embedding <=> CAST(:emb AS vector)) AS distance
FROM rag_chunks
WHERE {' AND '.join(filters)}
ORDER BY lexical_rank ASC, test_penalty ASC, layer_rank ASC, embedding <=> CAST(:emb AS vector)
LIMIT :lim
"""
with get_engine().connect() as conn:
rows = conn.execute(text(sql), params).mappings().fetchall()
return [self._row_to_dict(row) for row in rows]
def fallback_chunks(self, rag_session_id: str, *, limit: int = 5, layers: list[str] | None = None) -> list[dict]:
filters = ["rag_session_id = :sid"]
params: dict = {"sid": rag_session_id, "lim": limit}
if layers:
filters.append("layer = ANY(:layers)")
params["layers"] = layers
sql = f"""
SELECT path, content, layer, title, metadata_json, span_start, span_end
FROM rag_chunks
WHERE {' AND '.join(filters)}
ORDER BY id DESC
LIMIT :lim
"""
def retrieve_lexical_code(
    self,
    rag_session_id: str,
    *,
    query_text: str,
    limit: int = 5,
    path_prefixes: list[str] | None = None,
    exclude_path_prefixes: list[str] | None = None,
    exclude_like_patterns: list[str] | None = None,
    prefer_non_tests: bool = False,
) -> list[dict]:
    """Run a lexical-only code search for the given RAG session.

    SQL construction is delegated to the statement builder; when the
    builder yields no statement (``sql is None``) the database is not
    touched and an empty list is returned.

    Args:
        rag_session_id: Session whose chunks are searched.
        query_text: Raw query text the builder extracts terms from.
        limit: Maximum number of rows to return.
        path_prefixes: Optional allow-list of path prefixes.
        exclude_path_prefixes: Optional deny-list of path prefixes.
        exclude_like_patterns: Optional SQL LIKE patterns to exclude.
        prefer_non_tests: Rank/filter preference passed to the builder —
            presumably de-prioritizes test files; confirm in the builder.

    Returns:
        Matched chunks converted to dicts via ``_row_to_dict``.
    """
    sql, params = self._builder.build_lexical_code(
        rag_session_id,
        query_text=query_text,
        limit=limit,
        path_prefixes=path_prefixes,
        exclude_path_prefixes=exclude_path_prefixes,
        exclude_like_patterns=exclude_like_patterns,
        prefer_non_tests=prefer_non_tests,
    )
    if sql is None:
        # Builder signals "nothing lexical to search for" — skip the query.
        return []
    with get_engine().connect() as conn:
        rows = conn.execute(text(sql), params).mappings().fetchall()
    return [self._row_to_dict(row) for row in rows]

Some files were not shown because too many files have changed in this diff Show More