Фиксация изменений
This commit is contained in:
@@ -90,6 +90,41 @@ sequenceDiagram
|
||||
Rag-->>Agent: items
|
||||
```
|
||||
|
||||
### Retrieval + project/qa reasoning
|
||||
Назначение: `RAG` вызывается не в начале runtime, а внутри отдельного graph-шага `context_retrieval` для `project/qa`.
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant Agent as GraphAgentRuntime
|
||||
participant Orch as OrchestratorService
|
||||
participant G1 as conversation_understanding
|
||||
participant G2 as question_classification
|
||||
participant G3 as context_retrieval
|
||||
participant Rag as RagService
|
||||
participant G4 as context_analysis
|
||||
participant G5 as answer_composition
|
||||
|
||||
Agent->>Orch: run(task)
|
||||
Orch->>G1: execute
|
||||
G1-->>Orch: resolved_request
|
||||
Orch->>G2: execute
|
||||
G2-->>Orch: question_profile
|
||||
Orch->>G3: execute
|
||||
G3->>Rag: retrieve(query)
|
||||
Rag-->>G3: rag_items
|
||||
G3-->>Orch: source_bundle
|
||||
Orch->>G4: execute
|
||||
G4-->>Orch: analysis_brief
|
||||
Orch->>G5: execute
|
||||
G5-->>Orch: final_answer
|
||||
Orch-->>Agent: final_answer
|
||||
```
|
||||
|
||||
Для `project/qa` это означает:
|
||||
- ранний глобальный retrieval больше не нужен;
|
||||
- `RAG` возвращает записи только для конкретного шага `context_retrieval`;
|
||||
- оркестратор управляет цепочкой graph-шагов;
|
||||
- пользовательский ответ собирается после анализа, а не напрямую из сырого retrieval.
|
||||
|
||||
## 5. Слои, фиксируемые в RAG
|
||||
|
||||
### 5.1. Слои DOCS
|
||||
|
||||
36
app/modules/rag/explain/__init__.py
Normal file
36
app/modules/rag/explain/__init__.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from importlib import import_module
|
||||
|
||||
__all__ = [
|
||||
"CodeExcerpt",
|
||||
"CodeExplainRetrieverV2",
|
||||
"CodeGraphRepository",
|
||||
"EvidenceItem",
|
||||
"ExplainIntent",
|
||||
"ExplainIntentBuilder",
|
||||
"ExplainPack",
|
||||
"LayeredRetrievalGateway",
|
||||
"PromptBudgeter",
|
||||
"TracePath",
|
||||
]
|
||||
|
||||
|
||||
def __getattr__(name: str):
|
||||
module_map = {
|
||||
"CodeExcerpt": "app.modules.rag.explain.models",
|
||||
"EvidenceItem": "app.modules.rag.explain.models",
|
||||
"ExplainIntent": "app.modules.rag.explain.models",
|
||||
"ExplainPack": "app.modules.rag.explain.models",
|
||||
"TracePath": "app.modules.rag.explain.models",
|
||||
"ExplainIntentBuilder": "app.modules.rag.explain.intent_builder",
|
||||
"PromptBudgeter": "app.modules.rag.explain.budgeter",
|
||||
"LayeredRetrievalGateway": "app.modules.rag.explain.layered_gateway",
|
||||
"CodeGraphRepository": "app.modules.rag.explain.graph_repository",
|
||||
"CodeExplainRetrieverV2": "app.modules.rag.explain.retriever_v2",
|
||||
}
|
||||
module_name = module_map.get(name)
|
||||
if module_name is None:
|
||||
raise AttributeError(name)
|
||||
module = import_module(module_name)
|
||||
return getattr(module, name)
|
||||
BIN
app/modules/rag/explain/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
app/modules/rag/explain/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
app/modules/rag/explain/__pycache__/budgeter.cpython-312.pyc
Normal file
BIN
app/modules/rag/explain/__pycache__/budgeter.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
app/modules/rag/explain/__pycache__/models.cpython-312.pyc
Normal file
BIN
app/modules/rag/explain/__pycache__/models.cpython-312.pyc
Normal file
Binary file not shown.
BIN
app/modules/rag/explain/__pycache__/retriever_v2.cpython-312.pyc
Normal file
BIN
app/modules/rag/explain/__pycache__/retriever_v2.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
62
app/modules/rag/explain/budgeter.py
Normal file
62
app/modules/rag/explain/budgeter.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
from app.modules.rag.explain.models import ExplainPack
|
||||
|
||||
|
||||
class PromptBudgeter:
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
max_paths: int = 3,
|
||||
max_symbols: int = 25,
|
||||
max_excerpts: int = 40,
|
||||
max_chars: int = 30000,
|
||||
) -> None:
|
||||
self._max_paths = max_paths
|
||||
self._max_symbols = max_symbols
|
||||
self._max_excerpts = max_excerpts
|
||||
self._max_chars = max_chars
|
||||
|
||||
def build_prompt_input(self, question: str, pack: ExplainPack) -> str:
|
||||
symbol_ids: list[str] = []
|
||||
for path in pack.trace_paths[: self._max_paths]:
|
||||
for symbol_id in path.symbol_ids:
|
||||
if symbol_id and symbol_id not in symbol_ids and len(symbol_ids) < self._max_symbols:
|
||||
symbol_ids.append(symbol_id)
|
||||
excerpts = []
|
||||
total_chars = 0
|
||||
for excerpt in pack.code_excerpts:
|
||||
if symbol_ids and excerpt.symbol_id and excerpt.symbol_id not in symbol_ids:
|
||||
continue
|
||||
body = excerpt.content.strip()
|
||||
remaining = self._max_chars - total_chars
|
||||
if remaining <= 0 or len(excerpts) >= self._max_excerpts:
|
||||
break
|
||||
if len(body) > remaining:
|
||||
body = body[:remaining].rstrip() + "...[truncated]"
|
||||
excerpts.append(
|
||||
{
|
||||
"evidence_id": excerpt.evidence_id,
|
||||
"title": excerpt.title,
|
||||
"path": excerpt.path,
|
||||
"start_line": excerpt.start_line,
|
||||
"end_line": excerpt.end_line,
|
||||
"focus": excerpt.focus,
|
||||
"content": body,
|
||||
}
|
||||
)
|
||||
total_chars += len(body)
|
||||
payload = {
|
||||
"question": question,
|
||||
"intent": pack.intent.model_dump(mode="json"),
|
||||
"selected_entrypoints": [item.model_dump(mode="json") for item in pack.selected_entrypoints[:5]],
|
||||
"seed_symbols": [item.model_dump(mode="json") for item in pack.seed_symbols[: self._max_symbols]],
|
||||
"trace_paths": [path.model_dump(mode="json") for path in pack.trace_paths[: self._max_paths]],
|
||||
"evidence_index": {key: value.model_dump(mode="json") for key, value in pack.evidence_index.items()},
|
||||
"code_excerpts": excerpts,
|
||||
"missing": pack.missing,
|
||||
"conflicts": pack.conflicts,
|
||||
}
|
||||
return json.dumps(payload, ensure_ascii=False, indent=2)
|
||||
59
app/modules/rag/explain/excerpt_planner.py
Normal file
59
app/modules/rag/explain/excerpt_planner.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.explain.models import CodeExcerpt, LayeredRetrievalItem
|
||||
|
||||
|
||||
class ExcerptPlanner:
|
||||
_FOCUS_TOKENS = ("raise", "except", "db", "select", "insert", "update", "delete", "http", "publish", "emit")
|
||||
|
||||
def plan(self, chunk: LayeredRetrievalItem, *, evidence_id: str, symbol_id: str | None) -> list[CodeExcerpt]:
|
||||
location = chunk.location
|
||||
if location is None:
|
||||
return []
|
||||
excerpts = [
|
||||
CodeExcerpt(
|
||||
evidence_id=evidence_id,
|
||||
symbol_id=symbol_id,
|
||||
title=chunk.title,
|
||||
path=location.path,
|
||||
start_line=location.start_line,
|
||||
end_line=location.end_line,
|
||||
content=chunk.content.strip(),
|
||||
focus="overview",
|
||||
)
|
||||
]
|
||||
focus = self._focus_excerpt(chunk, evidence_id=evidence_id, symbol_id=symbol_id)
|
||||
if focus is not None:
|
||||
excerpts.append(focus)
|
||||
return excerpts
|
||||
|
||||
def _focus_excerpt(
|
||||
self,
|
||||
chunk: LayeredRetrievalItem,
|
||||
*,
|
||||
evidence_id: str,
|
||||
symbol_id: str | None,
|
||||
) -> CodeExcerpt | None:
|
||||
location = chunk.location
|
||||
if location is None:
|
||||
return None
|
||||
lines = chunk.content.splitlines()
|
||||
for index, line in enumerate(lines):
|
||||
lowered = line.lower()
|
||||
if not any(token in lowered for token in self._FOCUS_TOKENS):
|
||||
continue
|
||||
start = max(0, index - 2)
|
||||
end = min(len(lines), index + 3)
|
||||
if end - start >= len(lines):
|
||||
return None
|
||||
return CodeExcerpt(
|
||||
evidence_id=evidence_id,
|
||||
symbol_id=symbol_id,
|
||||
title=f"{chunk.title}:focus",
|
||||
path=location.path,
|
||||
start_line=(location.start_line or 1) + start,
|
||||
end_line=(location.start_line or 1) + end - 1,
|
||||
content="\n".join(lines[start:end]).strip(),
|
||||
focus="focus",
|
||||
)
|
||||
return None
|
||||
216
app/modules/rag/explain/graph_repository.py
Normal file
216
app/modules/rag/explain/graph_repository.py
Normal file
@@ -0,0 +1,216 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.modules.rag.explain.models import CodeLocation, LayeredRetrievalItem
|
||||
from app.modules.shared.db import get_engine
|
||||
|
||||
|
||||
class CodeGraphRepository:
|
||||
def get_out_edges(
|
||||
self,
|
||||
rag_session_id: str,
|
||||
src_symbol_ids: list[str],
|
||||
edge_types: list[str],
|
||||
limit_per_src: int,
|
||||
) -> list[LayeredRetrievalItem]:
|
||||
if not src_symbol_ids:
|
||||
return []
|
||||
sql = """
|
||||
SELECT path, content, layer, title, metadata_json, span_start, span_end
|
||||
FROM rag_chunks
|
||||
WHERE rag_session_id = :sid
|
||||
AND layer = 'C2_DEPENDENCY_GRAPH'
|
||||
AND CAST(metadata_json AS jsonb)->>'src_symbol_id' = ANY(:src_ids)
|
||||
AND CAST(metadata_json AS jsonb)->>'edge_type' = ANY(:edge_types)
|
||||
ORDER BY path, span_start
|
||||
"""
|
||||
with get_engine().connect() as conn:
|
||||
rows = conn.execute(
|
||||
text(sql),
|
||||
{"sid": rag_session_id, "src_ids": src_symbol_ids, "edge_types": edge_types},
|
||||
).mappings().fetchall()
|
||||
grouped: dict[str, int] = {}
|
||||
items: list[LayeredRetrievalItem] = []
|
||||
for row in rows:
|
||||
metadata = self._loads(row.get("metadata_json"))
|
||||
src_symbol_id = str(metadata.get("src_symbol_id") or "")
|
||||
grouped[src_symbol_id] = grouped.get(src_symbol_id, 0) + 1
|
||||
if grouped[src_symbol_id] > limit_per_src:
|
||||
continue
|
||||
items.append(self._to_item(row, metadata))
|
||||
return items
|
||||
|
||||
def get_in_edges(
|
||||
self,
|
||||
rag_session_id: str,
|
||||
dst_symbol_ids: list[str],
|
||||
edge_types: list[str],
|
||||
limit_per_dst: int,
|
||||
) -> list[LayeredRetrievalItem]:
|
||||
if not dst_symbol_ids:
|
||||
return []
|
||||
sql = """
|
||||
SELECT path, content, layer, title, metadata_json, span_start, span_end
|
||||
FROM rag_chunks
|
||||
WHERE rag_session_id = :sid
|
||||
AND layer = 'C2_DEPENDENCY_GRAPH'
|
||||
AND CAST(metadata_json AS jsonb)->>'dst_symbol_id' = ANY(:dst_ids)
|
||||
AND CAST(metadata_json AS jsonb)->>'edge_type' = ANY(:edge_types)
|
||||
ORDER BY path, span_start
|
||||
"""
|
||||
with get_engine().connect() as conn:
|
||||
rows = conn.execute(
|
||||
text(sql),
|
||||
{"sid": rag_session_id, "dst_ids": dst_symbol_ids, "edge_types": edge_types},
|
||||
).mappings().fetchall()
|
||||
grouped: dict[str, int] = {}
|
||||
items: list[LayeredRetrievalItem] = []
|
||||
for row in rows:
|
||||
metadata = self._loads(row.get("metadata_json"))
|
||||
dst_symbol_id = str(metadata.get("dst_symbol_id") or "")
|
||||
grouped[dst_symbol_id] = grouped.get(dst_symbol_id, 0) + 1
|
||||
if grouped[dst_symbol_id] > limit_per_dst:
|
||||
continue
|
||||
items.append(self._to_item(row, metadata))
|
||||
return items
|
||||
|
||||
def resolve_symbol_by_ref(
|
||||
self,
|
||||
rag_session_id: str,
|
||||
dst_ref: str,
|
||||
package_hint: str | None = None,
|
||||
) -> LayeredRetrievalItem | None:
|
||||
ref = (dst_ref or "").strip()
|
||||
if not ref:
|
||||
return None
|
||||
with get_engine().connect() as conn:
|
||||
rows = conn.execute(
|
||||
text(
|
||||
"""
|
||||
SELECT path, content, layer, title, metadata_json, span_start, span_end, qname
|
||||
FROM rag_chunks
|
||||
WHERE rag_session_id = :sid
|
||||
AND layer = 'C1_SYMBOL_CATALOG'
|
||||
AND (qname = :ref OR title = :ref OR qname LIKE :tail)
|
||||
ORDER BY path
|
||||
LIMIT 12
|
||||
"""
|
||||
),
|
||||
{"sid": rag_session_id, "ref": ref, "tail": f"%{ref}"},
|
||||
).mappings().fetchall()
|
||||
best: LayeredRetrievalItem | None = None
|
||||
best_score = -1
|
||||
for row in rows:
|
||||
metadata = self._loads(row.get("metadata_json"))
|
||||
package = str(metadata.get("package_or_module") or "")
|
||||
score = 0
|
||||
if str(row.get("qname") or "") == ref:
|
||||
score += 3
|
||||
if str(row.get("title") or "") == ref:
|
||||
score += 2
|
||||
if package_hint and package.startswith(package_hint):
|
||||
score += 3
|
||||
if package_hint and package_hint in str(row.get("path") or ""):
|
||||
score += 1
|
||||
if score > best_score:
|
||||
best = self._to_item(row, metadata)
|
||||
best_score = score
|
||||
return best
|
||||
|
||||
def get_symbols_by_ids(self, rag_session_id: str, symbol_ids: list[str]) -> list[LayeredRetrievalItem]:
|
||||
if not symbol_ids:
|
||||
return []
|
||||
with get_engine().connect() as conn:
|
||||
rows = conn.execute(
|
||||
text(
|
||||
"""
|
||||
SELECT path, content, layer, title, metadata_json, span_start, span_end
|
||||
FROM rag_chunks
|
||||
WHERE rag_session_id = :sid
|
||||
AND layer = 'C1_SYMBOL_CATALOG'
|
||||
AND symbol_id = ANY(:symbol_ids)
|
||||
ORDER BY path, span_start
|
||||
"""
|
||||
),
|
||||
{"sid": rag_session_id, "symbol_ids": symbol_ids},
|
||||
).mappings().fetchall()
|
||||
return [self._to_item(row, self._loads(row.get("metadata_json"))) for row in rows]
|
||||
|
||||
def get_chunks_by_symbol_ids(
|
||||
self,
|
||||
rag_session_id: str,
|
||||
symbol_ids: list[str],
|
||||
prefer_chunk_type: str = "symbol_block",
|
||||
) -> list[LayeredRetrievalItem]:
|
||||
symbols = self.get_symbols_by_ids(rag_session_id, symbol_ids)
|
||||
chunks: list[LayeredRetrievalItem] = []
|
||||
for symbol in symbols:
|
||||
location = symbol.location
|
||||
if location is None:
|
||||
continue
|
||||
chunk = self._chunk_for_symbol(rag_session_id, symbol, prefer_chunk_type=prefer_chunk_type)
|
||||
if chunk is not None:
|
||||
chunks.append(chunk)
|
||||
return chunks
|
||||
|
||||
def _chunk_for_symbol(
|
||||
self,
|
||||
rag_session_id: str,
|
||||
symbol: LayeredRetrievalItem,
|
||||
*,
|
||||
prefer_chunk_type: str,
|
||||
) -> LayeredRetrievalItem | None:
|
||||
location = symbol.location
|
||||
if location is None:
|
||||
return None
|
||||
with get_engine().connect() as conn:
|
||||
rows = conn.execute(
|
||||
text(
|
||||
"""
|
||||
SELECT path, content, layer, title, metadata_json, span_start, span_end
|
||||
FROM rag_chunks
|
||||
WHERE rag_session_id = :sid
|
||||
AND layer = 'C0_SOURCE_CHUNKS'
|
||||
AND path = :path
|
||||
AND COALESCE(span_start, 0) <= :end_line
|
||||
AND COALESCE(span_end, 999999) >= :start_line
|
||||
ORDER BY
|
||||
CASE WHEN CAST(metadata_json AS jsonb)->>'chunk_type' = :prefer_chunk_type THEN 0 ELSE 1 END,
|
||||
ABS(COALESCE(span_start, 0) - :start_line)
|
||||
LIMIT 1
|
||||
"""
|
||||
),
|
||||
{
|
||||
"sid": rag_session_id,
|
||||
"path": location.path,
|
||||
"start_line": location.start_line or 0,
|
||||
"end_line": location.end_line or 999999,
|
||||
"prefer_chunk_type": prefer_chunk_type,
|
||||
},
|
||||
).mappings().fetchall()
|
||||
if not rows:
|
||||
return None
|
||||
row = rows[0]
|
||||
return self._to_item(row, self._loads(row.get("metadata_json")))
|
||||
|
||||
def _to_item(self, row, metadata: dict) -> LayeredRetrievalItem:
|
||||
return LayeredRetrievalItem(
|
||||
source=str(row.get("path") or ""),
|
||||
content=str(row.get("content") or ""),
|
||||
layer=str(row.get("layer") or ""),
|
||||
title=str(row.get("title") or ""),
|
||||
metadata=metadata,
|
||||
location=CodeLocation(
|
||||
path=str(row.get("path") or ""),
|
||||
start_line=row.get("span_start"),
|
||||
end_line=row.get("span_end"),
|
||||
),
|
||||
)
|
||||
|
||||
def _loads(self, value) -> dict:
|
||||
if not value:
|
||||
return {}
|
||||
return json.loads(str(value))
|
||||
102
app/modules/rag/explain/intent_builder.py
Normal file
102
app/modules/rag/explain/intent_builder.py
Normal file
@@ -0,0 +1,102 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from app.modules.rag.explain.models import ExplainHints, ExplainIntent
|
||||
from app.modules.rag.retrieval.query_terms import extract_query_terms
|
||||
|
||||
|
||||
class ExplainIntentBuilder:
|
||||
_ROUTE_RE = re.compile(r"(/[A-Za-z0-9_./{}:-]+)")
|
||||
_FILE_RE = re.compile(r"([A-Za-z0-9_./-]+\.py)")
|
||||
_SYMBOL_RE = re.compile(r"\b([A-Z][A-Za-z0-9_]*\.[A-Za-z_][A-Za-z0-9_]*|[A-Z][A-Za-z0-9_]{2,}|[a-z_][A-Za-z0-9_]{2,})\b")
|
||||
_COMMAND_RE = re.compile(r"`([A-Za-z0-9:_-]+)`")
|
||||
_TEST_KEYWORDS = (
|
||||
"тест",
|
||||
"tests",
|
||||
"test ",
|
||||
"unit-test",
|
||||
"unit test",
|
||||
"юнит-тест",
|
||||
"pytest",
|
||||
"spec",
|
||||
"как покрыто тестами",
|
||||
"как проверяется",
|
||||
"how is it tested",
|
||||
"how it's tested",
|
||||
)
|
||||
|
||||
def build(self, user_query: str) -> ExplainIntent:
|
||||
normalized = " ".join((user_query or "").split())
|
||||
lowered = normalized.lower()
|
||||
keywords = self._keywords(normalized)
|
||||
hints = ExplainHints(
|
||||
paths=self._dedupe(self._FILE_RE.findall(normalized)),
|
||||
symbols=self._symbols(normalized),
|
||||
endpoints=self._dedupe(self._ROUTE_RE.findall(normalized)),
|
||||
commands=self._commands(normalized, lowered),
|
||||
)
|
||||
return ExplainIntent(
|
||||
raw_query=user_query,
|
||||
normalized_query=normalized,
|
||||
keywords=keywords[:12],
|
||||
hints=hints,
|
||||
include_tests=self._include_tests(lowered),
|
||||
expected_entry_types=self._entry_types(lowered, hints),
|
||||
depth=self._depth(lowered),
|
||||
)
|
||||
|
||||
def _keywords(self, text: str) -> list[str]:
|
||||
keywords = extract_query_terms(text)
|
||||
for token in self._symbols(text):
|
||||
if token not in keywords:
|
||||
keywords.append(token)
|
||||
for token in self._ROUTE_RE.findall(text):
|
||||
if token not in keywords:
|
||||
keywords.append(token)
|
||||
return self._dedupe(keywords)
|
||||
|
||||
def _symbols(self, text: str) -> list[str]:
|
||||
values = []
|
||||
for raw in self._SYMBOL_RE.findall(text):
|
||||
token = raw.strip()
|
||||
if len(token) < 3:
|
||||
continue
|
||||
if token.endswith(".py"):
|
||||
continue
|
||||
values.append(token)
|
||||
return self._dedupe(values)
|
||||
|
||||
def _commands(self, text: str, lowered: str) -> list[str]:
|
||||
values = list(self._COMMAND_RE.findall(text))
|
||||
if " command " in f" {lowered} ":
|
||||
values.extend(re.findall(r"command\s+([A-Za-z0-9:_-]+)", lowered))
|
||||
if " cli " in f" {lowered} ":
|
||||
values.extend(re.findall(r"cli\s+([A-Za-z0-9:_-]+)", lowered))
|
||||
return self._dedupe(values)
|
||||
|
||||
def _entry_types(self, lowered: str, hints: ExplainHints) -> list[str]:
|
||||
if hints.endpoints or any(token in lowered for token in ("endpoint", "route", "handler", "http", "api")):
|
||||
return ["http"]
|
||||
if hints.commands or any(token in lowered for token in ("cli", "command", "click", "typer")):
|
||||
return ["cli"]
|
||||
return ["http", "cli"]
|
||||
|
||||
def _depth(self, lowered: str) -> str:
|
||||
if any(token in lowered for token in ("deep", "подроб", "деталь", "full flow", "trace")):
|
||||
return "deep"
|
||||
if any(token in lowered for token in ("high level", "overview", "кратко", "summary")):
|
||||
return "high"
|
||||
return "medium"
|
||||
|
||||
def _include_tests(self, lowered: str) -> bool:
|
||||
normalized = f" {lowered} "
|
||||
return any(token in normalized for token in self._TEST_KEYWORDS)
|
||||
|
||||
def _dedupe(self, values: list[str]) -> list[str]:
|
||||
result: list[str] = []
|
||||
for value in values:
|
||||
item = value.strip()
|
||||
if item and item not in result:
|
||||
result.append(item)
|
||||
return result
|
||||
289
app/modules/rag/explain/layered_gateway.py
Normal file
289
app/modules/rag/explain/layered_gateway.py
Normal file
@@ -0,0 +1,289 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Callable
|
||||
|
||||
from app.modules.rag.explain.models import CodeLocation, LayeredRetrievalItem
|
||||
from app.modules.rag.retrieval.test_filter import build_test_filters, debug_disable_test_filter
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from app.modules.rag.persistence.repository import RagRepository
|
||||
from app.modules.rag_session.embedding.gigachat_embedder import GigaChatEmbedder
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class LayerRetrievalResult:
|
||||
items: list[LayeredRetrievalItem]
|
||||
missing: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
class LayeredRetrievalGateway:
|
||||
def __init__(self, repository: RagRepository, embedder: GigaChatEmbedder) -> None:
|
||||
self._repository = repository
|
||||
self._embedder = embedder
|
||||
|
||||
def retrieve_layer(
|
||||
self,
|
||||
rag_session_id: str,
|
||||
query: str,
|
||||
layer: str,
|
||||
*,
|
||||
limit: int,
|
||||
path_prefixes: list[str] | None = None,
|
||||
exclude_tests: bool = True,
|
||||
prefer_non_tests: bool = False,
|
||||
include_spans: bool = False,
|
||||
) -> LayerRetrievalResult:
|
||||
effective_exclude_tests = exclude_tests and not debug_disable_test_filter()
|
||||
filter_args = self._filter_args(effective_exclude_tests)
|
||||
query_embedding: list[float] | None = None
|
||||
try:
|
||||
query_embedding = self._embedder.embed([query])[0]
|
||||
rows = self._repository.retrieve(
|
||||
rag_session_id,
|
||||
query_embedding,
|
||||
query_text=query,
|
||||
limit=limit,
|
||||
layers=[layer],
|
||||
path_prefixes=path_prefixes,
|
||||
exclude_path_prefixes=filter_args["exclude_path_prefixes"],
|
||||
exclude_like_patterns=filter_args["exclude_like_patterns"],
|
||||
prefer_non_tests=prefer_non_tests or not effective_exclude_tests,
|
||||
)
|
||||
return self._success_result(
|
||||
rows,
|
||||
rag_session_id=rag_session_id,
|
||||
label="layered retrieval",
|
||||
include_spans=include_spans,
|
||||
layer=layer,
|
||||
exclude_tests=effective_exclude_tests,
|
||||
path_prefixes=path_prefixes,
|
||||
)
|
||||
except Exception as exc:
|
||||
if query_embedding is None:
|
||||
self._log_failure(
|
||||
label="layered retrieval",
|
||||
rag_session_id=rag_session_id,
|
||||
layer=layer,
|
||||
exclude_tests=effective_exclude_tests,
|
||||
path_prefixes=path_prefixes,
|
||||
exc=exc,
|
||||
)
|
||||
return LayerRetrievalResult(items=[], missing=[self._failure_missing(f"layer:{layer} retrieval_failed", exc)])
|
||||
retry_result = self._retry_without_test_filter(
|
||||
operation=lambda: self._repository.retrieve(
|
||||
rag_session_id,
|
||||
query_embedding,
|
||||
query_text=query,
|
||||
limit=limit,
|
||||
layers=[layer],
|
||||
path_prefixes=path_prefixes,
|
||||
exclude_path_prefixes=None,
|
||||
exclude_like_patterns=None,
|
||||
prefer_non_tests=True,
|
||||
),
|
||||
label="layered retrieval",
|
||||
rag_session_id=rag_session_id,
|
||||
include_spans=include_spans,
|
||||
layer=layer,
|
||||
exclude_tests=effective_exclude_tests,
|
||||
path_prefixes=path_prefixes,
|
||||
exc=exc,
|
||||
missing_prefix=f"layer:{layer} retrieval_failed",
|
||||
)
|
||||
if retry_result is not None:
|
||||
return retry_result
|
||||
return LayerRetrievalResult(items=[], missing=[self._failure_missing(f"layer:{layer} retrieval_failed", exc)])
|
||||
|
||||
def retrieve_lexical_code(
|
||||
self,
|
||||
rag_session_id: str,
|
||||
query: str,
|
||||
*,
|
||||
limit: int,
|
||||
path_prefixes: list[str] | None = None,
|
||||
exclude_tests: bool = True,
|
||||
include_spans: bool = False,
|
||||
) -> LayerRetrievalResult:
|
||||
effective_exclude_tests = exclude_tests and not debug_disable_test_filter()
|
||||
filter_args = self._filter_args(effective_exclude_tests)
|
||||
try:
|
||||
rows = self._repository.retrieve_lexical_code(
|
||||
rag_session_id,
|
||||
query_text=query,
|
||||
limit=limit,
|
||||
path_prefixes=path_prefixes,
|
||||
exclude_path_prefixes=filter_args["exclude_path_prefixes"],
|
||||
exclude_like_patterns=filter_args["exclude_like_patterns"],
|
||||
prefer_non_tests=not effective_exclude_tests,
|
||||
)
|
||||
return self._success_result(
|
||||
rows,
|
||||
rag_session_id=rag_session_id,
|
||||
label="lexical retrieval",
|
||||
include_spans=include_spans,
|
||||
exclude_tests=effective_exclude_tests,
|
||||
path_prefixes=path_prefixes,
|
||||
)
|
||||
except Exception as exc:
|
||||
retry_result = self._retry_without_test_filter(
|
||||
operation=lambda: self._repository.retrieve_lexical_code(
|
||||
rag_session_id,
|
||||
query_text=query,
|
||||
limit=limit,
|
||||
path_prefixes=path_prefixes,
|
||||
exclude_path_prefixes=None,
|
||||
exclude_like_patterns=None,
|
||||
prefer_non_tests=True,
|
||||
),
|
||||
label="lexical retrieval",
|
||||
rag_session_id=rag_session_id,
|
||||
include_spans=include_spans,
|
||||
exclude_tests=effective_exclude_tests,
|
||||
path_prefixes=path_prefixes,
|
||||
exc=exc,
|
||||
missing_prefix="layer:C0 lexical_retrieval_failed",
|
||||
)
|
||||
if retry_result is not None:
|
||||
return retry_result
|
||||
return LayerRetrievalResult(items=[], missing=[self._failure_missing("layer:C0 lexical_retrieval_failed", exc)])
|
||||
|
||||
def _retry_without_test_filter(
|
||||
self,
|
||||
*,
|
||||
operation: Callable[[], list[dict]],
|
||||
label: str,
|
||||
rag_session_id: str,
|
||||
include_spans: bool,
|
||||
exclude_tests: bool,
|
||||
path_prefixes: list[str] | None,
|
||||
exc: Exception,
|
||||
missing_prefix: str,
|
||||
layer: str | None = None,
|
||||
) -> LayerRetrievalResult | None:
|
||||
if not exclude_tests:
|
||||
self._log_failure(
|
||||
label=label,
|
||||
rag_session_id=rag_session_id,
|
||||
layer=layer,
|
||||
exclude_tests=exclude_tests,
|
||||
path_prefixes=path_prefixes,
|
||||
exc=exc,
|
||||
)
|
||||
return None
|
||||
self._log_failure(
|
||||
label=label,
|
||||
rag_session_id=rag_session_id,
|
||||
layer=layer,
|
||||
exclude_tests=exclude_tests,
|
||||
path_prefixes=path_prefixes,
|
||||
exc=exc,
|
||||
retried_without_test_filter=True,
|
||||
)
|
||||
try:
|
||||
rows = operation()
|
||||
except Exception as retry_exc:
|
||||
self._log_failure(
|
||||
label=f"{label} retry",
|
||||
rag_session_id=rag_session_id,
|
||||
layer=layer,
|
||||
exclude_tests=False,
|
||||
path_prefixes=path_prefixes,
|
||||
exc=retry_exc,
|
||||
)
|
||||
return None
|
||||
result = self._success_result(
|
||||
rows,
|
||||
rag_session_id=rag_session_id,
|
||||
label=f"{label} retry",
|
||||
include_spans=include_spans,
|
||||
layer=layer,
|
||||
exclude_tests=False,
|
||||
path_prefixes=path_prefixes,
|
||||
)
|
||||
result.missing.append(f"{missing_prefix}:retried_without_test_filter")
|
||||
return result
|
||||
|
||||
def _success_result(
|
||||
self,
|
||||
rows: list[dict],
|
||||
*,
|
||||
rag_session_id: str,
|
||||
label: str,
|
||||
include_spans: bool,
|
||||
exclude_tests: bool,
|
||||
path_prefixes: list[str] | None,
|
||||
layer: str | None = None,
|
||||
) -> LayerRetrievalResult:
|
||||
items = [self._to_item(row, include_spans=include_spans) for row in rows]
|
||||
LOGGER.warning(
|
||||
"%s: rag_session_id=%s layer=%s exclude_tests=%s path_prefixes=%s returned_count=%s top_paths=%s",
|
||||
label,
|
||||
rag_session_id,
|
||||
layer,
|
||||
exclude_tests,
|
||||
path_prefixes or [],
|
||||
len(items),
|
||||
[item.source for item in items[:3]],
|
||||
)
|
||||
return LayerRetrievalResult(items=items)
|
||||
|
||||
def _log_failure(
|
||||
self,
|
||||
*,
|
||||
label: str,
|
||||
rag_session_id: str,
|
||||
exclude_tests: bool,
|
||||
path_prefixes: list[str] | None,
|
||||
exc: Exception,
|
||||
layer: str | None = None,
|
||||
retried_without_test_filter: bool = False,
|
||||
) -> None:
|
||||
LOGGER.warning(
|
||||
"%s failed: rag_session_id=%s layer=%s exclude_tests=%s path_prefixes=%s retried_without_test_filter=%s error=%s",
|
||||
label,
|
||||
rag_session_id,
|
||||
layer,
|
||||
exclude_tests,
|
||||
path_prefixes or [],
|
||||
retried_without_test_filter,
|
||||
self._exception_summary(exc),
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
def _filter_args(self, exclude_tests: bool) -> dict[str, list[str] | None]:
|
||||
test_filters = build_test_filters() if exclude_tests else None
|
||||
return {
|
||||
"exclude_path_prefixes": test_filters.exclude_path_prefixes if test_filters else None,
|
||||
"exclude_like_patterns": test_filters.exclude_like_patterns if test_filters else None,
|
||||
}
|
||||
|
||||
def _failure_missing(self, prefix: str, exc: Exception) -> str:
|
||||
return f"{prefix}:{self._exception_summary(exc)}"
|
||||
|
||||
def _exception_summary(self, exc: Exception) -> str:
|
||||
message = " ".join(str(exc).split())
|
||||
if len(message) > 180:
|
||||
message = message[:177] + "..."
|
||||
return f"{type(exc).__name__}:{message or 'no_message'}"
|
||||
|
||||
def _to_item(self, row: dict, *, include_spans: bool) -> LayeredRetrievalItem:
|
||||
location = None
|
||||
if include_spans:
|
||||
location = CodeLocation(
|
||||
path=str(row.get("path") or ""),
|
||||
start_line=row.get("span_start"),
|
||||
end_line=row.get("span_end"),
|
||||
)
|
||||
return LayeredRetrievalItem(
|
||||
source=str(row.get("path") or ""),
|
||||
content=str(row.get("content") or ""),
|
||||
layer=str(row.get("layer") or ""),
|
||||
title=str(row.get("title") or ""),
|
||||
metadata=dict(row.get("metadata", {}) or {}),
|
||||
score=row.get("distance"),
|
||||
location=location,
|
||||
)
|
||||
91
app/modules/rag/explain/models.py
Normal file
91
app/modules/rag/explain/models.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
|
||||
class ExplainHints(BaseModel):
|
||||
model_config = ConfigDict(extra="forbid")
|
||||
|
||||
paths: list[str] = Field(default_factory=list)
|
||||
symbols: list[str] = Field(default_factory=list)
|
||||
endpoints: list[str] = Field(default_factory=list)
|
||||
commands: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class ExplainIntent(BaseModel):
|
||||
model_config = ConfigDict(extra="forbid")
|
||||
|
||||
raw_query: str
|
||||
normalized_query: str
|
||||
keywords: list[str] = Field(default_factory=list)
|
||||
hints: ExplainHints = Field(default_factory=ExplainHints)
|
||||
include_tests: bool = False
|
||||
expected_entry_types: list[Literal["http", "cli"]] = Field(default_factory=list)
|
||||
depth: Literal["high", "medium", "deep"] = "medium"
|
||||
|
||||
|
||||
class CodeLocation(BaseModel):
|
||||
model_config = ConfigDict(extra="forbid")
|
||||
|
||||
path: str
|
||||
start_line: int | None = None
|
||||
end_line: int | None = None
|
||||
|
||||
|
||||
class LayeredRetrievalItem(BaseModel):
|
||||
model_config = ConfigDict(extra="forbid")
|
||||
|
||||
source: str
|
||||
content: str
|
||||
layer: str
|
||||
title: str
|
||||
metadata: dict[str, Any] = Field(default_factory=dict)
|
||||
score: float | None = None
|
||||
location: CodeLocation | None = None
|
||||
|
||||
|
||||
class TracePath(BaseModel):
|
||||
model_config = ConfigDict(extra="forbid")
|
||||
|
||||
symbol_ids: list[str] = Field(default_factory=list)
|
||||
score: float = 0.0
|
||||
entrypoint_id: str | None = None
|
||||
notes: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class EvidenceItem(BaseModel):
|
||||
model_config = ConfigDict(extra="forbid")
|
||||
|
||||
evidence_id: str
|
||||
kind: Literal["entrypoint", "symbol", "edge", "excerpt"]
|
||||
summary: str
|
||||
location: CodeLocation | None = None
|
||||
supports: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class CodeExcerpt(BaseModel):
|
||||
model_config = ConfigDict(extra="forbid")
|
||||
|
||||
evidence_id: str
|
||||
symbol_id: str | None = None
|
||||
title: str
|
||||
path: str
|
||||
start_line: int | None = None
|
||||
end_line: int | None = None
|
||||
content: str
|
||||
focus: str = "overview"
|
||||
|
||||
|
||||
class ExplainPack(BaseModel):
    """Aggregated retrieval result handed to the answer-composition stage."""

    model_config = ConfigDict(extra="forbid")

    intent: ExplainIntent
    selected_entrypoints: list[LayeredRetrievalItem] = Field(default_factory=list)
    seed_symbols: list[LayeredRetrievalItem] = Field(default_factory=list)
    trace_paths: list[TracePath] = Field(default_factory=list)
    # Evidence records keyed by evidence_id (entrypoint_N / symbol_N / excerpt_N).
    evidence_index: dict[str, EvidenceItem] = Field(default_factory=dict)
    code_excerpts: list[CodeExcerpt] = Field(default_factory=list)
    # Markers for pipeline stages or layers that produced no data.
    missing: list[str] = Field(default_factory=list)
    conflicts: list[str] = Field(default_factory=list)
||||
328
app/modules/rag/explain/retriever_v2.py
Normal file
328
app/modules/rag/explain/retriever_v2.py
Normal file
@@ -0,0 +1,328 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from app.modules.rag.contracts.enums import RagLayer
|
||||
from app.modules.rag.explain.intent_builder import ExplainIntentBuilder
|
||||
from app.modules.rag.explain.layered_gateway import LayerRetrievalResult, LayeredRetrievalGateway
|
||||
from app.modules.rag.explain.models import CodeExcerpt, EvidenceItem, ExplainPack, LayeredRetrievalItem
|
||||
from app.modules.rag.explain.source_excerpt_fetcher import SourceExcerptFetcher
|
||||
from app.modules.rag.explain.trace_builder import TraceBuilder
|
||||
from app.modules.rag.retrieval.test_filter import exclude_tests_default, is_test_path
|
||||
|
||||
LOGGER = logging.getLogger(__name__)

# Minimum number of code excerpts considered sufficient; below this the
# test-inclusive lexical fallback is triggered.
_MIN_EXCERPTS = 2
||||
|
||||
if TYPE_CHECKING:
|
||||
from app.modules.rag.explain.graph_repository import CodeGraphRepository
|
||||
from app.modules.rag.explain.models import ExplainIntent
|
||||
|
||||
|
||||
class CodeExplainRetrieverV2:
    """Builds an :class:`ExplainPack` for a code-explanation query.

    Pipeline: parse the query into an ``ExplainIntent``; retrieve entrypoints
    (layer C3) and seed symbols (layer C1) through the layered gateway; expand
    the seeds into call/import trace paths over the code graph; fetch source
    excerpts for the traced symbols.  Falls back to lexical code retrieval
    (layer C0) when the graph yields no excerpts, and to a test-inclusive
    lexical pass when test exclusion left fewer than ``_MIN_EXCERPTS``
    excerpts.
    """

    def __init__(
        self,
        gateway: LayeredRetrievalGateway,
        graph_repository: CodeGraphRepository,
        intent_builder: ExplainIntentBuilder | None = None,
        trace_builder: TraceBuilder | None = None,
        excerpt_fetcher: SourceExcerptFetcher | None = None,
    ) -> None:
        # Optional collaborators default to implementations backed by the
        # same graph repository.
        self._gateway = gateway
        self._graph = graph_repository
        self._intent_builder = intent_builder or ExplainIntentBuilder()
        self._trace_builder = trace_builder or TraceBuilder(graph_repository)
        self._excerpt_fetcher = excerpt_fetcher or SourceExcerptFetcher(graph_repository)

    def build_pack(
        self,
        rag_session_id: str,
        user_query: str,
        *,
        file_candidates: list[dict] | None = None,
    ) -> ExplainPack:
        """Run the full retrieval pipeline and return the resulting pack.

        ``file_candidates`` (dicts expected to carry a ``path`` key) narrow
        the search to their directories alongside any paths hinted in the
        query itself.
        """
        intent = self._intent_builder.build(user_query)
        path_prefixes = _path_prefixes(intent, file_candidates or [])
        # Tests are excluded unless the user asked for them or the global
        # default already disables exclusion.
        exclude_tests = exclude_tests_default() and not intent.include_tests
        pack = self._run_pass(rag_session_id, intent, path_prefixes, exclude_tests=exclude_tests)
        if exclude_tests and len(pack.code_excerpts) < _MIN_EXCERPTS:
            # Too few excerpts after excluding tests: retry lexically with
            # test files allowed and merge the extra hits into the pack.
            self._merge_test_fallback(pack, rag_session_id, intent, path_prefixes)
        self._log_pack(rag_session_id, pack)
        return pack

    def _run_pass(
        self,
        rag_session_id: str,
        intent: ExplainIntent,
        path_prefixes: list[str],
        *,
        exclude_tests: bool,
    ) -> ExplainPack:
        """Execute one full retrieval pass and assemble an ``ExplainPack``."""
        missing: list[str] = []
        entrypoints_result = self._entrypoints(rag_session_id, intent, path_prefixes, exclude_tests=exclude_tests)
        missing.extend(entrypoints_result.missing)
        selected_entrypoints = self._filter_entrypoints(intent, entrypoints_result.items)
        if not selected_entrypoints:
            missing.append("layer:C3 empty")
        seed_result = self._seed_symbols(rag_session_id, intent, path_prefixes, selected_entrypoints, exclude_tests=exclude_tests)
        missing.extend(seed_result.missing)
        seed_symbols = seed_result.items
        if not seed_symbols:
            missing.append("layer:C1 empty")
        # Depth budget for graph traversal; "deep" > "medium" > anything else
        # (i.e. "high" gets the shallowest traversal).
        depth = 4 if intent.depth == "deep" else 3 if intent.depth == "medium" else 2
        trace_paths = self._trace_builder.build_paths(rag_session_id, seed_symbols, max_depth=depth) if seed_symbols else []
        excerpts, excerpt_evidence = self._excerpt_fetcher.fetch(rag_session_id, trace_paths) if trace_paths else ([], {})
        if not excerpts:
            # Graph-driven excerpting produced nothing: fall back to a plain
            # lexical search over code (layer C0).
            lexical_result = self._gateway.retrieve_lexical_code(
                rag_session_id,
                intent.normalized_query,
                limit=6,
                path_prefixes=path_prefixes or None,
                exclude_tests=exclude_tests,
                include_spans=True,
            )
            missing.extend(lexical_result.missing)
            excerpts, excerpt_evidence = _lexical_excerpts(lexical_result.items)
        if not excerpts:
            missing.append("layer:C0 empty")
        evidence_index = _evidence_index(selected_entrypoints, seed_symbols)
        evidence_index.update(excerpt_evidence)
        missing.extend(_missing(selected_entrypoints, seed_symbols, trace_paths, excerpts))
        return ExplainPack(
            intent=intent,
            selected_entrypoints=selected_entrypoints,
            seed_symbols=seed_symbols,
            trace_paths=trace_paths,
            evidence_index=evidence_index,
            code_excerpts=excerpts,
            missing=_cleanup_missing(_dedupe(missing), has_excerpts=bool(excerpts)),
            conflicts=[],
        )

    def _merge_test_fallback(
        self,
        pack: ExplainPack,
        rag_session_id: str,
        intent: ExplainIntent,
        path_prefixes: list[str],
    ) -> None:
        """Augment a sparse pack with lexical hits that may include test files."""
        lexical_result = self._gateway.retrieve_lexical_code(
            rag_session_id,
            intent.normalized_query,
            limit=6,
            path_prefixes=path_prefixes or None,
            exclude_tests=False,
            include_spans=True,
        )
        # Continue excerpt numbering after the excerpt ids already in the pack.
        excerpt_offset = len([key for key in pack.evidence_index if key.startswith("excerpt_")])
        excerpts, evidence = _lexical_excerpts(
            lexical_result.items,
            start_index=excerpt_offset,
            is_test_fallback=True,
        )
        if not excerpts:
            pack.missing = _dedupe(pack.missing + lexical_result.missing)
            return
        # Skip excerpts that duplicate content already present in the pack.
        seen = {(item.path, item.start_line, item.end_line, item.content) for item in pack.code_excerpts}
        for excerpt in excerpts:
            key = (excerpt.path, excerpt.start_line, excerpt.end_line, excerpt.content)
            if key in seen:
                continue
            pack.code_excerpts.append(excerpt)
            seen.add(key)
        pack.evidence_index.update(evidence)
        pack.missing = _cleanup_missing(_dedupe(pack.missing + lexical_result.missing), has_excerpts=bool(pack.code_excerpts))

    def _entrypoints(
        self,
        rag_session_id: str,
        intent: ExplainIntent,
        path_prefixes: list[str],
        *,
        exclude_tests: bool,
    ) -> LayerRetrievalResult:
        """Retrieve entrypoint candidates from the entrypoints layer (C3)."""
        return self._gateway.retrieve_layer(
            rag_session_id,
            intent.normalized_query,
            RagLayer.CODE_ENTRYPOINTS,
            limit=6,
            path_prefixes=path_prefixes or None,
            exclude_tests=exclude_tests,
            prefer_non_tests=True,
            include_spans=True,
        )

    def _filter_entrypoints(self, intent: ExplainIntent, items: list[LayeredRetrievalItem]) -> list[LayeredRetrievalItem]:
        """Keep at most three entrypoints, preferring expected entry types."""
        if not intent.expected_entry_types:
            return items[:3]
        filtered = [item for item in items if str(item.metadata.get("entry_type") or "") in intent.expected_entry_types]
        # Fall back to the unfiltered head when nothing matches the expectation.
        return filtered[:3] or items[:3]

    def _seed_symbols(
        self,
        rag_session_id: str,
        intent: ExplainIntent,
        path_prefixes: list[str],
        entrypoints: list[LayeredRetrievalItem],
        *,
        exclude_tests: bool,
    ) -> LayerRetrievalResult:
        """Collect up to eight unique seed symbols: handlers first, then C1 hits."""
        symbol_result = self._gateway.retrieve_layer(
            rag_session_id,
            intent.normalized_query,
            RagLayer.CODE_SYMBOL_CATALOG,
            limit=12,
            path_prefixes=path_prefixes or None,
            exclude_tests=exclude_tests,
            prefer_non_tests=True,
            include_spans=True,
        )
        # Handler symbols referenced by the selected entrypoints are
        # prioritized ahead of plain symbol-catalog matches.
        handlers: list[LayeredRetrievalItem] = []
        handler_ids = [str(item.metadata.get("handler_symbol_id") or "") for item in entrypoints]
        if handler_ids:
            handlers = self._graph.get_symbols_by_ids(rag_session_id, [item for item in handler_ids if item])
        seeds: list[LayeredRetrievalItem] = []
        seen: set[str] = set()
        for item in handlers + symbol_result.items:
            symbol_id = str(item.metadata.get("symbol_id") or "")
            if not symbol_id or symbol_id in seen:
                continue
            seen.add(symbol_id)
            seeds.append(item)
            if len(seeds) >= 8:
                break
        return LayerRetrievalResult(items=seeds, missing=list(symbol_result.missing))

    def _log_pack(self, rag_session_id: str, pack: ExplainPack) -> None:
        """Log pack composition, splitting excerpt counts into prod vs test."""
        prod_excerpt_count = len([excerpt for excerpt in pack.code_excerpts if not _is_test_excerpt(excerpt)])
        test_excerpt_count = len(pack.code_excerpts) - prod_excerpt_count
        # NOTE(review): emitted at WARNING level although the message is
        # informational — confirm the level is intentional.
        LOGGER.warning(
            "code explain pack: rag_session_id=%s entrypoints=%s seeds=%s paths=%s excerpts=%s prod_excerpt_count=%s test_excerpt_count=%s missing=%s",
            rag_session_id,
            len(pack.selected_entrypoints),
            len(pack.seed_symbols),
            len(pack.trace_paths),
            len(pack.code_excerpts),
            prod_excerpt_count,
            test_excerpt_count,
            pack.missing,
        )
|
||||
|
||||
def _evidence_index(
    entrypoints: list[LayeredRetrievalItem],
    seed_symbols: list[LayeredRetrievalItem],
) -> dict[str, EvidenceItem]:
    """Index entrypoint and seed-symbol items as evidence records.

    Ids are sequential per kind: ``entrypoint_1..N`` then ``symbol_1..N``.
    """
    index: dict[str, EvidenceItem] = {}
    groups = (
        ("entrypoint", "handler_symbol_id", entrypoints),
        ("symbol", "symbol_id", seed_symbols),
    )
    for kind, id_key, items in groups:
        for position, item in enumerate(items, start=1):
            eid = f"{kind}_{position}"
            index[eid] = EvidenceItem(
                evidence_id=eid,
                kind=kind,
                summary=item.title,
                location=item.location,
                # May be [""] when the metadata key is absent — matches the
                # behavior downstream consumers already rely on.
                supports=[str(item.metadata.get(id_key) or "")],
            )
    return index
||||
|
||||
|
||||
def _missing(
|
||||
entrypoints: list[LayeredRetrievalItem],
|
||||
seed_symbols: list[LayeredRetrievalItem],
|
||||
trace_paths,
|
||||
excerpts,
|
||||
) -> list[str]:
|
||||
missing: list[str] = []
|
||||
if not entrypoints:
|
||||
missing.append("entrypoints")
|
||||
if not seed_symbols:
|
||||
missing.append("seed_symbols")
|
||||
if not trace_paths:
|
||||
missing.append("trace_paths")
|
||||
if not excerpts:
|
||||
missing.append("code_excerpts")
|
||||
return missing
|
||||
|
||||
|
||||
def _lexical_excerpts(
    items: list[LayeredRetrievalItem],
    *,
    start_index: int = 0,
    is_test_fallback: bool = False,
) -> tuple[list[CodeExcerpt], dict[str, EvidenceItem]]:
    """Convert lexical retrieval hits into excerpts plus evidence records.

    ``start_index`` offsets the ``excerpt_<n>`` ids so merged fallback
    excerpts do not collide with ids already present in a pack.
    ``is_test_fallback`` is kept for interface compatibility; it previously
    re-assigned ``focus`` to the value it already held (a no-op), so it has
    no effect on the output.
    """
    excerpts: list[CodeExcerpt] = []
    evidence_index: dict[str, EvidenceItem] = {}
    for item in items:
        evidence_id = f"excerpt_{start_index + len(evidence_index) + 1}"
        location = item.location
        evidence_index[evidence_id] = EvidenceItem(
            evidence_id=evidence_id,
            kind="excerpt",
            summary=item.title or item.source,
            location=location,
            supports=[],
        )
        # Mark excerpts originating from test files so they can be
        # down-weighted downstream.
        focus = "test:lexical" if _item_is_test(item) else "lexical"
        excerpts.append(
            CodeExcerpt(
                evidence_id=evidence_id,
                symbol_id=str(item.metadata.get("symbol_id") or "") or None,
                title=item.title or item.source,
                path=item.source,
                start_line=location.start_line if location else None,
                end_line=location.end_line if location else None,
                content=item.content,
                focus=focus,
            )
        )
    return excerpts, evidence_index
||||
|
||||
|
||||
def _item_is_test(item: LayeredRetrievalItem) -> bool:
    """True when the item is flagged as a test or lives under a test path."""
    if item.metadata.get("is_test"):
        return True
    return is_test_path(item.source)
||||
|
||||
|
||||
def _is_test_excerpt(excerpt: CodeExcerpt) -> bool:
    """True when the excerpt was sourced from test code."""
    if excerpt.focus.startswith("test:"):
        return True
    return is_test_path(excerpt.path)
||||
|
||||
|
||||
def _path_prefixes(intent: ExplainIntent, file_candidates: list[dict]) -> list[str]:
|
||||
values: list[str] = []
|
||||
for path in intent.hints.paths:
|
||||
prefix = path.rsplit("/", 1)[0] if "/" in path else path
|
||||
if prefix and prefix not in values:
|
||||
values.append(prefix)
|
||||
for item in file_candidates[:6]:
|
||||
path = str(item.get("path") or "")
|
||||
prefix = path.rsplit("/", 1)[0] if "/" in path else ""
|
||||
if prefix and prefix not in values:
|
||||
values.append(prefix)
|
||||
return values
|
||||
|
||||
|
||||
def _cleanup_missing(values: list[str], *, has_excerpts: bool) -> list[str]:
|
||||
if not has_excerpts:
|
||||
return values
|
||||
return [value for value in values if value not in {"code_excerpts", "layer:C0 empty"}]
|
||||
|
||||
|
||||
def _dedupe(values: list[str]) -> list[str]:
|
||||
result: list[str] = []
|
||||
for value in values:
|
||||
item = value.strip()
|
||||
if item and item not in result:
|
||||
result.append(item)
|
||||
return result
|
||||
53
app/modules/rag/explain/source_excerpt_fetcher.py
Normal file
53
app/modules/rag/explain/source_excerpt_fetcher.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from app.modules.rag.explain.excerpt_planner import ExcerptPlanner
|
||||
from app.modules.rag.explain.models import CodeExcerpt, EvidenceItem, TracePath
|
||||
from app.modules.rag.retrieval.test_filter import is_test_path
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from app.modules.rag.explain.graph_repository import CodeGraphRepository
|
||||
|
||||
|
||||
class SourceExcerptFetcher:
    """Resolves traced symbols into source-code excerpts with evidence records."""

    def __init__(self, graph_repository: CodeGraphRepository, planner: ExcerptPlanner | None = None) -> None:
        self._graph = graph_repository
        self._planner = planner or ExcerptPlanner()

    def fetch(
        self,
        rag_session_id: str,
        trace_paths: list[TracePath],
        *,
        max_excerpts: int = 40,
    ) -> tuple[list[CodeExcerpt], dict[str, EvidenceItem]]:
        """Fetch excerpts for every symbol referenced by ``trace_paths``.

        Symbols are processed in first-appearance order across the paths;
        excerpting stops once ``max_excerpts`` excerpts have been collected.
        Returns the excerpts together with an evidence index keyed by
        ``excerpt_<n>`` ids.
        """
        # Deduplicate symbol ids while preserving their order across paths.
        ordered_symbol_ids: list[str] = []
        for path in trace_paths:
            for symbol_id in path.symbol_ids:
                if symbol_id and symbol_id not in ordered_symbol_ids:
                    ordered_symbol_ids.append(symbol_id)
        chunks = self._graph.get_chunks_by_symbol_ids(rag_session_id, ordered_symbol_ids)
        excerpts: list[CodeExcerpt] = []
        evidence_index: dict[str, EvidenceItem] = {}
        for chunk in chunks:
            symbol_id = str(chunk.metadata.get("symbol_id") or "")
            evidence_id = f"excerpt_{len(evidence_index) + 1}"
            location = chunk.location
            # NOTE(review): the evidence record is created even if the excerpt
            # budget runs out before this chunk contributes any excerpt —
            # confirm dangling evidence entries are acceptable.
            evidence_index[evidence_id] = EvidenceItem(
                evidence_id=evidence_id,
                kind="excerpt",
                summary=chunk.title,
                location=location,
                supports=[symbol_id] if symbol_id else [],
            )
            is_test_chunk = bool(chunk.metadata.get("is_test")) or is_test_path(location.path if location else chunk.source)
            for excerpt in self._planner.plan(chunk, evidence_id=evidence_id, symbol_id=symbol_id):
                if len(excerpts) >= max_excerpts:
                    break
                # Prefix the focus so downstream ranking can identify tests.
                if is_test_chunk and not excerpt.focus.startswith("test:"):
                    excerpt.focus = f"test:{excerpt.focus}"
                excerpts.append(excerpt)
            if len(excerpts) >= max_excerpts:
                break
        return excerpts, evidence_index
|
||||
102
app/modules/rag/explain/trace_builder.py
Normal file
102
app/modules/rag/explain/trace_builder.py
Normal file
@@ -0,0 +1,102 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from app.modules.rag.explain.models import LayeredRetrievalItem, TracePath
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from app.modules.rag.explain.graph_repository import CodeGraphRepository
|
||||
|
||||
|
||||
class TraceBuilder:
    """Expands seed symbols into scored call/import paths over the code graph."""

    def __init__(self, graph_repository: CodeGraphRepository) -> None:
        self._graph = graph_repository

    def build_paths(
        self,
        rag_session_id: str,
        seed_symbols: list[LayeredRetrievalItem],
        *,
        max_depth: int,
        max_paths: int = 3,
        edge_types: list[str] | None = None,
    ) -> list[TracePath]:
        """Breadth-first expand each seed symbol along outgoing graph edges.

        A path is finalized when it hits a dead end, reaches ``max_depth``
        nodes, encounters a cycle, or the destination cannot be resolved.
        Candidates are deduplicated, sorted by score, and the top
        ``max_paths`` returned; if nothing was built, a trivial single-node
        path for the first seed is returned instead.
        """
        edges_filter = edge_types or ["calls", "imports", "inherits"]
        symbol_map = self._symbol_map(seed_symbols)
        paths: list[TracePath] = []
        for seed in seed_symbols:
            seed_id = str(seed.metadata.get("symbol_id") or "")
            if not seed_id:
                continue
            # BFS queue of (path-so-far, accumulated score, notes).
            queue: list[tuple[list[str], float, list[str]]] = [([seed_id], 0.0, [])]
            # Over-collect (3x) so dedup and sorting still have candidates.
            while queue and len(paths) < max_paths * 3:
                current_path, score, notes = queue.pop(0)
                src_symbol_id = current_path[-1]
                out_edges = self._graph.get_out_edges(rag_session_id, [src_symbol_id], edges_filter, limit_per_src=4)
                if not out_edges or len(current_path) >= max_depth:
                    # Dead end or depth budget reached: finalize this path.
                    paths.append(TracePath(symbol_ids=current_path, score=score, notes=notes))
                    continue
                for edge in out_edges:
                    metadata = edge.metadata
                    dst_symbol_id = str(metadata.get("dst_symbol_id") or "")
                    next_notes = list(notes)
                    next_score = score + self._edge_score(edge, symbol_map.get(src_symbol_id))
                    if not dst_symbol_id:
                        # Unresolved edge: try resolving the textual reference
                        # within the source symbol's parent package.
                        dst_ref = str(metadata.get("dst_ref") or "")
                        package_hint = self._package_hint(symbol_map.get(src_symbol_id))
                        resolved = self._graph.resolve_symbol_by_ref(rag_session_id, dst_ref, package_hint=package_hint)
                        if resolved is not None:
                            dst_symbol_id = str(resolved.metadata.get("symbol_id") or "")
                            symbol_map[dst_symbol_id] = resolved
                            next_score += 2.0
                            next_notes.append(f"resolved:{dst_ref}")
                    if not dst_symbol_id or dst_symbol_id in current_path:
                        # Unresolvable target or a cycle: finalize without extending.
                        paths.append(TracePath(symbol_ids=current_path, score=next_score, notes=next_notes))
                        continue
                    if dst_symbol_id not in symbol_map:
                        symbols = self._graph.get_symbols_by_ids(rag_session_id, [dst_symbol_id])
                        if symbols:
                            symbol_map[dst_symbol_id] = symbols[0]
                    queue.append((current_path + [dst_symbol_id], next_score, next_notes))
        unique = self._unique_paths(paths)
        unique.sort(key=lambda item: item.score, reverse=True)
        return unique[:max_paths] or [TracePath(symbol_ids=[seed.metadata.get("symbol_id", "")], score=0.0) for seed in seed_symbols[:1]]

    def _edge_score(self, edge: LayeredRetrievalItem, source_symbol: LayeredRetrievalItem | None) -> float:
        """Score an edge: resolved edges and same-file hops up, test files down."""
        metadata = edge.metadata
        score = 1.0
        if str(metadata.get("resolution") or "") == "resolved":
            score += 2.0
        source_path = source_symbol.source if source_symbol is not None else ""
        if source_path and edge.source == source_path:
            score += 1.0
        if "tests/" in edge.source or "/tests/" in edge.source:
            score -= 3.0
        return score

    def _package_hint(self, symbol: LayeredRetrievalItem | None) -> str | None:
        """Parent package of the symbol's module, used when resolving bare refs."""
        if symbol is None:
            return None
        package = str(symbol.metadata.get("package_or_module") or "")
        if not package:
            return None
        # Single-segment modules fall back to the module itself.
        return ".".join(package.split(".")[:-1]) or package

    def _symbol_map(self, items: list[LayeredRetrievalItem]) -> dict[str, LayeredRetrievalItem]:
        """Map symbol_id -> item for items that carry a symbol id."""
        result: dict[str, LayeredRetrievalItem] = {}
        for item in items:
            symbol_id = str(item.metadata.get("symbol_id") or "")
            if symbol_id:
                result[symbol_id] = item
        return result

    def _unique_paths(self, items: list[TracePath]) -> list[TracePath]:
        """Drop paths whose non-empty symbol sequence was already seen."""
        result: list[TracePath] = []
        seen: set[tuple[str, ...]] = set()
        for item in items:
            key = tuple(symbol_id for symbol_id in item.symbol_ids if symbol_id)
            if not key or key in seen:
                continue
            seen.add(key)
            result.append(item)
        return result
|
||||
Binary file not shown.
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
|
||||
from app.modules.rag.indexing.code.code_text.chunker import CodeChunk
|
||||
from app.modules.rag.retrieval.test_filter import is_test_path
|
||||
|
||||
|
||||
class CodeTextDocumentBuilder:
|
||||
@@ -17,6 +18,7 @@ class CodeTextDocumentBuilder:
|
||||
"chunk_index": chunk_index,
|
||||
"chunk_type": chunk.chunk_type,
|
||||
"module_or_unit": source.path.replace("/", ".").removesuffix(".py"),
|
||||
"is_test": is_test_path(source.path),
|
||||
"artifact_type": "CODE",
|
||||
},
|
||||
)
|
||||
|
||||
Binary file not shown.
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
|
||||
from app.modules.rag.indexing.code.edges.extractor import PyEdge
|
||||
from app.modules.rag.retrieval.test_filter import is_test_path
|
||||
|
||||
|
||||
class EdgeDocumentBuilder:
|
||||
@@ -22,6 +23,7 @@ class EdgeDocumentBuilder:
|
||||
"dst_symbol_id": edge.dst_symbol_id,
|
||||
"dst_ref": edge.dst_ref,
|
||||
"resolution": edge.resolution,
|
||||
"is_test": is_test_path(source.path),
|
||||
"lang_payload": edge.metadata,
|
||||
"artifact_type": "CODE",
|
||||
},
|
||||
|
||||
Binary file not shown.
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan
|
||||
from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint
|
||||
from app.modules.rag.retrieval.test_filter import is_test_path
|
||||
|
||||
|
||||
class EntrypointDocumentBuilder:
|
||||
@@ -19,6 +20,7 @@ class EntrypointDocumentBuilder:
|
||||
"framework": entrypoint.framework,
|
||||
"route_or_command": entrypoint.route_or_command,
|
||||
"handler_symbol_id": entrypoint.handler_symbol_id,
|
||||
"is_test": is_test_path(source.path),
|
||||
"lang_payload": entrypoint.metadata,
|
||||
"artifact_type": "CODE",
|
||||
},
|
||||
|
||||
Binary file not shown.
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan
|
||||
from app.modules.rag.indexing.code.symbols.extractor import PySymbol
|
||||
from app.modules.rag.retrieval.test_filter import is_test_path
|
||||
|
||||
|
||||
class SymbolDocumentBuilder:
|
||||
@@ -26,6 +27,7 @@ class SymbolDocumentBuilder:
|
||||
"parent_symbol_id": symbol.parent_symbol_id,
|
||||
"package_or_module": source.path.replace("/", ".").removesuffix(".py"),
|
||||
"is_entry_candidate": bool(symbol.decorators),
|
||||
"is_test": is_test_path(source.path),
|
||||
"lang_payload": symbol.lang_payload,
|
||||
"artifact_type": "CODE",
|
||||
},
|
||||
|
||||
Binary file not shown.
Binary file not shown.
201
app/modules/rag/intent_router.md
Normal file
201
app/modules/rag/intent_router.md
Normal file
@@ -0,0 +1,201 @@
|
||||
# Intent Router Specification (MVP) — v1.1
|
||||
Version: 1.1
|
||||
Scope: Routing + query normalization + anchor extraction for layered RAG (CODE + DOCS)
|
||||
|
||||
---
|
||||
|
||||
## 1) Цель
|
||||
|
||||
Intent Router принимает:
|
||||
- `user_query: string`
|
||||
- `conversation_state: object`
|
||||
- `repo_context: object` (язык/структура репо/доступные слои)
|
||||
|
||||
И возвращает:
|
||||
- `intent`
|
||||
- `graph_id`
|
||||
- `conversation_mode`
|
||||
- `query_plan` (нормализация + якоря)
|
||||
- `retrieval_spec` (запрос по слоям RAG)
|
||||
- `evidence_policy`
|
||||
|
||||
Router **не делает** retrieval и **не генерирует** ответ.
|
||||
|
||||
---
|
||||
|
||||
## 2) MVP интенты (строго 4)
|
||||
|
||||
- `CODE_QA` — объяснение/поиск по коду
|
||||
- `DOCS_QA` — объяснение/поиск по документации
|
||||
- `GENERATE_DOCS_FROM_CODE` — генерация документации по коду
|
||||
- `PROJECT_MISC` — прочие вопросы по проекту
|
||||
|
||||
---
|
||||
|
||||
## 3) Диалоговый режим (контекст темы)
|
||||
|
||||
### 3.1 Политика
|
||||
Router обязан сохранять intent в рамках темы.
|
||||
|
||||
- Если `conversation_state.active_intent` задан
|
||||
- и нет явного сигнала смены темы
|
||||
- то `intent = conversation_state.active_intent` и `conversation_mode = CONTINUE`
|
||||
|
||||
Смена intent допускается только если:
|
||||
- есть явный сигнал смены домена/задачи, или
|
||||
- новый запрос явно не соответствует текущему intent (жёсткое несоответствие)
|
||||
|
||||
---
|
||||
|
||||
## 4) Обязательная нормализация запроса и извлечение якорей
|
||||
|
||||
Router обязан выполнять:
|
||||
|
||||
### 4.1 Query normalization
|
||||
Выход должен содержать:
|
||||
- `raw` — исходный запрос
|
||||
- `normalized` — каноническая, детерминированная и meaning-preserving форма `raw`
|
||||
- `expansions[]` — добавочные токены для retrieval/rerank
|
||||
- `keyword_hints[]` — компактные ключевые токены (символы/пути/доменные термины)
|
||||
|
||||
Требования:
|
||||
- `raw` хранит исходную строку пользователя без изменений
|
||||
- `normalized` строится **только** из `raw` и безопасных правил форматирования
|
||||
- `normalized` не должен включать appended expansions, синонимы и догаданные keywords
|
||||
- все enrichment должны жить только в `expansions[]`, `keyword_hints[]`, `anchors[]`
|
||||
|
||||
### 4.2 RU→EN mapping (минимальный словарь)
|
||||
Router обязан поддерживать RU→EN mapping терминов только как `expansions`:
|
||||
|
||||
- `класс` → `class`
|
||||
- `метод` → `method`
|
||||
- `функция` → `function`, `def`
|
||||
- `модуль` → `module`
|
||||
- `пакет` → `package`
|
||||
- `файл` → `file`
|
||||
- `тест`, `юнит-тест` → `test`, `unit test`
|
||||
|
||||
Словарь должен быть расширяемым, но эти ключи обязательны.
|
||||
|
||||
### 4.3 Anchor extraction (якоря)
|
||||
Router обязан извлекать **явные якоря** из user_query и conversation_state:
|
||||
|
||||
Типы якорей:
|
||||
- `FILE_PATH` — путь/часть пути (`src/...`, `package/module.py`, `README.md`)
|
||||
- `SYMBOL` — идентификатор (CamelCase, snake_case, dotted path)
|
||||
- `DOC_REF` — ссылка на doc file/section (если есть явные маркеры)
|
||||
- `KEY_TERM` — важные термины, влияющие на retrieval (класс/метод/функция и т.п.)
|
||||
|
||||
Каждый якорь должен возвращаться структурировано.
|
||||
|
||||
---
|
||||
|
||||
## 5) Контракт выхода Router
|
||||
|
||||
Top-level:
|
||||
|
||||
```json
|
||||
{
|
||||
"schema_version": "1.1",
|
||||
"intent": "CODE_QA",
|
||||
"graph_id": "CodeQAGraph",
|
||||
"conversation_mode": "CONTINUE",
|
||||
"query_plan": {
|
||||
"raw": "",
|
||||
"normalized": "",
|
||||
"expansions": [],
|
||||
"keyword_hints": [],
|
||||
"anchors": []
|
||||
},
|
||||
"retrieval_spec": {
|
||||
"domains": [],
|
||||
"layer_queries": [],
|
||||
"filters": {},
|
||||
"rerank_profile": ""
|
||||
},
|
||||
"evidence_policy": {
|
||||
"require_def": false,
|
||||
"require_flow": false,
|
||||
"require_spec": false,
|
||||
"allow_answer_without_evidence": false
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
## 6) query_plan.anchors контракт
|
||||
|
||||
{
|
||||
"type": "FILE_PATH | SYMBOL | DOC_REF | KEY_TERM",
|
||||
"value": "string",
|
||||
"subtype": "optional string",
|
||||
"span": { "start": 0, "end": 0 },
|
||||
"confidence": 0.0
|
||||
}
|
||||
|
||||
Требования:
|
||||
- FILE_PATH.value хранит путь как в запросе (без попытки “исправить”)
|
||||
- SYMBOL.value хранит символ как в запросе (с сохранением регистра)
|
||||
- KEY_TERM используется для выставления expected evidence и выбора слоёв
|
||||
- anchors может быть пустым, но router должен пытаться извлечь их всегда
|
||||
|
||||
|
||||
## 7) retrieval_spec контракт (слои + фильтры)
|
||||
|
||||
### 7.1 Структура
|
||||
|
||||
{
|
||||
"domains": ["CODE", "DOCS"],
|
||||
"layer_queries": [
|
||||
{ "layer_id": "C1", "top_k": 30 },
|
||||
{ "layer_id": "C3", "top_k": 15 }
|
||||
],
|
||||
"filters": {
|
||||
"test_policy": "EXCLUDE",
|
||||
"path_scope": [],
|
||||
"language": []
|
||||
},
|
||||
"rerank_profile": "code"
|
||||
}
|
||||
|
||||
### 7.2 Требования по intent
|
||||
|
||||
- CODE_QA → domains = ["CODE"], rerank_profile="code"
|
||||
- DOCS_QA → domains = ["DOCS"], rerank_profile="docs"
|
||||
- GENERATE_DOCS_FROM_CODE → domains = ["CODE"], rerank_profile="generate"
|
||||
- PROJECT_MISC → domains = ["CODE","DOCS"], rerank_profile="project"
|
||||
|
||||
### 7.3 Требования по якорям
|
||||
|
||||
- Если найден FILE_PATH → router обязан добавить filters.path_scope (минимум: этот путь/директория)
|
||||
- Если найден SYMBOL → router обязан добавить SYMBOL в query_plan.keyword_hints и query_plan.expansions (при необходимости)
|
||||
- Если найден KEY_TERM (например "класс") → router обязан добавить RU→EN expansions
|
||||
|
||||
## 8) evidence_policy (минимальные требования)
|
||||
{
|
||||
"require_def": true,
|
||||
"require_flow": true,
|
||||
"require_spec": false,
|
||||
"allow_answer_without_evidence": false
|
||||
}
|
||||
|
||||
Требования:
|
||||
- CODE_QA: require_def=true; require_flow=true
|
||||
- DOCS_QA: require_spec=true
|
||||
- GENERATE_DOCS_FROM_CODE: require_def=true
|
||||
- PROJECT_MISC: allow_answer_without_evidence=true
|
||||
|
||||
## 9) Минимально обязательные поля (строго)
|
||||
|
||||
Router обязан всегда возвращать:
|
||||
- intent
|
||||
- graph_id
|
||||
- conversation_mode
|
||||
- query_plan.raw
|
||||
- query_plan.normalized
|
||||
- query_plan.expansions
|
||||
- query_plan.anchors
|
||||
- retrieval_spec.domains
|
||||
- retrieval_spec.layer_queries
|
||||
- retrieval_spec.filters.test_policy
|
||||
- retrieval_spec.rerank_profile
|
||||
- evidence_policy.*
|
||||
23
app/modules/rag/intent_router_v2/__init__.py
Normal file
23
app/modules/rag/intent_router_v2/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from app.modules.rag.intent_router_v2.factory import GigaChatIntentRouterFactory
|
||||
from app.modules.rag.intent_router_v2.local_runner import IntentRouterScenarioRunner
|
||||
from app.modules.rag.intent_router_v2.models import (
|
||||
ConversationState,
|
||||
IntentDecision,
|
||||
IntentRouterResult,
|
||||
QueryAnchor,
|
||||
QueryPlan,
|
||||
RepoContext,
|
||||
)
|
||||
from app.modules.rag.intent_router_v2.router import IntentRouterV2
|
||||
|
||||
__all__ = [
|
||||
"ConversationState",
|
||||
"GigaChatIntentRouterFactory",
|
||||
"IntentDecision",
|
||||
"IntentRouterResult",
|
||||
"IntentRouterScenarioRunner",
|
||||
"IntentRouterV2",
|
||||
"QueryAnchor",
|
||||
"QueryPlan",
|
||||
"RepoContext",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
144
app/modules/rag/intent_router_v2/anchor_extractor.py
Normal file
144
app/modules/rag/intent_router_v2/anchor_extractor.py
Normal file
@@ -0,0 +1,144 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import AnchorSpan, QueryAnchor
|
||||
from app.modules.rag.intent_router_v2.normalization_terms import KeyTermCanonicalizer
|
||||
from app.modules.rag.intent_router_v2.symbol_rules import COMMON_PATH_SEGMENTS, PY_KEYWORDS
|
||||
from app.modules.rag.intent_router_v2.term_mapping import RuEnTermMapper
|
||||
|
||||
# File paths carrying a recognized code/doc/config extension (e.g. "app/models.py").
_FILE_PATTERN = re.compile(r"(?P<value>\b(?:[\w.-]+/)*[\w.-]+\.(?:py|md|rst|txt|yaml|yml|json|toml|ini|cfg)\b)")
# Extension-less path hints rooted at common top-level directories.
_PATH_HINT_PATTERN = re.compile(r"(?P<value>\b(?:src|app|docs|tests)/[\w./-]*[\w-]\b)")
# Code-symbol candidates: dotted names, CamelCase identifiers, or snake_case identifiers.
_SYMBOL_PATTERN = re.compile(
    r"\b(?P<value>[A-Z][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)+|[A-Z][A-Za-z0-9_]{2,}|[a-z_][A-Za-z0-9_]{2,})\b"
)
# "section <name>" / "раздел <name>" style documentation references.
_DOC_SECTION_PATTERN = re.compile(r"(?:section|раздел)\s+[\"'`#]?(?P<value>[A-Za-zА-Яа-я0-9_ ./:-]{2,})", re.IGNORECASE)
# Plain words (Latin or Cyrillic, hyphens allowed) used for key-term scanning.
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё-]+")
|
||||
|
||||
|
||||
class AnchorExtractor:
    """Extracts typed anchors (file paths, code symbols, doc references, key terms)
    from a free-form user query so retrieval can be scoped precisely."""

    def __init__(
        self,
        mapper: RuEnTermMapper | None = None,
        canonicalizer: KeyTermCanonicalizer | None = None,
    ) -> None:
        self._mapper = mapper or RuEnTermMapper()
        self._canonicalizer = canonicalizer or KeyTermCanonicalizer()

    def extract(self, text: str) -> list[QueryAnchor]:
        """Return deduplicated anchors of every supported type found in *text*."""
        anchors = self._file_anchors(text)
        # File anchors are computed first so symbol extraction can skip tokens
        # that are actually fragments of a matched path.
        anchors.extend(self._symbol_anchors(text, file_anchors=anchors))
        anchors.extend(self._doc_ref_anchors(text))
        anchors.extend(self._key_term_anchors(text))
        return self._dedupe(anchors)

    def _file_anchors(self, text: str) -> list[QueryAnchor]:
        # Full paths with a known extension are high-confidence; bare src/app/docs/tests
        # prefixes are weaker hints.
        anchors = self._anchors_from_matches(_FILE_PATTERN.finditer(text), anchor_type="FILE_PATH", confidence=0.95)
        anchors.extend(self._anchors_from_matches(_PATH_HINT_PATTERN.finditer(text), anchor_type="FILE_PATH", confidence=0.8))
        return anchors

    def _symbol_anchors(self, text: str, *, file_anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        anchors: list[QueryAnchor] = []
        path_ranges = [(anchor.span.start, anchor.span.end) for anchor in file_anchors if anchor.span is not None]
        path_segments = self._path_segments(file_anchors)
        for match in _SYMBOL_PATTERN.finditer(text):
            value = match.group("value")
            # Skip candidates that are really file names or paths.
            if value.endswith((".py", ".md")) or "/" in value:
                continue
            # Skip identifiers lying inside an already-matched path span.
            if self._is_inside_path(match.start("value"), match.end("value"), path_ranges):
                continue
            if self._is_keyword(value):
                continue
            # Skip tokens that merely repeat a segment of a mentioned path.
            if file_anchors and value.lower() in path_segments:
                continue
            anchors.append(self._anchor("SYMBOL", value, match.start("value"), match.end("value"), 0.88, source="user_text"))
        return anchors

    def _doc_ref_anchors(self, text: str) -> list[QueryAnchor]:
        # "section <name>" references plus explicit documentation files (.md/.rst/.txt).
        anchors = self._anchors_from_matches(_DOC_SECTION_PATTERN.finditer(text), anchor_type="DOC_REF", confidence=0.75, subtype="section")
        for match in _FILE_PATTERN.finditer(text):
            value = match.group("value")
            if not value.lower().endswith((".md", ".rst", ".txt")):
                continue
            anchors.append(self._anchor("DOC_REF", value, match.start("value"), match.end("value"), 0.92, subtype="file", source="user_text"))
        return anchors

    def _key_term_anchors(self, text: str) -> list[QueryAnchor]:
        literals = set(self._mapper.all_literal_terms())
        anchors: list[QueryAnchor] = []
        for token in _WORD_RE.finditer(text):
            value = token.group(0)
            normalized = value.lower()
            canonical = self._canonicalizer.canonicalize(value)
            # Keep only words that canonicalize to a known key term or match a literal term.
            if canonical is None and normalized not in literals:
                continue
            anchors.append(
                self._anchor(
                    "KEY_TERM",
                    canonical or value,
                    token.start(),
                    token.end(),
                    0.9,
                    source="user_text",
                )
            )
        return anchors

    def _anchors_from_matches(
        self,
        matches,
        *,
        anchor_type: str,
        confidence: float,
        subtype: str | None = None,
    ) -> list[QueryAnchor]:
        """Build one anchor per regex match, reading the match's "value" group."""
        return [
            self._anchor(anchor_type, match.group("value"), match.start("value"), match.end("value"), confidence, subtype=subtype)
            for match in matches
        ]

    def _anchor(
        self,
        anchor_type: str,
        value: str,
        start: int,
        end: int,
        confidence: float,
        subtype: str | None = None,
        source: str = "user_text",
    ) -> QueryAnchor:
        return QueryAnchor(
            type=anchor_type,
            value=value,
            subtype=subtype,
            source=source,
            span=AnchorSpan(start=start, end=end),
            confidence=confidence,
        )

    def _dedupe(self, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        # First occurrence wins; identity is (type, value, subtype, source).
        result: list[QueryAnchor] = []
        seen: set[tuple[str, str, str | None, str]] = set()
        for anchor in anchors:
            key = (anchor.type, anchor.value, anchor.subtype, anchor.source)
            if key in seen:
                continue
            seen.add(key)
            result.append(anchor)
        return result

    def _is_inside_path(self, start: int, end: int, ranges: list[tuple[int, int]]) -> bool:
        # True when [start, end] is fully contained in any path span.
        return any(start >= left and end <= right for left, right in ranges)

    def _is_keyword(self, token: str) -> bool:
        return token.lower() in PY_KEYWORDS

    def _path_segments(self, anchors: list[QueryAnchor]) -> set[str]:
        """Lower-cased segments of every mentioned path, plus the common segment set."""
        values: set[str] = set()
        for anchor in anchors:
            parts = re.split(r"[/.]+", anchor.value.lower())
            for part in parts:
                if not part:
                    continue
                values.add(part)
        return values | COMMON_PATH_SEGMENTS
|
||||
22
app/modules/rag/intent_router_v2/anchor_span_validator.py
Normal file
22
app/modules/rag/intent_router_v2/anchor_span_validator.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import QueryAnchor
|
||||
|
||||
|
||||
class AnchorSpanValidator:
    """Drops or repairs anchor spans that cannot be trusted against the raw query text."""

    def sanitize(self, anchors: list[QueryAnchor], raw_len: int) -> list[QueryAnchor]:
        """Return anchors whose spans are valid for a text of length *raw_len*, or cleared."""
        return [self._sanitize_one(anchor, raw_len) for anchor in anchors]

    def _sanitize_one(self, anchor: QueryAnchor, raw_len: int) -> QueryAnchor:
        # Non-user anchors never carry a meaningful text span: strip it unconditionally.
        if anchor.source != "user_text":
            return anchor.model_copy(update={"span": None})
        span = anchor.span
        if span is None:
            return anchor
        if 0 <= int(span.start) < int(span.end) <= raw_len:
            return anchor
        # Out-of-bounds span: drop it and halve the confidence (never below zero).
        return anchor.model_copy(update={"span": None, "confidence": max(anchor.confidence * 0.5, 0.0)})
|
||||
113
app/modules/rag/intent_router_v2/classifier.py
Normal file
113
app/modules/rag/intent_router_v2/classifier.py
Normal file
@@ -0,0 +1,113 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, IntentDecision
|
||||
from app.modules.rag.intent_router_v2.protocols import TextGenerator
|
||||
from app.modules.rag.intent_router_v2.test_signals import has_test_focus
|
||||
|
||||
# Matches a path ending in a known source-code extension; used to steer toward CODE_QA.
_CODE_FILE_PATH_RE = re.compile(
    r"\b(?:[\w.-]+/)*[\w.-]+\.(?:py|js|jsx|ts|tsx|java|kt|go|rb|php|c|cc|cpp|h|hpp|cs|swift|rs)(?!\w)\b",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
class IntentClassifierV2:
    """Classifies a user query into one of four routing intents.

    Resolution order: deterministic marker/heuristic rules first, then an
    optional LLM call, and finally a PROJECT_MISC fallback.
    """

    # Phrases that explicitly request documentation generation (checked first, highest priority).
    _GENERATE_DOCS_MARKERS = (
        "сгенерируй документац",
        "подготовь документац",
        "создай документац",
        "генерац",
        "generate documentation",
        "write documentation",
    )
    # Lexical signals for documentation-oriented vs code-oriented questions.
    _DOCS_MARKERS = ("документац", "readme", "docs/", ".md", "spec", "runbook", "markdown")
    _CODE_MARKERS = ("по коду", "код", "класс", "метод", "функц", "модул", "пакет", "файл", "block", "блок", "handler", "endpoint")

    def __init__(self, llm: TextGenerator | None = None) -> None:
        # LLM is optional: without it only deterministic rules plus the fallback apply.
        self._llm = llm

    def classify(self, user_query: str, conversation_state: ConversationState) -> IntentDecision:
        """Return an IntentDecision for *user_query*; never raises, always returns a decision."""
        deterministic = self._deterministic(user_query)
        if deterministic:
            return deterministic
        llm_decision = self._classify_with_llm(user_query, conversation_state)
        if llm_decision:
            return llm_decision
        # Neither rules nor the LLM produced a decision.
        return IntentDecision(intent="PROJECT_MISC", confidence=0.55, reason="fallback_project_misc")

    def _deterministic(self, user_query: str) -> IntentDecision | None:
        """Rule-based classification; returns None when no rule fires."""
        # Normalize whitespace and case once for marker matching.
        text = " ".join((user_query or "").lower().split())
        if any(marker in text for marker in self._GENERATE_DOCS_MARKERS):
            return IntentDecision(intent="GENERATE_DOCS_FROM_CODE", confidence=0.97, reason="deterministic_generate_docs")
        if self._looks_like_docs_question(text):
            return IntentDecision(intent="DOCS_QA", confidence=0.9, reason="deterministic_docs")
        if self._looks_like_code_question(user_query, text):
            return IntentDecision(intent="CODE_QA", confidence=0.9, reason="deterministic_code")
        return None

    def _classify_with_llm(self, user_query: str, conversation_state: ConversationState) -> IntentDecision | None:
        """Ask the LLM for an intent; returns None when unavailable or unparseable."""
        if self._llm is None:
            return None
        payload = json.dumps(
            {
                "message": user_query,
                "active_intent": conversation_state.active_intent,
                "last_query": conversation_state.last_query,
                "allowed_intents": ["CODE_QA", "DOCS_QA", "GENERATE_DOCS_FROM_CODE", "PROJECT_MISC"],
            },
            ensure_ascii=False,
        )
        try:
            raw = self._llm.generate("rag_intent_router_v2", payload, log_context="rag.intent_router_v2.classify").strip()
        except Exception:
            # Any transport/model failure falls through to the caller's fallback.
            return None
        parsed = self._parse(raw)
        if parsed is None:
            return None
        return parsed

    def _parse(self, raw: str) -> IntentDecision | None:
        """Parse the LLM reply (optionally ```-fenced JSON) into an IntentDecision."""
        candidate = self._strip_code_fence(raw)
        try:
            payload = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        intent = str(payload.get("intent") or "").strip().upper()
        if intent not in {"CODE_QA", "DOCS_QA", "GENERATE_DOCS_FROM_CODE", "PROJECT_MISC"}:
            return None
        # NOTE(review): `or 0.7` treats an explicit confidence of 0 as missing, and a
        # non-numeric confidence raises ValueError here (uncaught) — confirm intended.
        return IntentDecision(
            intent=intent,
            confidence=float(payload.get("confidence") or 0.7),
            reason=str(payload.get("reason") or "llm").strip() or "llm",
        )

    def _strip_code_fence(self, text: str) -> str:
        """Remove a surrounding ``` fence when the whole reply is a single fenced block."""
        if not text.startswith("```"):
            return text
        lines = text.splitlines()
        if len(lines) < 3 or lines[-1].strip() != "```":
            return text
        return "\n".join(lines[1:-1]).strip()

    def _looks_like_docs_question(self, text: str) -> bool:
        # A code file path overrides docs markers: the user is asking about code.
        if self._has_code_file_path(text):
            return False
        return any(marker in text for marker in self._DOCS_MARKERS)

    def _looks_like_code_question(self, raw_text: str, lowered: str) -> bool:
        if self._has_code_file_path(raw_text):
            return True
        if has_test_focus(lowered):
            return True
        # Docs markers without any code marker: not a code question.
        if any(marker in lowered for marker in self._DOCS_MARKERS) and not any(marker in lowered for marker in self._CODE_MARKERS):
            return False
        if any(marker in lowered for marker in self._CODE_MARKERS):
            return True
        # CamelCase / dotted identifiers in the original-cased text suggest code.
        if re.search(r"\b[A-Z][A-Za-z0-9_]{2,}(?:\.[A-Za-z_][A-Za-z0-9_]*)*\b", raw_text or ""):
            return True
        # Finally, a call-looking token such as "foo(" suggests code.
        return bool(re.search(r"\b[a-z_][A-Za-z0-9_]{2,}\(", raw_text or ""))

    def _has_code_file_path(self, text: str) -> bool:
        return bool(_CODE_FILE_PATH_RE.search(text or ""))
|
||||
@@ -0,0 +1,61 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.followup_detector import FollowUpDetector
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor
|
||||
|
||||
|
||||
class ConversationAnchorBuilder:
    """Carries anchors over from conversation state when a turn continues the prior topic."""

    def __init__(self, followup: FollowUpDetector | None = None) -> None:
        self._followup = followup or FollowUpDetector()

    def build(
        self,
        raw: str,
        state: ConversationState,
        *,
        continue_mode: bool,
        has_user_symbol: bool,
        has_user_file_path: bool,
    ) -> list[QueryAnchor]:
        """Return carry-over anchors from *state*; empty unless the conversation continues.

        Explicit user anchors always win: a user-provided file path suppresses all
        carry-over, and a user-provided symbol suppresses symbol carry-over.
        """
        if not continue_mode:
            return []
        anchors: list[QueryAnchor] = []
        if has_user_file_path:
            return anchors
        for path in self._paths_for_carryover(state.active_path_scope):
            anchors.append(
                QueryAnchor(
                    type="FILE_PATH",
                    value=path,
                    source="conversation_state",
                    span=None,
                    confidence=0.6,
                )
            )
        if has_user_symbol:
            return anchors
        # Only pull the previous symbol forward when the message reads as a follow-up.
        if not self._followup.is_follow_up(raw):
            return anchors
        symbol = state.active_symbol or (state.active_code_span_symbols[0] if state.active_code_span_symbols else None)
        if symbol:
            anchors.append(
                QueryAnchor(
                    type="SYMBOL",
                    value=symbol,
                    source="conversation_state",
                    span=None,
                    confidence=0.64,
                )
            )
        return anchors

    def _paths_for_carryover(self, active_path_scope: list[str]) -> list[str]:
        # Prefer a concrete file over a directory; carry at most one path forward.
        paths = list(active_path_scope or [])
        file_paths = [path for path in paths if self._looks_like_file(path)]
        if file_paths:
            return file_paths[:1]
        return paths[:1]

    def _looks_like_file(self, value: str) -> bool:
        # Heuristic: the last path segment contains a dot (i.e. an extension).
        tail = (value or "").rsplit("/", 1)[-1]
        return "." in tail
|
||||
45
app/modules/rag/intent_router_v2/conversation_policy.py
Normal file
45
app/modules/rag/intent_router_v2/conversation_policy.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, IntentDecision
|
||||
|
||||
|
||||
class ConversationPolicy:
    """Decides whether a new intent decision starts, continues, or switches the conversation."""

    _SWITCH_MARKERS = (
        "теперь",
        "а теперь",
        "давай теперь",
        "переключ",
        "new task",
        "switch to",
        "instead",
    )
    _DOCS_SIGNALS = ("документац", "readme", "docs/", ".md")
    _CODE_SIGNALS = ("по коду", "класс", "метод", "файл", "блок кода", "function", "class")

    def resolve(self, decision: IntentDecision, user_query: str, conversation_state: ConversationState) -> tuple[str, str]:
        """Return ``(intent, mode)`` where mode is START, CONTINUE or SWITCH."""
        current = conversation_state.active_intent
        proposed = decision.intent
        if current is None:
            return proposed, "START"
        if current == proposed:
            return current, "CONTINUE"
        wants_switch = self._has_explicit_switch(user_query) or self._is_hard_mismatch(current, proposed, user_query)
        if wants_switch:
            return proposed, "SWITCH"
        # Sticky by default: keep the active intent unless the user clearly moved on.
        return current, "CONTINUE"

    def _has_explicit_switch(self, user_query: str) -> bool:
        normalized = " ".join((user_query or "").lower().split())
        return any(marker in normalized for marker in self._SWITCH_MARKERS)

    def _is_hard_mismatch(self, active_intent: str, candidate_intent: str, user_query: str) -> bool:
        if active_intent == candidate_intent:
            return False
        normalized = " ".join((user_query or "").lower().split())
        # Generation requests always displace the current intent; docs/code need
        # an explicit textual signal to displace the other domain.
        if candidate_intent == "GENERATE_DOCS_FROM_CODE":
            return True
        if candidate_intent == "DOCS_QA":
            return any(signal in normalized for signal in self._DOCS_SIGNALS)
        if candidate_intent == "CODE_QA" and active_intent == "DOCS_QA":
            return any(signal in normalized for signal in self._CODE_SIGNALS)
        return False
|
||||
28
app/modules/rag/intent_router_v2/evidence_policy_factory.py
Normal file
28
app/modules/rag/intent_router_v2/evidence_policy_factory.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import EvidencePolicy
|
||||
|
||||
|
||||
class EvidencePolicyFactory:
    """Maps an intent (plus sub-intent context) to the evidence requirements for answering."""

    def build(
        self,
        intent: str,
        *,
        sub_intent: str = "EXPLAIN",
        negations: list[str] | None = None,
        has_user_anchor: bool = True,
    ) -> EvidencePolicy:
        """Return the EvidencePolicy governing which evidence kinds are mandatory."""
        if intent == "CODE_QA":
            return self._code_policy(sub_intent, set(negations or []), has_user_anchor)
        if intent == "DOCS_QA":
            # Docs answers must be backed by spec-style evidence only.
            return EvidencePolicy(require_def=False, require_flow=False, require_spec=True, allow_answer_without_evidence=False)
        if intent == "GENERATE_DOCS_FROM_CODE":
            return EvidencePolicy(require_def=True, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        # PROJECT_MISC (and anything else) may answer without retrieved evidence.
        return EvidencePolicy(require_def=False, require_flow=False, require_spec=False, allow_answer_without_evidence=True)

    def _code_policy(self, sub_intent: str, negations_set: set[str], has_user_anchor: bool) -> EvidencePolicy:
        if sub_intent == "OPEN_FILE":
            return EvidencePolicy(require_def=False, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        if sub_intent == "EXPLAIN_LOCAL":
            return EvidencePolicy(require_def=True, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        if "tests" in negations_set and not has_user_anchor:
            # Tests excluded and no anchor: skip the flow requirement.
            return EvidencePolicy(require_def=True, require_flow=False, require_spec=False, allow_answer_without_evidence=False)
        return EvidencePolicy(require_def=True, require_flow=True, require_spec=False, allow_answer_without_evidence=False)
|
||||
22
app/modules/rag/intent_router_v2/factory.py
Normal file
22
app/modules/rag/intent_router_v2/factory.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.agent.llm import AgentLlmService
|
||||
from app.modules.agent.prompt_loader import PromptLoader
|
||||
from app.modules.rag.intent_router_v2.classifier import IntentClassifierV2
|
||||
from app.modules.rag.intent_router_v2.router import IntentRouterV2
|
||||
from app.modules.shared.env_loader import load_workspace_env
|
||||
from app.modules.shared.gigachat.client import GigaChatClient
|
||||
from app.modules.shared.gigachat.settings import GigaChatSettings
|
||||
from app.modules.shared.gigachat.token_provider import GigaChatTokenProvider
|
||||
|
||||
|
||||
class GigaChatIntentRouterFactory:
    """Wires a GigaChat-backed IntentRouterV2 from workspace environment settings."""

    def build(self) -> IntentRouterV2:
        """Construct the router; reads GigaChat settings/credentials from the environment."""
        load_workspace_env()
        settings = GigaChatSettings.from_env()
        token_provider = GigaChatTokenProvider(settings)
        client = GigaChatClient(settings, token_provider)
        prompt_loader = PromptLoader()
        llm = AgentLlmService(client=client, prompts=prompt_loader)
        classifier = IntentClassifierV2(llm=llm)
        return IntentRouterV2(classifier=classifier)
|
||||
22
app/modules/rag/intent_router_v2/followup_detector.py
Normal file
22
app/modules/rag/intent_router_v2/followup_detector.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class FollowUpDetector:
    """Heuristically detects whether a message continues the previous question."""

    _MARKERS = (
        "что дальше",
        "почему",
        "зачем",
        "а что",
        "уточни",
        "подробнее",
        "как именно",
        "покажи подробнее",
    )

    def is_follow_up(self, raw: str) -> bool:
        """True when *raw* looks like a follow-up to the prior turn."""
        normalized = " ".join((raw or "").lower().split())
        if not normalized:
            return False
        # Very short messages almost always lean on previous context;
        # longer ones count only when a follow-up marker appears.
        word_count = len(normalized.split())
        return word_count <= 4 or any(marker in normalized for marker in self._MARKERS)
|
||||
13
app/modules/rag/intent_router_v2/graph_id_resolver.py
Normal file
13
app/modules/rag/intent_router_v2/graph_id_resolver.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class GraphIdResolver:
    """Resolves a routing intent to the id of the graph that handles it."""

    _GRAPH_MAP = {
        "CODE_QA": "CodeQAGraph",
        "DOCS_QA": "DocsQAGraph",
        "GENERATE_DOCS_FROM_CODE": "GenerateDocsFromCodeGraph",
        "PROJECT_MISC": "ProjectMiscGraph",
    }

    def resolve(self, intent: str) -> str:
        """Return the graph id for *intent*.

        Intentionally indexes (not ``.get``) so an unsupported intent fails loudly
        with a KeyError.
        """
        mapping = self._GRAPH_MAP
        return mapping[intent]
|
||||
34
app/modules/rag/intent_router_v2/keyword_hint_builder.py
Normal file
34
app/modules/rag/intent_router_v2/keyword_hint_builder.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from app.modules.rag.intent_router_v2.normalization import FILE_PATH_RE
|
||||
from app.modules.rag.intent_router_v2.symbol_rules import COMMON_PATH_SEGMENTS, PY_KEYWORDS
|
||||
|
||||
_IDENTIFIER_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]{2,}")
|
||||
|
||||
|
||||
class KeywordHintBuilder:
    """Builds lexical retrieval hints (identifiers and file paths) from raw query text."""

    def build(self, text: str) -> list[str]:
        """Return up to 12 unique hints: identifiers first, then lowercased file paths."""
        source = text or ""
        skip = self._path_segments(source)
        hints: list[str] = []
        for identifier in _IDENTIFIER_RE.findall(source):
            lowered = identifier.lower()
            # Language keywords and path fragments are noise, not retrieval signals.
            if lowered in PY_KEYWORDS or lowered in skip:
                continue
            if identifier not in hints:
                hints.append(identifier)
        for path_match in FILE_PATH_RE.finditer(source):
            path_hint = path_match.group(0).lower()
            if path_hint not in hints:
                hints.append(path_hint)
        return hints[:12]

    def _path_segments(self, text: str) -> set[str]:
        # Segments of mentioned paths plus the common segment set; both are excluded
        # from identifier hints so "app" in "app/main.py" is not treated as a symbol.
        segments = set(COMMON_PATH_SEGMENTS)
        for path_match in FILE_PATH_RE.finditer(text or ""):
            segments.update(part for part in re.split(r"[/.]+", path_match.group(0).lower()) if part)
        return segments
|
||||
50
app/modules/rag/intent_router_v2/keyword_hint_sanitizer.py
Normal file
50
app/modules/rag/intent_router_v2/keyword_hint_sanitizer.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import QueryAnchor
|
||||
|
||||
|
||||
class KeywordHintSanitizer:
    """Filters anchor values and raw keyword hints down to retrieval-worthy terms."""

    _GENERIC_KEY_TERMS = {"файл", "класс", "метод", "функция", "документация"}
    _DOMAIN_ALLOWLIST = {"RAG", "API", "HTTP", "SQL"}
    _DIR_SCOPE_MARKERS = ("в папке", "в директории", "в каталоге")

    def sanitize(self, raw: str, anchors: list[QueryAnchor], base_hints: list[str]) -> list[str]:
        """Return up to 8 hints: anchor values first, then the filtered base hints."""
        lowered_raw = (raw or "").lower()
        allow_dirs = any(marker in lowered_raw for marker in self._DIR_SCOPE_MARKERS)
        file_paths = [a.value for a in anchors if a.type == "FILE_PATH" and self._looks_like_file(a.value)]
        known_dirs = {path.rsplit("/", 1)[0] for path in file_paths if "/" in path}
        hints: list[str] = []

        for a in anchors:
            if a.type == "FILE_PATH":
                # Bare directories are dropped once concrete files narrow the scope,
                # unless the user explicitly asked about a directory.
                if self._looks_like_directory(a.value) and not allow_dirs and (known_dirs or file_paths):
                    continue
                self._append(hints, a.value)
            if a.type == "SYMBOL":
                self._append(hints, a.value)

        for hint in base_hints:
            if hint in self._DOMAIN_ALLOWLIST:
                self._append(hints, hint)
                continue
            if hint.lower() in self._GENERIC_KEY_TERMS:
                continue
            if not allow_dirs and hint in known_dirs:
                continue
            if not allow_dirs and file_paths and "/" in hint and "." not in hint:
                continue
            self._append(hints, hint)
        return hints[:8]

    def _append(self, values: list[str], candidate: str) -> None:
        # Preserves first-seen order while deduplicating; ignores empty candidates.
        if candidate and candidate not in values:
            values.append(candidate)

    def _looks_like_file(self, value: str) -> bool:
        tail = (value or "").rsplit("/", 1)[-1]
        return "." in tail

    def _looks_like_directory(self, value: str) -> bool:
        return "/" in (value or "") and not self._looks_like_file(value)
|
||||
29
app/modules/rag/intent_router_v2/layer_query_builder.py
Normal file
29
app/modules/rag/intent_router_v2/layer_query_builder.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import LayerQuery, RepoContext
|
||||
|
||||
|
||||
class LayerQueryBuilder:
    """Selects the RAG layers (and their top_k budgets) to query for a given intent."""

    def build(self, intent: str, repo_context: RepoContext, *, domains: list[str], layers_map: dict[str, list[tuple[str, int]]]) -> list[LayerQuery]:
        """Return layer queries filtered by domain and repo availability.

        Falls back to domain-only filtering when the availability filter would
        leave nothing to query.
        """
        available = set(repo_context.available_layers or [])
        candidates = [
            (layer_id, top_k)
            for layer_id, top_k in layers_map[intent]
            if self._layer_matches_domains(layer_id, domains)
        ]
        preferred = [
            LayerQuery(layer_id=layer_id, top_k=top_k)
            for layer_id, top_k in candidates
            if not available or layer_id in available
        ]
        if preferred:
            return preferred
        # Nothing matched the advertised layers: ignore availability rather than return nothing.
        return [LayerQuery(layer_id=layer_id, top_k=top_k) for layer_id, top_k in candidates]

    def _layer_matches_domains(self, layer_id: str, domains: list[str]) -> bool:
        # Layer ids are prefixed by domain: C* = code layers, D* = docs layers.
        if domains == ["CODE"]:
            return layer_id.startswith("C")
        if domains == ["DOCS"]:
            return layer_id.startswith("D")
        return layer_id.startswith("C") or layer_id.startswith("D")
|
||||
25
app/modules/rag/intent_router_v2/local_runner.py
Normal file
25
app/modules/rag/intent_router_v2/local_runner.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, IntentRouterResult, RepoContext
|
||||
from app.modules.rag.intent_router_v2.router import IntentRouterV2
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class IntentRouterScenarioRunner:
    """Replays a scripted conversation through the router, logging every turn.

    Intended for local/manual scenario testing: conversation state is advanced
    after each routed turn exactly as the production flow would.
    """

    def __init__(self, router: IntentRouterV2) -> None:
        self._router = router

    def run(self, queries: list[str], repo_context: RepoContext | None = None) -> list[IntentRouterResult]:
        """Route each query in order, threading ConversationState between turns.

        Returns the per-turn IntentRouterResult list.
        """
        state = ConversationState()
        context = repo_context or RepoContext()
        results: list[IntentRouterResult] = []
        for index, user_query in enumerate(queries, start=1):
            LOGGER.warning("intent router local input: turn=%s user_query=%s", index, user_query)
            result = self._router.route(user_query, state, context)
            # Fix: pydantic v2's model_dump_json() has no `ensure_ascii` kwarg (that was
            # a v1 `.json()` parameter) and raises TypeError when given one; v2 emits
            # non-ASCII characters unescaped by default, preserving the intended output.
            LOGGER.warning("intent router local output: turn=%s result=%s", index, result.model_dump_json())
            results.append(result)
            state = state.advance(result)
        return results
|
||||
22
app/modules/rag/intent_router_v2/logger.py
Normal file
22
app/modules/rag/intent_router_v2/logger.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, IntentRouterResult, RepoContext
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class IntentRouterLogger:
    """Structured WARNING-level logging for intent-router requests and results."""

    def log_request(self, user_query: str, conversation_state: ConversationState, repo_context: RepoContext) -> None:
        """Log the incoming query together with conversation and repository context."""
        LOGGER.warning(
            "intent router v2 request: turn=%s active_intent=%s user_query=%s languages=%s domains=%s",
            conversation_state.turn_index + 1,
            conversation_state.active_intent,
            " ".join((user_query or "").split()),  # collapse whitespace so the log stays one line
            repo_context.languages,
            repo_context.available_domains,
        )

    def log_result(self, result: IntentRouterResult) -> None:
        """Log the full routing result as JSON."""
        # Fix: pydantic v2's model_dump_json() has no `ensure_ascii` kwarg (a v1
        # `.json()` parameter) and raises TypeError when given one; v2 keeps
        # non-ASCII characters unescaped by default, so the output is unchanged.
        LOGGER.warning("intent router v2 result: %s", result.model_dump_json())
|
||||
182
app/modules/rag/intent_router_v2/models.py
Normal file
182
app/modules/rag/intent_router_v2/models.py
Normal file
@@ -0,0 +1,182 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
||||
|
||||
|
||||
# Closed vocabularies shared by the router models below.
IntentType = Literal["CODE_QA", "DOCS_QA", "GENERATE_DOCS_FROM_CODE", "PROJECT_MISC"]
ConversationMode = Literal["START", "CONTINUE", "SWITCH"]
AnchorType = Literal["FILE_PATH", "SYMBOL", "DOC_REF", "KEY_TERM"]
AnchorSource = Literal["user_text", "conversation_state", "heuristic"]
# NOTE(review): these two regexes are not referenced in the code visible here;
# presumably used by _extract_code_symbols elsewhere in this module — confirm before removing.
_INLINE_CODE_RE = re.compile(r"`([^`]*)`")
_CODE_SYMBOL_RE = re.compile(r"\b([A-Za-z_][A-Za-z0-9_]{2,})\b")
|
||||
|
||||
|
||||
class AnchorSpan(BaseModel):
    """Half-open character range [start, end) of an anchor inside the raw query text."""

    model_config = ConfigDict(extra="forbid")

    start: int = 0
    end: int = 0
|
||||
|
||||
|
||||
class QueryAnchor(BaseModel):
    """A typed reference extracted from the query (file path, symbol, doc ref, key term)."""

    model_config = ConfigDict(extra="forbid")

    type: AnchorType
    value: str
    # Where the anchor came from: the user's text, carried-over state, or a heuristic.
    source: AnchorSource = "user_text"
    subtype: str | None = None
    # Character span in the raw text; None for anchors not literally present in it.
    span: AnchorSpan | None = None
    confidence: float = 0.0

    @field_validator("confidence")
    @classmethod
    def clamp_confidence(cls, value: float) -> float:
        # Clamp into [0, 1] rather than rejecting out-of-range inputs.
        return max(0.0, min(1.0, float(value)))
|
||||
|
||||
|
||||
class QueryPlan(BaseModel):
    """Normalized representation of one user query, ready for retrieval planning."""

    model_config = ConfigDict(extra="forbid")

    raw: str
    normalized: str
    sub_intent: str = "EXPLAIN"
    # Terms the user excluded, synonym expansions, and lexical retrieval hints.
    negations: list[str] = Field(default_factory=list)
    expansions: list[str] = Field(default_factory=list)
    keyword_hints: list[str] = Field(default_factory=list)
    anchors: list[QueryAnchor] = Field(default_factory=list)
|
||||
|
||||
|
||||
class LayerQuery(BaseModel):
    """A single RAG layer to search plus the number of items to request from it."""

    model_config = ConfigDict(extra="forbid")

    layer_id: str
    top_k: int
|
||||
|
||||
|
||||
class CodeRetrievalFilters(BaseModel):
    """Filters applied to code-domain retrieval."""

    model_config = ConfigDict(extra="forbid")

    # How test files are treated during retrieval; they are EXCLUDEd by default.
    test_policy: str = "EXCLUDE"
    path_scope: list[str] = Field(default_factory=list)
    language: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class DocsRetrievalFilters(BaseModel):
    """Filters applied to docs-domain retrieval."""

    model_config = ConfigDict(extra="forbid")

    path_scope: list[str] = Field(default_factory=list)
    doc_kinds: list[str] = Field(default_factory=list)
    doc_language: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class HybridRetrievalFilters(BaseModel):
    """Union of the code and docs filter fields, for mixed-domain retrieval."""

    model_config = ConfigDict(extra="forbid")

    test_policy: str = "EXCLUDE"
    path_scope: list[str] = Field(default_factory=list)
    language: list[str] = Field(default_factory=list)
    doc_kinds: list[str] = Field(default_factory=list)
    doc_language: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class RetrievalSpec(BaseModel):
    """Complete retrieval instruction: domains, per-layer queries, filters, reranking."""

    model_config = ConfigDict(extra="forbid")

    domains: list[str] = Field(default_factory=list)
    layer_queries: list[LayerQuery] = Field(default_factory=list)
    # The filter variant should match the selected domains; code filters are the default.
    filters: CodeRetrievalFilters | DocsRetrievalFilters | HybridRetrievalFilters = Field(default_factory=CodeRetrievalFilters)
    rerank_profile: str = ""
|
||||
|
||||
|
||||
class EvidencePolicy(BaseModel):
    """Which evidence kinds an answer must be backed by."""

    model_config = ConfigDict(extra="forbid")

    # require_def/require_flow cover code evidence; require_spec covers docs evidence.
    require_def: bool = False
    require_flow: bool = False
    require_spec: bool = False
    allow_answer_without_evidence: bool = False
|
||||
|
||||
|
||||
class IntentRouterResult(BaseModel):
    """Full routing decision for one turn; the contract consumed by downstream graphs."""

    model_config = ConfigDict(extra="forbid")

    schema_version: str = "1.1"
    intent: IntentType
    graph_id: str
    conversation_mode: ConversationMode
    query_plan: QueryPlan
    retrieval_spec: RetrievalSpec
    evidence_policy: EvidencePolicy
|
||||
|
||||
|
||||
class ConversationState(BaseModel):
    """Rolling per-conversation context threaded between router turns."""

    model_config = ConfigDict(extra="forbid")

    active_intent: IntentType | None = None
    active_domain: str | None = None
    active_anchors: list[QueryAnchor] = Field(default_factory=list)
    active_symbol: str | None = None
    active_path_scope: list[str] = Field(default_factory=list)
    active_code_span_symbols: list[str] = Field(default_factory=list)
    last_query: str = ""
    turn_index: int = 0

    def advance(self, result: IntentRouterResult) -> "ConversationState":
        """Return the next-turn state derived from this state plus *result*."""
        # Only anchors the user actually typed influence the carried state.
        user_anchors = [anchor for anchor in result.query_plan.anchors if anchor.source == "user_text"]
        symbol_candidates = [anchor.value for anchor in user_anchors if anchor.type == "SYMBOL"]
        has_user_file_anchor = any(anchor.type == "FILE_PATH" for anchor in user_anchors)
        if symbol_candidates:
            # The most recently mentioned symbol wins.
            active_symbol = symbol_candidates[-1]
        elif has_user_file_anchor:
            # A fresh file mention without a symbol resets the symbol focus.
            active_symbol = None
        else:
            active_symbol = self.active_symbol
        # _extract_code_symbols is defined elsewhere in this module (not shown here).
        raw_code_symbols = _extract_code_symbols(result.query_plan.raw)
        active_code_span_symbols = raw_code_symbols or list(self.active_code_span_symbols)
        path_scope = list(getattr(result.retrieval_spec.filters, "path_scope", []) or [])
        active_domains = list(result.retrieval_spec.domains or [])
        # Keep the previous domain when the result is ambiguous (zero or multiple domains).
        active_domain = active_domains[0] if len(active_domains) == 1 else self.active_domain
        return ConversationState(
            active_intent=result.intent,
            active_domain=active_domain,
            active_anchors=list(user_anchors),
            active_symbol=active_symbol,
            active_path_scope=path_scope or list(self.active_path_scope),
            active_code_span_symbols=active_code_span_symbols,
            last_query=result.query_plan.raw,
            turn_index=self.turn_index + 1,
        )
|
||||
|
||||
|
||||
class RepoContext(BaseModel):
    """Static facts about the indexed repository used to shape routing."""

    model_config = ConfigDict(extra="forbid")

    # Programming languages present in the repository.
    languages: list[str] = Field(default_factory=list)
    # Which retrieval domains exist for this repo; both by default.
    available_domains: list[str] = Field(default_factory=lambda: ["CODE", "DOCS"])
    # RAG layers actually populated for this repo.
    available_layers: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class IntentDecision(BaseModel):
    """Classifier verdict: chosen intent with a confidence score and rationale."""

    model_config = ConfigDict(extra="forbid")

    intent: IntentType
    # Clamped to [0.0, 1.0] by the validator below.
    confidence: float = 0.0
    # Free-form explanation of why this intent was chosen.
    reason: str = ""

    @field_validator("confidence")
    @classmethod
    def clamp_confidence(cls, value: float) -> float:
        # Clamp out-of-range classifier scores instead of rejecting the payload.
        return max(0.0, min(1.0, float(value)))
|
||||
|
||||
|
||||
def _extract_code_symbols(raw: str) -> list[str]:
    """Collect up to 8 unique identifiers found inside inline code spans of *raw*.

    Order of first occurrence is preserved.
    """
    ordered: dict[str, None] = {}
    for span in _INLINE_CODE_RE.finditer(raw or ""):
        for ident in _CODE_SYMBOL_RE.findall(span.group(1)):
            # dict keys give ordered de-duplication.
            ordered.setdefault(ident, None)
    return list(ordered)[:8]
|
||||
17
app/modules/rag/intent_router_v2/negation_detector.py
Normal file
17
app/modules/rag/intent_router_v2/negation_detector.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
# Russian/English markers meaning "not about tests" / "production code only".
_TEST_NEG_RE = re.compile(
    r"(?:не\s+про\s+тест|без\s+тест|кроме\s+тест|про\s+прод\s+код|только\s+прод|production\s+code)",
    re.IGNORECASE,
)


class NegationDetector:
    """Detects topics the user explicitly negated in the query text."""

    def detect(self, text: str) -> set[str]:
        """Return the set of negated topics; currently only ``"tests"``."""
        normalized = (text or "").lower()
        return {"tests"} if _TEST_NEG_RE.search(normalized) else set()
|
||||
57
app/modules/rag/intent_router_v2/normalization.py
Normal file
57
app/modules/rag/intent_router_v2/normalization.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
# Backtick-delimited inline code spans, e.g. `foo()`.
CODE_SPAN_RE = re.compile(r"`[^`]*`")
# Relative or bare file paths ending in a known source/doc extension.
FILE_PATH_RE = re.compile(
    r"(?<!\w)(?:/|\./|\.\./)?(?:[\w.-]+/)*[\w.-]+\.(?:py|md|txt|rst|yaml|yml|json|toml|ini)(?!\w)",
    re.IGNORECASE,
)
# Dotted identifiers such as pkg.module.Name.
DOTTED_IDENT_RE = re.compile(r"(?<!\w)(?:[A-Za-z_][A-Za-z0-9_]*\.)+[A-Za-z_][A-Za-z0-9_]*(?!\w)")
# CamelCase identifiers with at least two humps.
CAMEL_RE = re.compile(r"(?<!\w)[A-Z][a-z0-9]+(?:[A-Z][a-z0-9]+)+(?!\w)")
# snake_case identifiers with at least one underscore.
SNAKE_RE = re.compile(r"(?<!\w)[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?!\w)")
# Whitespace tidy-up around punctuation (Latin and Cyrillic letters covered).
SPACE_BEFORE_PUNCT_RE = re.compile(r"\s+([,.:;?!])")
SPACE_AFTER_PUNCT_RE = re.compile(r"([,.:;?!])(?=(?:[\"'(\[A-Za-zА-ЯЁа-яё]))")
WS_RE = re.compile(r"\s+")
# Map typographic quotes to their ASCII equivalents.
QUOTE_TRANSLATION = str.maketrans({"«": '"', "»": '"', "“": '"', "”": '"', "‘": "'", "’": "'"})
|
||||
|
||||
|
||||
class QueryNormalizer:
    """Builds a deterministic canonical form from raw text without enrichment.

    Identifier-like spans (code spans, file paths, dotted/camel/snake names)
    are shielded behind placeholders so the punctuation and whitespace fixes
    cannot damage them; the spans are restored verbatim at the end.
    """

    def normalize(self, raw: str) -> str:
        vault = _ProtectedText()
        canonical = self._shield(raw or "", vault)
        canonical = self._squash_spaces(canonical)
        canonical = canonical.translate(QUOTE_TRANSLATION)
        # Drop spaces before punctuation, then ensure one space after it.
        canonical = SPACE_BEFORE_PUNCT_RE.sub(r"\1", canonical)
        canonical = SPACE_AFTER_PUNCT_RE.sub(r"\1 ", canonical)
        canonical = self._squash_spaces(canonical)
        return vault.restore(canonical)

    def _shield(self, text: str, vault: "_ProtectedText") -> str:
        # Replace every protected span with an opaque placeholder.
        for guard in (CODE_SPAN_RE, FILE_PATH_RE, DOTTED_IDENT_RE, CAMEL_RE, SNAKE_RE):
            text = guard.sub(vault.replace, text)
        return text

    def _squash_spaces(self, text: str) -> str:
        return WS_RE.sub(" ", text).strip()
|
||||
|
||||
|
||||
class _ProtectedText:
    """Swaps matched spans for opaque placeholders and restores them later."""

    def __init__(self) -> None:
        # placeholder -> original span, in insertion order
        self._vault: dict[str, str] = {}
        self._counter = 0

    def replace(self, match: re.Match[str]) -> str:
        """Regex-sub callback: stash the match, return its placeholder."""
        token = f"@@P{self._counter}@@"
        self._counter += 1
        self._vault[token] = match.group(0)
        return token

    def restore(self, text: str) -> str:
        """Substitute every stored placeholder back with its original span."""
        result = text
        for token, span in self._vault.items():
            result = result.replace(token, span)
        return result
|
||||
48
app/modules/rag/intent_router_v2/normalization_terms.py
Normal file
48
app/modules/rag/intent_router_v2/normalization_terms.py
Normal file
@@ -0,0 +1,48 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class KeyTermCanonicalizer:
|
||||
_ALIASES: dict[str, set[str]] = {
|
||||
"файл": {
|
||||
"файл",
|
||||
"файла",
|
||||
"файле",
|
||||
"файлу",
|
||||
"файлом",
|
||||
"файлы",
|
||||
"файлов",
|
||||
"файлам",
|
||||
"файлами",
|
||||
},
|
||||
"класс": {"класс", "класса", "классе", "классу", "классом", "классы", "классов", "классам"},
|
||||
"функция": {"функция", "функции", "функцию", "функцией", "функциях"},
|
||||
"метод": {"метод", "метода", "методе", "методу", "методом", "методы"},
|
||||
"документация": {"документация", "документации", "документацию"},
|
||||
"тест": {"тест", "тесты", "тестов", "тестам", "тестами", "юнит-тест", "юниттест"},
|
||||
"модуль": {"модуль", "модуля"},
|
||||
"пакет": {"пакет"},
|
||||
}
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._token_to_canonical = self._build_index()
|
||||
|
||||
def canonicalize(self, token: str) -> str | None:
|
||||
return self._token_to_canonical.get((token or "").lower())
|
||||
|
||||
def aliases(self) -> set[str]:
|
||||
values: set[str] = set()
|
||||
for forms in self._ALIASES.values():
|
||||
values.update(forms)
|
||||
return values
|
||||
|
||||
def is_test_term(self, token: str) -> bool:
|
||||
canonical = self.canonicalize(token)
|
||||
return canonical == "тест"
|
||||
|
||||
def _build_index(self) -> dict[str, str]:
|
||||
index: dict[str, str] = {}
|
||||
for canonical, forms in self._ALIASES.items():
|
||||
index[canonical] = canonical
|
||||
for form in forms:
|
||||
index[form] = canonical
|
||||
return index
|
||||
7
app/modules/rag/intent_router_v2/protocols.py
Normal file
7
app/modules/rag/intent_router_v2/protocols.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Protocol
|
||||
|
||||
|
||||
class TextGenerator(Protocol):
    """Structural interface for prompt-driven text generation backends."""

    # Returns the generated text for the named prompt applied to *user_input*;
    # *log_context* tags the call for logging only.
    def generate(self, prompt_name: str, user_input: str, *, log_context: str | None = None) -> str: ...
|
||||
3
app/modules/rag/intent_router_v2/query_normalizer.py
Normal file
3
app/modules/rag/intent_router_v2/query_normalizer.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from app.modules.rag.intent_router_v2.normalization import QueryNormalizer
|
||||
|
||||
__all__ = ["QueryNormalizer"]
|
||||
223
app/modules/rag/intent_router_v2/query_plan_builder.py
Normal file
223
app/modules/rag/intent_router_v2/query_plan_builder.py
Normal file
@@ -0,0 +1,223 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.anchor_extractor import AnchorExtractor
|
||||
from app.modules.rag.intent_router_v2.anchor_span_validator import AnchorSpanValidator
|
||||
from app.modules.rag.intent_router_v2.conversation_anchor_builder import ConversationAnchorBuilder
|
||||
from app.modules.rag.intent_router_v2.keyword_hint_builder import KeywordHintBuilder
|
||||
from app.modules.rag.intent_router_v2.keyword_hint_sanitizer import KeywordHintSanitizer
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor, QueryPlan
|
||||
from app.modules.rag.intent_router_v2.negation_detector import NegationDetector
|
||||
from app.modules.rag.intent_router_v2.normalization import QueryNormalizer
|
||||
from app.modules.rag.intent_router_v2.sub_intent_detector import SubIntentDetector
|
||||
from app.modules.rag.intent_router_v2.test_signals import has_test_focus, is_negative_test_request, is_test_related_token
|
||||
from app.modules.rag.intent_router_v2.term_mapping import RuEnTermMapper
|
||||
|
||||
|
||||
class QueryPlanBuilder:
    """Assembles a QueryPlan for one turn: normalization, anchors, sub-intent,
    negations, expansions and keyword hints.

    All collaborators are injectable for testing; defaults are constructed
    when omitted.
    """

    # Markers that signal a "why"-style question (Russian).
    _WHY_MARKERS = ("почему", "зачем", "откуда", "из-за чего")
    # Markers that signal a "what's next" continuation request.
    _NEXT_STEP_MARKERS = ("что дальше", "дальше что", "и что теперь", "продолжай")

    def __init__(
        self,
        normalizer: QueryNormalizer | None = None,
        extractor: AnchorExtractor | None = None,
        mapper: RuEnTermMapper | None = None,
        keyword_hints: KeywordHintBuilder | None = None,
        keyword_hint_sanitizer: KeywordHintSanitizer | None = None,
        carryover: ConversationAnchorBuilder | None = None,
        span_validator: AnchorSpanValidator | None = None,
        sub_intent_detector: SubIntentDetector | None = None,
        negation_detector: NegationDetector | None = None,
    ) -> None:
        self._normalizer = normalizer or QueryNormalizer()
        self._extractor = extractor or AnchorExtractor()
        self._mapper = mapper or RuEnTermMapper()
        self._keyword_hints_builder = keyword_hints or KeywordHintBuilder()
        self._keyword_hint_sanitizer = keyword_hint_sanitizer or KeywordHintSanitizer()
        self._carryover = carryover or ConversationAnchorBuilder()
        self._span_validator = span_validator or AnchorSpanValidator()
        self._sub_intent_detector = sub_intent_detector or SubIntentDetector()
        self._negation_detector = negation_detector or NegationDetector()

    def build(
        self,
        user_query: str,
        conversation_state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str = "START",
        intent: str = "PROJECT_MISC",
    ) -> QueryPlan:
        """Build the full QueryPlan for *user_query* given the dialogue state."""
        raw = user_query or ""
        normalized = self._normalizer.normalize(raw)
        # Fall back to the raw text if normalization emptied a non-blank query.
        if not normalized and raw.strip():
            normalized = raw
        negations = self._negation_detector.detect(normalized)
        # Anchors extracted from the user's own text, span-validated.
        user_anchors = self._span_validator.sanitize(self._extractor.extract(raw), len(raw))
        has_file_path = any(anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in user_anchors)
        sub_intent = self._sub_intent_detector.detect(raw, has_file_path=has_file_path, negations=negations)
        merged_anchors = self._merge_anchors(
            raw,
            user_anchors,
            conversation_state,
            continue_mode,
            conversation_mode=conversation_mode,
            intent=intent,
        )
        # Tests are skipped when negated in the normalized OR raw text.
        skip_tests = "tests" in negations or is_negative_test_request(raw)
        cleaned_anchors = self._remove_negated_test_terms(skip_tests, merged_anchors)
        sub_intent = self._resolve_sub_intent(sub_intent, raw, cleaned_anchors, intent=intent, negations=negations)
        # DOCS_QA always uses the generic EXPLAIN sub-intent.
        if intent == "DOCS_QA":
            sub_intent = "EXPLAIN"
        expansions = self._expansions(normalized, cleaned_anchors, skip_tests=skip_tests)
        keyword_hints = self._keyword_hints(
            raw,
            normalized,
            cleaned_anchors,
            skip_tests=skip_tests,
            intent=intent,
            state=conversation_state,
        )
        return QueryPlan(
            raw=raw,
            normalized=normalized,
            sub_intent=sub_intent,
            negations=sorted(negations),
            expansions=expansions,
            keyword_hints=keyword_hints,
            anchors=cleaned_anchors,
        )

    def _merge_anchors(
        self,
        raw: str,
        anchors: list[QueryAnchor],
        state: ConversationState,
        continue_mode: bool,
        *,
        conversation_mode: str,
        intent: str,
    ) -> list[QueryAnchor]:
        """Merge user anchors with anchors carried over from the conversation."""
        has_user_symbol = any(anchor.type == "SYMBOL" and anchor.source == "user_text" for anchor in anchors)
        has_user_file = any(anchor.type == "FILE_PATH" and anchor.source == "user_text" for anchor in anchors)
        inherited = self._carryover.build(
            raw,
            state,
            continue_mode=continue_mode,
            has_user_symbol=has_user_symbol,
            has_user_file_path=has_user_file,
        )
        # On a topic SWITCH into DOCS_QA with no explicit anchors, reuse the
        # previously active symbol at reduced confidence.
        if (
            conversation_mode == "SWITCH"
            and intent == "DOCS_QA"
            and not has_user_file
            and not has_user_symbol
            and state.active_symbol
        ):
            inherited.append(
                QueryAnchor(
                    type="SYMBOL",
                    value=state.active_symbol,
                    source="conversation_state",
                    span=None,
                    confidence=0.62,
                )
            )
        return self._dedupe(anchors + inherited)

    def _expansions(self, normalized: str, anchors: list[QueryAnchor], *, skip_tests: bool) -> list[str]:
        """Build up to 16 ru->en term expansions plus symbol values."""
        values = self._mapper.expand(normalized)
        has_symbol = any(anchor.type == "SYMBOL" for anchor in anchors)
        # A concrete symbol makes generic "def"/"class" expansions noise.
        if has_symbol:
            values = [value for value in values if value.lower() not in {"def", "class"}]
        if not skip_tests and has_test_focus(normalized):
            for candidate in ("test", "unit test"):
                if candidate not in values:
                    values.append(candidate)
        for anchor in anchors:
            if anchor.type == "SYMBOL" and anchor.value not in values:
                values.append(anchor.value)
        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        return values[:16]

    def _keyword_hints(
        self,
        raw: str,
        normalized: str,
        anchors: list[QueryAnchor],
        *,
        skip_tests: bool,
        intent: str,
        state: ConversationState,
    ) -> list[str]:
        """Build sanitized keyword hints; DOCS_QA falls back to expansions."""
        values = self._keyword_hints_builder.build(normalized)
        for anchor in anchors:
            if anchor.type not in {"FILE_PATH", "SYMBOL"}:
                continue
            candidate = anchor.value
            if candidate not in values:
                values.append(candidate)
        if skip_tests:
            values = [value for value in values if not is_test_related_token(value)]
        sanitized = self._keyword_hint_sanitizer.sanitize(raw, anchors, values)
        if intent == "DOCS_QA" and not sanitized:
            # NOTE(review): fallback path for DOCS_QA with no surviving hints;
            # assumes the active-symbol append and the [:5] cap belong to this
            # branch — confirm against the original indentation.
            fallback = list(dict.fromkeys([*self._expansions(normalized, anchors, skip_tests=skip_tests)]))
            sanitized = fallback[:3]
            if state.active_symbol and state.active_symbol not in sanitized:
                sanitized.append(state.active_symbol)
            sanitized = sanitized[:5]
        return sanitized

    def _remove_negated_test_terms(self, skip_tests: bool, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Drop test-related KEY_TERM/SYMBOL anchors when tests were negated."""
        if not skip_tests:
            return anchors
        result: list[QueryAnchor] = []
        for anchor in anchors:
            if anchor.type not in {"KEY_TERM", "SYMBOL"}:
                result.append(anchor)
                continue
            if is_test_related_token(anchor.value):
                continue
            result.append(anchor)
        return result

    def _dedupe(self, anchors: list[QueryAnchor]) -> list[QueryAnchor]:
        """Remove duplicate anchors by (type, value, subtype, source), keeping order."""
        result: list[QueryAnchor] = []
        seen: set[tuple[str, str, str | None, str]] = set()
        for anchor in anchors:
            key = (anchor.type, anchor.value, anchor.subtype, anchor.source)
            if key in seen:
                continue
            seen.add(key)
            result.append(anchor)
        return result

    def _resolve_sub_intent(
        self,
        candidate: str,
        raw: str,
        anchors: list[QueryAnchor],
        *,
        intent: str,
        negations: set[str],
    ) -> str:
        """Upgrade a CODE_QA EXPLAIN to EXPLAIN_LOCAL when strong local anchors exist."""
        if candidate != "EXPLAIN":
            return candidate
        if intent != "CODE_QA":
            return candidate
        text = " ".join((raw or "").lower().split())
        # Only anchors with confidence >= 0.6 count as strong signals.
        has_symbol = any(anchor.type == "SYMBOL" and anchor.confidence >= 0.6 for anchor in anchors)
        has_file = any(anchor.type == "FILE_PATH" and self._looks_like_file(anchor.value) and anchor.confidence >= 0.6 for anchor in anchors)
        has_user_anchor = any(anchor.source == "user_text" for anchor in anchors)
        is_why = any(marker in text for marker in self._WHY_MARKERS)
        is_next_steps = any(marker in text for marker in self._NEXT_STEP_MARKERS)
        # Short question ending with "?" is treated as a generic follow-up.
        is_short_generic = len(text.split()) <= 4 and text.endswith("?")
        if (is_why and has_file and has_symbol) or ((is_next_steps or is_short_generic) and has_file):
            return "EXPLAIN_LOCAL"
        if "tests" in negations and not has_user_anchor and (has_file or has_symbol):
            return "EXPLAIN_LOCAL"
        return candidate

    def _looks_like_file(self, value: str) -> bool:
        # Heuristic: last path segment contains a dot (extension).
        tail = (value or "").rsplit("/", 1)[-1]
        return "." in tail
|
||||
111
app/modules/rag/intent_router_v2/retrieval_filter_builder.py
Normal file
111
app/modules/rag/intent_router_v2/retrieval_filter_builder.py
Normal file
@@ -0,0 +1,111 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.models import (
|
||||
CodeRetrievalFilters,
|
||||
ConversationState,
|
||||
DocsRetrievalFilters,
|
||||
HybridRetrievalFilters,
|
||||
QueryAnchor,
|
||||
RepoContext,
|
||||
)
|
||||
from app.modules.rag.intent_router_v2.test_signals import has_test_focus, is_negative_test_request, is_test_related_token
|
||||
|
||||
|
||||
class RetrievalFilterBuilder:
    """Builds the domain-appropriate filter object (code / docs / hybrid)."""

    def build(
        self,
        domains: list[str],
        anchors: list[QueryAnchor],
        repo_context: RepoContext,
        *,
        raw_query: str,
        conversation_state: ConversationState | None,
        conversation_mode: str,
        sub_intent: str = "EXPLAIN",
    ) -> CodeRetrievalFilters | DocsRetrievalFilters | HybridRetrievalFilters:
        """Return filters matching *domains*: DOCS-only, CODE-only, or hybrid."""
        path_scope = self._path_scope(
            anchors,
            conversation_state=conversation_state,
            conversation_mode=conversation_mode,
            raw_query=raw_query,
            sub_intent=sub_intent,
        )
        if domains == ["DOCS"]:
            return DocsRetrievalFilters(
                path_scope=path_scope,
                doc_kinds=self._doc_kinds(anchors, raw_query),
                doc_language=[],
            )
        if domains == ["CODE"]:
            return CodeRetrievalFilters(
                test_policy=self._test_policy(raw_query, anchors),
                path_scope=path_scope,
                language=list(repo_context.languages),
            )
        # Any other combination gets the hybrid filter set.
        return HybridRetrievalFilters(
            test_policy=self._test_policy(raw_query, anchors),
            path_scope=path_scope,
            language=list(repo_context.languages),
            doc_kinds=self._doc_kinds(anchors, raw_query),
            doc_language=[],
        )

    def _test_policy(self, raw_query: str, anchors: list[QueryAnchor]) -> str:
        """Decide whether test files are included; explicit negation wins."""
        if is_negative_test_request(raw_query):
            return "EXCLUDE"
        if has_test_focus(raw_query):
            return "INCLUDE"
        has_test_keyterm = any(anchor.type == "KEY_TERM" and is_test_related_token(anchor.value) for anchor in anchors)
        return "INCLUDE" if has_test_keyterm else "EXCLUDE"

    def _path_scope(
        self,
        anchors: list[QueryAnchor],
        *,
        conversation_state: ConversationState | None,
        conversation_mode: str,
        raw_query: str,
        sub_intent: str,
    ) -> list[str]:
        """Derive up to 6 path prefixes from FILE_PATH anchors (plus parents).

        File-focused sub-intents narrow to exact files; CONTINUE turns with no
        anchors inherit the previous scope.
        """
        values: list[str] = []
        has_user_file_anchor = False
        file_values: list[str] = []
        for anchor in anchors:
            if anchor.type != "FILE_PATH":
                continue
            if anchor.source == "user_text":
                has_user_file_anchor = True
            if anchor.value not in values:
                values.append(anchor.value)
            if self._looks_like_file_path(anchor.value) and anchor.value not in file_values:
                file_values.append(anchor.value)
            # Also include the file's parent directory in the broader scope.
            parent = anchor.value.rsplit("/", 1)[0] if "/" in anchor.value and self._looks_like_file_path(anchor.value) else ""
            if parent and parent not in values:
                values.append(parent)
        # File-targeted sub-intents restrict to exact files unless the user
        # explicitly asked about a directory.
        if sub_intent in {"OPEN_FILE", "EXPLAIN_LOCAL"} and file_values and not self._is_explicit_directory_scope(raw_query):
            return file_values[:6]
        if has_user_file_anchor or conversation_mode != "CONTINUE":
            return values[:6]
        if values:
            return values[:6]
        # CONTINUE turn with nothing new: inherit the previous scope.
        inherited = list((conversation_state.active_path_scope if conversation_state else []) or [])
        return inherited[:6]

    def _doc_kinds(self, anchors: list[QueryAnchor], raw_query: str) -> list[str]:
        """Currently only detects README references (text or anchors)."""
        text = (raw_query or "").lower()
        kinds: list[str] = []
        has_readme = "readme" in text or any(
            anchor.type in {"DOC_REF", "FILE_PATH"} and anchor.value.lower().endswith("readme.md")
            for anchor in anchors
        )
        if has_readme:
            kinds.append("README")
        return kinds

    def _looks_like_file_path(self, value: str) -> bool:
        # Heuristic: last segment has an extension dot.
        filename = value.rsplit("/", 1)[-1]
        return "." in filename

    def _is_explicit_directory_scope(self, raw_query: str) -> bool:
        # Russian markers for "in the folder/directory".
        text = (raw_query or "").lower()
        return any(marker in text for marker in ("в папке", "в директории", "в каталоге"))
|
||||
118
app/modules/rag/intent_router_v2/retrieval_spec_factory.py
Normal file
118
app/modules/rag/intent_router_v2/retrieval_spec_factory.py
Normal file
@@ -0,0 +1,118 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.contracts.enums import RagLayer
|
||||
from app.modules.rag.intent_router_v2.layer_query_builder import LayerQueryBuilder
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, QueryAnchor, RepoContext, RetrievalSpec
|
||||
from app.modules.rag.intent_router_v2.retrieval_filter_builder import RetrievalFilterBuilder
|
||||
|
||||
|
||||
class RetrievalSpecFactory:
    """Builds a RetrievalSpec (domains, layer queries, filters, rerank) per intent."""

    # Default (layer, budget) lists per intent.
    _LAYERS = {
        "CODE_QA": [
            (RagLayer.CODE_ENTRYPOINTS, 6),
            (RagLayer.CODE_SYMBOL_CATALOG, 8),
            (RagLayer.CODE_DEPENDENCY_GRAPH, 6),
            (RagLayer.CODE_SOURCE_CHUNKS, 8),
        ],
        "DOCS_QA": [
            (RagLayer.DOCS_MODULE_CATALOG, 5),
            (RagLayer.DOCS_FACT_INDEX, 8),
            (RagLayer.DOCS_SECTION_INDEX, 8),
            (RagLayer.DOCS_POLICY_INDEX, 4),
        ],
        "GENERATE_DOCS_FROM_CODE": [
            (RagLayer.CODE_SYMBOL_CATALOG, 12),
            (RagLayer.CODE_DEPENDENCY_GRAPH, 8),
            (RagLayer.CODE_SOURCE_CHUNKS, 12),
            (RagLayer.CODE_ENTRYPOINTS, 6),
        ],
        "PROJECT_MISC": [
            (RagLayer.DOCS_MODULE_CATALOG, 4),
            (RagLayer.DOCS_SECTION_INDEX, 6),
            (RagLayer.CODE_SYMBOL_CATALOG, 4),
            (RagLayer.CODE_SOURCE_CHUNKS, 4),
        ],
    }
    # Which domains each intent searches.
    _DOMAINS = {
        "CODE_QA": ["CODE"],
        "DOCS_QA": ["DOCS"],
        "GENERATE_DOCS_FROM_CODE": ["CODE"],
        "PROJECT_MISC": ["CODE", "DOCS"],
    }
    # Rerank profile name per intent.
    _RERANK = {
        "CODE_QA": "code",
        "DOCS_QA": "docs",
        "GENERATE_DOCS_FROM_CODE": "generate",
        "PROJECT_MISC": "project",
    }
    # Sub-intent overrides for CODE_QA layer selection.
    _OPEN_FILE_LAYERS = [
        (RagLayer.CODE_SOURCE_CHUNKS, 12),
    ]
    _OPEN_FILE_WITH_SYMBOL_LAYERS = [
        (RagLayer.CODE_SOURCE_CHUNKS, 12),
        (RagLayer.CODE_SYMBOL_CATALOG, 6),
    ]
    _EXPLAIN_LOCAL_LAYERS = [
        (RagLayer.CODE_SOURCE_CHUNKS, 12),
        (RagLayer.CODE_SYMBOL_CATALOG, 8),
        (RagLayer.CODE_DEPENDENCY_GRAPH, 4),
    ]

    def __init__(
        self,
        layer_builder: LayerQueryBuilder | None = None,
        filter_builder: RetrievalFilterBuilder | None = None,
    ) -> None:
        self._layer_builder = layer_builder or LayerQueryBuilder()
        self._filter_builder = filter_builder or RetrievalFilterBuilder()

    def build(
        self,
        intent: str,
        anchors: list[QueryAnchor],
        repo_context: RepoContext,
        *,
        raw_query: str = "",
        conversation_state: ConversationState | None = None,
        conversation_mode: str = "START",
        sub_intent: str = "EXPLAIN",
    ) -> RetrievalSpec:
        """Assemble the full RetrievalSpec for *intent* and its anchors."""
        domains = self._domains(intent, repo_context)
        layers_map = self._with_sub_intent_layers(intent, sub_intent, anchors)
        layer_queries = self._layer_builder.build(intent, repo_context, domains=domains, layers_map=layers_map)
        filters = self._filter_builder.build(
            domains,
            anchors,
            repo_context,
            raw_query=raw_query,
            conversation_state=conversation_state,
            conversation_mode=conversation_mode,
            sub_intent=sub_intent,
        )
        return RetrievalSpec(
            domains=domains,
            layer_queries=layer_queries,
            filters=filters,
            rerank_profile=self._RERANK[intent],
        )

    def _domains(self, intent: str, repo_context: RepoContext) -> list[str]:
        """Intersect the intent's domains with what the repo provides.

        Falls back to the intent defaults when the intersection is empty.
        """
        available = set(repo_context.available_domains or ["CODE", "DOCS"])
        result = [domain for domain in self._DOMAINS[intent] if domain in available]
        return result or list(self._DOMAINS[intent])

    def _with_sub_intent_layers(
        self,
        intent: str,
        sub_intent: str,
        anchors: list[QueryAnchor],
    ) -> dict[str, list[tuple[RagLayer, int]]]:
        """Override CODE_QA layers for file-focused sub-intents; others unchanged."""
        if intent != "CODE_QA":
            return self._LAYERS
        layers_map = dict(self._LAYERS)
        if sub_intent == "OPEN_FILE":
            # A user-provided symbol adds the symbol catalog to the lookup.
            has_symbol = any(anchor.type == "SYMBOL" and anchor.source == "user_text" for anchor in anchors)
            layers_map["CODE_QA"] = list(self._OPEN_FILE_WITH_SYMBOL_LAYERS if has_symbol else self._OPEN_FILE_LAYERS)
        elif sub_intent == "EXPLAIN_LOCAL":
            layers_map["CODE_QA"] = list(self._EXPLAIN_LOCAL_LAYERS)
        return layers_map
|
||||
72
app/modules/rag/intent_router_v2/router.py
Normal file
72
app/modules/rag/intent_router_v2/router.py
Normal file
@@ -0,0 +1,72 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.intent_router_v2.classifier import IntentClassifierV2
|
||||
from app.modules.rag.intent_router_v2.conversation_policy import ConversationPolicy
|
||||
from app.modules.rag.intent_router_v2.evidence_policy_factory import EvidencePolicyFactory
|
||||
from app.modules.rag.intent_router_v2.graph_id_resolver import GraphIdResolver
|
||||
from app.modules.rag.intent_router_v2.logger import IntentRouterLogger
|
||||
from app.modules.rag.intent_router_v2.models import ConversationState, IntentRouterResult, RepoContext
|
||||
from app.modules.rag.intent_router_v2.query_plan_builder import QueryPlanBuilder
|
||||
from app.modules.rag.intent_router_v2.retrieval_spec_factory import RetrievalSpecFactory
|
||||
|
||||
|
||||
class IntentRouterV2:
    """Facade that routes one user turn into a full IntentRouterResult.

    Pipeline: classify -> resolve conversation mode -> build query plan ->
    build retrieval spec and evidence policy -> log and return.
    All collaborators are injectable; defaults are constructed when omitted.
    """

    def __init__(
        self,
        classifier: IntentClassifierV2 | None = None,
        conversation_policy: ConversationPolicy | None = None,
        query_plan_builder: QueryPlanBuilder | None = None,
        retrieval_factory: RetrievalSpecFactory | None = None,
        evidence_factory: EvidencePolicyFactory | None = None,
        graph_resolver: GraphIdResolver | None = None,
        logger: IntentRouterLogger | None = None,
    ) -> None:
        self._classifier = classifier or IntentClassifierV2()
        self._conversation_policy = conversation_policy or ConversationPolicy()
        self._query_plan_builder = query_plan_builder or QueryPlanBuilder()
        self._retrieval_factory = retrieval_factory or RetrievalSpecFactory()
        self._evidence_factory = evidence_factory or EvidencePolicyFactory()
        self._graph_resolver = graph_resolver or GraphIdResolver()
        self._logger = logger or IntentRouterLogger()

    def route(
        self,
        user_query: str,
        conversation_state: ConversationState | None = None,
        repo_context: RepoContext | None = None,
    ) -> IntentRouterResult:
        """Route *user_query* to an intent with a complete execution plan."""
        state = conversation_state or ConversationState()
        context = repo_context or RepoContext()
        self._logger.log_request(user_query, state, context)
        decision = self._classifier.classify(user_query, state)
        # The policy may override the classifier (e.g. CONTINUE/SWITCH handling).
        intent, conversation_mode = self._conversation_policy.resolve(decision, user_query, state)
        query_plan = self._query_plan_builder.build(
            user_query,
            state,
            continue_mode=conversation_mode == "CONTINUE",
            conversation_mode=conversation_mode,
            intent=intent,
        )
        result = IntentRouterResult(
            intent=intent,
            graph_id=self._graph_resolver.resolve(intent),
            conversation_mode=conversation_mode,
            query_plan=query_plan,
            retrieval_spec=self._retrieval_factory.build(
                intent,
                query_plan.anchors,
                context,
                raw_query=query_plan.raw,
                conversation_state=state,
                conversation_mode=conversation_mode,
                sub_intent=query_plan.sub_intent,
            ),
            evidence_policy=self._evidence_factory.build(
                intent,
                sub_intent=query_plan.sub_intent,
                negations=query_plan.negations,
                has_user_anchor=any(anchor.source == "user_text" for anchor in query_plan.anchors),
            ),
        )
        self._logger.log_result(result)
        return result
|
||||
23
app/modules/rag/intent_router_v2/sub_intent_detector.py
Normal file
23
app/modules/rag/intent_router_v2/sub_intent_detector.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class SubIntentDetector:
    """Derives a coarse sub-intent (OPEN_FILE / FIND_TESTS / EXPLAIN) from text."""

    _OPEN_VERBS = ("открой", "посмотри", "проверь", "уточни")
    _EXPLAIN_MARKERS = ("объясни", "как работает", "почему", "что делает", "зачем", "логика", "флоу", "flow")
    _TEST_MARKERS = ("тест", "pytest", "unit test", "юнит")

    def detect(self, raw: str, *, has_file_path: bool, negations: set[str]) -> str:
        """Classify *raw*; EXPLAIN is the catch-all default."""
        compact = " ".join((raw or "").lower().split())
        if not compact:
            return "EXPLAIN"
        # An "open" verb plus a file path means the user wants the file itself,
        # unless the text also asks for an explanation.
        wants_open = has_file_path and self._mentions_open_verb(compact) and not self._mentions_explain(compact)
        if wants_open:
            return "OPEN_FILE"
        tests_requested = "tests" not in negations and any(marker in compact for marker in self._TEST_MARKERS)
        return "FIND_TESTS" if tests_requested else "EXPLAIN"

    def _mentions_open_verb(self, compact: str) -> bool:
        padded = f" {compact} "
        return any(compact.startswith(verb) or f" {verb} " in padded for verb in self._OPEN_VERBS)

    def _mentions_explain(self, compact: str) -> bool:
        return any(marker in compact for marker in self._EXPLAIN_MARKERS)
|
||||
51
app/modules/rag/intent_router_v2/symbol_rules.py
Normal file
51
app/modules/rag/intent_router_v2/symbol_rules.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from __future__ import annotations
|
||||
|
||||
# Python keywords, stored lowercase ("false"/"none"/"true" included) —
# presumably matched against lowercased candidate tokens; confirm at call site.
PY_KEYWORDS: set[str] = {
    "and",
    "as",
    "assert",
    "async",
    "await",
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "false",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "none",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "true",
    "try",
    "while",
    "with",
    "yield",
}

# Generic directory names that should not be treated as meaningful symbols.
COMMON_PATH_SEGMENTS: set[str] = {
    "app",
    "src",
    "docs",
    "tests",
    "module",
    "modules",
    "core",
    "pkg",
    "lib",
}
|
||||
67
app/modules/rag/intent_router_v2/term_mapping.py
Normal file
67
app/modules/rag/intent_router_v2/term_mapping.py
Normal file
@@ -0,0 +1,67 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from app.modules.rag.intent_router_v2.normalization_terms import KeyTermCanonicalizer
|
||||
|
||||
# Matches runs of Latin/Cyrillic letters (plus hyphen), i.e. word-like tokens.
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё-]+")


class RuEnTermMapper:
    """Expands RU/EN query tokens into canonical English retrieval terms."""

    # Russian canonical term -> English expansion terms.
    _CANONICAL_MAP = {
        "класс": ["class"],
        "метод": ["method"],
        "функция": ["function", "def"],
        "модуль": ["module"],
        "пакет": ["package"],
        "файл": ["file"],
        "тест": ["test", "unit test"],
        "документация": ["documentation", "docs"],
        "readme": ["readme"],
    }
    # English token -> English expansion terms.
    _ENGLISH_SOURCES = {
        "class": ["class"],
        "method": ["method"],
        "function": ["function", "def"],
        "module": ["module"],
        "package": ["package"],
        "file": ["file"],
        "test": ["test", "unit test"],
        "tests": ["test", "unit test"],
        "documentation": ["documentation", "docs"],
        "docs": ["documentation", "docs"],
        "readme": ["readme"],
        "def": ["def"],
    }

    def __init__(self, canonicalizer: KeyTermCanonicalizer | None = None) -> None:
        # Fall back to the default canonicalizer when none is supplied.
        self._canonicalizer = canonicalizer or KeyTermCanonicalizer()

    def expand(self, text: str) -> list[str]:
        """Return expansion terms for *text*, de-duplicated, in first-seen order."""
        lowered = (text or "").lower()
        # A dict is used as an insertion-ordered set for de-duplication.
        ordered: dict[str, None] = {}
        for token in _WORD_RE.findall(lowered):
            canonical = self._canonicalizer.canonicalize(token) or token
            for term in self._CANONICAL_MAP.get(canonical, []):
                ordered.setdefault(term, None)
            for term in self._ENGLISH_SOURCES.get(token, []):
                ordered.setdefault(term, None)
        # Multi-word phrase cannot be caught by single-token matching above.
        if "unit test" in lowered or "unit tests" in lowered:
            for term in self._ENGLISH_SOURCES["test"]:
                ordered.setdefault(term, None)
        return list(ordered)

    def key_terms(self) -> tuple[str, ...]:
        """Russian canonical terms the mapper recognizes, in declaration order."""
        return tuple(self._CANONICAL_MAP)

    def all_literal_terms(self) -> tuple[str, ...]:
        """Every literal term known to the mapper: aliases, source keys, and targets."""
        literals: set[str] = set(self._canonicalizer.aliases())
        literals.update(self._CANONICAL_MAP)
        literals.update(self._ENGLISH_SOURCES)
        for mapping in (self._CANONICAL_MAP, self._ENGLISH_SOURCES):
            for targets in mapping.values():
                literals.update(target.lower() for target in targets)
        return tuple(sorted(literals))

    def _extend(self, result: list[str], values: list[str]) -> None:
        """Append *values* to *result*, skipping entries already present."""
        for candidate in values:
            if candidate in result:
                continue
            result.append(candidate)
|
||||
40
app/modules/rag/intent_router_v2/test_signals.py
Normal file
40
app/modules/rag/intent_router_v2/test_signals.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
# A negation word (не/без/кроме) followed within ~28 non-sentence-ending
# characters by "тест" — e.g. "не пиши про тесты".
_NEGATIVE_TEST_RE = re.compile(r"\b(?:не|без|кроме)\b[^.?!]{0,28}\bтест", re.IGNORECASE)
# Literal phrases that explicitly exclude tests from the request.
_NEGATIVE_TEST_MARKERS = ("не про тест", "без тест", "кроме тест", "про прод код", "только прод", "production code")
# Substrings that indicate the request is about tests.
_POSITIVE_TEST_MARKERS = ("тест", "tests", "pytest", "unit test", "unit tests", "тестиру")
# Exact lowercase tokens considered test-related.
_TEST_TERMS = {"тест", "тесты", "test", "tests", "pytest", "unit", "unit test", "юнит-тест", "юниттест"}


def is_negative_test_request(text: str) -> bool:
    """Return True when the request explicitly excludes tests (e.g. "не про тесты")."""
    lowered = (text or "").lower()
    if any(marker in lowered for marker in _NEGATIVE_TEST_MARKERS):
        return True
    return _NEGATIVE_TEST_RE.search(lowered) is not None


def has_test_focus(text: str) -> bool:
    """Return True when the request is about tests and does not negate them."""
    lowered = (text or "").lower()
    mentions_tests = any(marker in lowered for marker in _POSITIVE_TEST_MARKERS)
    return mentions_tests and not is_negative_test_request(lowered)


def is_test_related_token(value: str) -> bool:
    """Return True for tokens that look test-related ("tests", "test_foo", "тесты")."""
    lowered = (value or "").lower().strip()
    if not lowered:
        return False
    if lowered in _TEST_TERMS:
        return True
    # Prefix match covers forms like "test_main", "тестирование".
    return lowered.startswith(("test", "тест"))
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -46,7 +46,6 @@ class RagDocumentRepository:
|
||||
for doc in docs:
|
||||
row = doc.to_record()
|
||||
metadata = row["metadata"]
|
||||
links = row["links"]
|
||||
emb = row["embedding"] or []
|
||||
emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
|
||||
conn.execute(
|
||||
@@ -55,15 +54,15 @@ class RagDocumentRepository:
|
||||
INSERT INTO rag_chunks (
|
||||
rag_session_id, path, chunk_index, content, embedding, artifact_type, section, doc_id,
|
||||
doc_version, owner, system_component, last_modified, staleness_score, created_at, updated_at,
|
||||
rag_doc_id, layer, lang, repo_id, commit_sha, title, metadata_json, links_json, span_start,
|
||||
span_end, symbol_id, qname, kind, framework, entrypoint_type, module_id, section_path, doc_kind
|
||||
layer, lang, repo_id, commit_sha, title, metadata_json, span_start, span_end, symbol_id,
|
||||
qname, kind, framework, entrypoint_type, module_id, section_path, doc_kind
|
||||
)
|
||||
VALUES (
|
||||
:sid, :path, :chunk_index, :content, CAST(:emb AS vector), :artifact_type, :section, :doc_id,
|
||||
:doc_version, :owner, :system_component, :last_modified, :staleness_score, CURRENT_TIMESTAMP,
|
||||
CURRENT_TIMESTAMP, :rag_doc_id, :layer, :lang, :repo_id, :commit_sha, :title, :metadata_json,
|
||||
:links_json, :span_start, :span_end, :symbol_id, :qname, :kind, :framework, :entrypoint_type,
|
||||
:module_id, :section_path, :doc_kind
|
||||
CURRENT_TIMESTAMP, :layer, :lang, :repo_id, :commit_sha, :title, :metadata_json,
|
||||
:span_start, :span_end, :symbol_id, :qname, :kind, :framework, :entrypoint_type, :module_id,
|
||||
:section_path, :doc_kind
|
||||
)
|
||||
"""
|
||||
),
|
||||
@@ -81,14 +80,12 @@ class RagDocumentRepository:
|
||||
"system_component": metadata.get("system_component"),
|
||||
"last_modified": metadata.get("last_modified"),
|
||||
"staleness_score": metadata.get("staleness_score"),
|
||||
"rag_doc_id": row["doc_id"],
|
||||
"layer": row["layer"],
|
||||
"lang": row["lang"],
|
||||
"repo_id": row["repo_id"],
|
||||
"commit_sha": row["commit_sha"],
|
||||
"title": row["title"],
|
||||
"metadata_json": json.dumps(metadata, ensure_ascii=True),
|
||||
"links_json": json.dumps(links, ensure_ascii=True),
|
||||
"span_start": row["span_start"],
|
||||
"span_end": row["span_end"],
|
||||
"symbol_id": metadata.get("symbol_id"),
|
||||
|
||||
@@ -4,11 +4,14 @@ import json
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.modules.rag.retrieval.query_terms import extract_query_terms
|
||||
from app.modules.rag.persistence.retrieval_statement_builder import RetrievalStatementBuilder
|
||||
from app.modules.shared.db import get_engine
|
||||
|
||||
|
||||
class RagQueryRepository:
|
||||
def __init__(self) -> None:
    """Create the repository with its SQL statement builder."""
    # All retrieval SQL is assembled by the builder rather than inline here.
    self._builder = RetrievalStatementBuilder()
|
||||
|
||||
def retrieve(
|
||||
self,
|
||||
rag_session_id: str,
|
||||
@@ -18,89 +21,47 @@ class RagQueryRepository:
|
||||
limit: int = 5,
|
||||
layers: list[str] | None = None,
|
||||
path_prefixes: list[str] | None = None,
|
||||
exclude_path_prefixes: list[str] | None = None,
|
||||
exclude_like_patterns: list[str] | None = None,
|
||||
prefer_non_tests: bool = False,
|
||||
) -> list[dict]:
|
||||
emb = "[" + ",".join(str(x) for x in query_embedding) + "]"
|
||||
filters = ["rag_session_id = :sid"]
|
||||
params: dict = {"sid": rag_session_id, "emb": emb, "lim": limit}
|
||||
if layers:
|
||||
filters.append("layer = ANY(:layers)")
|
||||
params["layers"] = layers
|
||||
if path_prefixes:
|
||||
or_filters = []
|
||||
for idx, prefix in enumerate(path_prefixes):
|
||||
key = f"path_{idx}"
|
||||
params[key] = f"{prefix}%"
|
||||
or_filters.append(f"path LIKE :{key}")
|
||||
filters.append("(" + " OR ".join(or_filters) + ")")
|
||||
term_filters = []
|
||||
terms = extract_query_terms(query_text)
|
||||
for idx, term in enumerate(terms):
|
||||
exact_key = f"term_exact_{idx}"
|
||||
prefix_key = f"term_prefix_{idx}"
|
||||
contains_key = f"term_contains_{idx}"
|
||||
params[exact_key] = term
|
||||
params[prefix_key] = f"{term}%"
|
||||
params[contains_key] = f"%{term}%"
|
||||
term_filters.append(
|
||||
"CASE "
|
||||
f"WHEN lower(COALESCE(qname, '')) = :{exact_key} THEN 0 "
|
||||
f"WHEN lower(COALESCE(symbol_id, '')) = :{exact_key} THEN 1 "
|
||||
f"WHEN lower(COALESCE(title, '')) = :{exact_key} THEN 2 "
|
||||
f"WHEN lower(COALESCE(qname, '')) LIKE :{prefix_key} THEN 3 "
|
||||
f"WHEN lower(COALESCE(title, '')) LIKE :{prefix_key} THEN 4 "
|
||||
f"WHEN lower(COALESCE(path, '')) LIKE :{contains_key} THEN 5 "
|
||||
f"WHEN lower(COALESCE(content, '')) LIKE :{contains_key} THEN 6 "
|
||||
"ELSE 100 END"
|
||||
)
|
||||
lexical_sql = "LEAST(" + ", ".join(term_filters) + ")" if term_filters else "100"
|
||||
test_penalty_sql = (
|
||||
"CASE "
|
||||
"WHEN lower(path) LIKE 'tests/%' OR lower(path) LIKE '%/tests/%' OR lower(path) LIKE 'test_%' OR lower(path) LIKE '%/test_%' "
|
||||
"THEN 1 ELSE 0 END"
|
||||
if prefer_non_tests
|
||||
else "0"
|
||||
sql, params = self._builder.build_retrieve(
|
||||
rag_session_id,
|
||||
query_embedding,
|
||||
query_text=query_text,
|
||||
limit=limit,
|
||||
layers=layers,
|
||||
path_prefixes=path_prefixes,
|
||||
exclude_path_prefixes=exclude_path_prefixes,
|
||||
exclude_like_patterns=exclude_like_patterns,
|
||||
prefer_non_tests=prefer_non_tests,
|
||||
)
|
||||
layer_rank_sql = (
|
||||
"CASE "
|
||||
"WHEN layer = 'C3_ENTRYPOINTS' THEN 0 "
|
||||
"WHEN layer = 'C1_SYMBOL_CATALOG' THEN 1 "
|
||||
"WHEN layer = 'C2_DEPENDENCY_GRAPH' THEN 2 "
|
||||
"WHEN layer = 'C0_SOURCE_CHUNKS' THEN 3 "
|
||||
"WHEN layer = 'D1_MODULE_CATALOG' THEN 0 "
|
||||
"WHEN layer = 'D2_FACT_INDEX' THEN 1 "
|
||||
"WHEN layer = 'D3_SECTION_INDEX' THEN 2 "
|
||||
"WHEN layer = 'D4_POLICY_INDEX' THEN 3 "
|
||||
"ELSE 10 END"
|
||||
)
|
||||
sql = f"""
|
||||
SELECT path, content, layer, title, metadata_json, span_start, span_end,
|
||||
{lexical_sql} AS lexical_rank,
|
||||
{test_penalty_sql} AS test_penalty,
|
||||
{layer_rank_sql} AS layer_rank,
|
||||
(embedding <=> CAST(:emb AS vector)) AS distance
|
||||
FROM rag_chunks
|
||||
WHERE {' AND '.join(filters)}
|
||||
ORDER BY lexical_rank ASC, test_penalty ASC, layer_rank ASC, embedding <=> CAST(:emb AS vector)
|
||||
LIMIT :lim
|
||||
"""
|
||||
with get_engine().connect() as conn:
|
||||
rows = conn.execute(text(sql), params).mappings().fetchall()
|
||||
return [self._row_to_dict(row) for row in rows]
|
||||
|
||||
def fallback_chunks(self, rag_session_id: str, *, limit: int = 5, layers: list[str] | None = None) -> list[dict]:
|
||||
filters = ["rag_session_id = :sid"]
|
||||
params: dict = {"sid": rag_session_id, "lim": limit}
|
||||
if layers:
|
||||
filters.append("layer = ANY(:layers)")
|
||||
params["layers"] = layers
|
||||
sql = f"""
|
||||
SELECT path, content, layer, title, metadata_json, span_start, span_end
|
||||
FROM rag_chunks
|
||||
WHERE {' AND '.join(filters)}
|
||||
ORDER BY id DESC
|
||||
LIMIT :lim
|
||||
"""
|
||||
def retrieve_lexical_code(
    self,
    rag_session_id: str,
    *,
    query_text: str,
    limit: int = 5,
    path_prefixes: list[str] | None = None,
    exclude_path_prefixes: list[str] | None = None,
    exclude_like_patterns: list[str] | None = None,
    prefer_non_tests: bool = False,
) -> list[dict]:
    """Run a lexical code search for the given RAG session.

    SQL construction is delegated to the statement builder; an empty list is
    returned when the builder yields no statement (``sql is None``), i.e. the
    query produced no usable lexical terms.
    """
    statement, bound_params = self._builder.build_lexical_code(
        rag_session_id,
        query_text=query_text,
        limit=limit,
        path_prefixes=path_prefixes,
        exclude_path_prefixes=exclude_path_prefixes,
        exclude_like_patterns=exclude_like_patterns,
        prefer_non_tests=prefer_non_tests,
    )
    if statement is None:
        return []
    with get_engine().connect() as conn:
        records = conn.execute(text(statement), bound_params).mappings().fetchall()
    return [self._row_to_dict(record) for record in records]
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user