Фиксируем состояние
This commit is contained in:
@@ -4,6 +4,7 @@ from dataclasses import dataclass
|
||||
|
||||
from app.modules.rag.code_qa_pipeline.evidence_gate import EvidenceGateDecision
|
||||
from app.modules.rag.intent_router_v2.models import IntentRouterResult
|
||||
from app.modules.agent.code_qa_runtime.short_answer_formatter import CodeQaShortAnswerFormatter
|
||||
|
||||
|
||||
@dataclass(slots=True, frozen=True)
|
||||
@@ -16,6 +17,9 @@ class CodeQaPolicyDecision:
|
||||
|
||||
|
||||
class CodeQaAnswerPolicy:
|
||||
def __init__(self, formatter: CodeQaShortAnswerFormatter | None = None) -> None:
|
||||
self._formatter = formatter or CodeQaShortAnswerFormatter()
|
||||
|
||||
def decide(
|
||||
self,
|
||||
*,
|
||||
@@ -29,34 +33,39 @@ class CodeQaAnswerPolicy:
|
||||
target = path_scope[0] if path_scope else "запрошенный файл"
|
||||
return CodeQaPolicyDecision(
|
||||
answer_mode="not_found",
|
||||
answer=f"Файл {target} не найден.",
|
||||
answer=self._formatter.open_file_not_found(target),
|
||||
should_call_llm=False,
|
||||
branch="open_file_not_found",
|
||||
reason="path_scope_empty",
|
||||
)
|
||||
if sub_intent == "EXPLAIN" and symbol_resolution.status in {"not_found", "ambiguous"}:
|
||||
if sub_intent == "EXPLAIN" and symbol_resolution.status == "not_found":
|
||||
return CodeQaPolicyDecision(
|
||||
answer_mode="degraded",
|
||||
answer=self._symbol_message(symbol_resolution.status, symbol_resolution.alternatives),
|
||||
answer_mode="not_found",
|
||||
answer=self._formatter.entity_not_found(self._target_label(router_result), symbol_resolution.alternatives),
|
||||
should_call_llm=False,
|
||||
branch="explain_unresolved_symbol",
|
||||
reason=f"symbol_resolution_{symbol_resolution.status}",
|
||||
branch="explain_not_found",
|
||||
reason="symbol_resolution_not_found",
|
||||
)
|
||||
if sub_intent == "EXPLAIN" and symbol_resolution.status == "ambiguous":
|
||||
return CodeQaPolicyDecision(
|
||||
answer_mode="ambiguous",
|
||||
answer=self._formatter.entity_ambiguous(self._target_label(router_result), symbol_resolution.alternatives),
|
||||
should_call_llm=False,
|
||||
branch="explain_ambiguous_symbol",
|
||||
reason="symbol_resolution_ambiguous",
|
||||
)
|
||||
if not gate_decision.passed:
|
||||
answer_mode = "insufficient" if "insufficient_evidence" in gate_decision.failure_reasons else "degraded"
|
||||
reason = gate_decision.failure_reasons[0] if gate_decision.failure_reasons else "evidence_gate_failed"
|
||||
return CodeQaPolicyDecision(
|
||||
answer_mode=answer_mode,
|
||||
answer=gate_decision.degraded_message,
|
||||
answer=self._formatter.insufficient(gate_decision.degraded_message),
|
||||
should_call_llm=False,
|
||||
branch="evidence_gate_short_circuit",
|
||||
reason=reason,
|
||||
)
|
||||
return CodeQaPolicyDecision(answer_mode="normal", branch="normal_answer", reason="evidence_sufficient")
|
||||
|
||||
def _symbol_message(self, status: str, alternatives: list[str]) -> str:
|
||||
if status == "ambiguous" and alternatives:
|
||||
return f"Сущность не удалось однозначно разрешить. Близкие варианты: {', '.join(alternatives[:3])}."
|
||||
if alternatives:
|
||||
return f"Сущность не найдена в доступном коде. Ближайшие варианты: {', '.join(alternatives[:3])}."
|
||||
return "Сущность не найдена в доступном коде."
|
||||
def _target_label(self, router_result: IntentRouterResult) -> str:
|
||||
candidates = [item.strip() for item in list(router_result.query_plan.symbol_candidates or []) if item and item.strip()]
|
||||
return candidates[0] if candidates else "запрошенная сущность"
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from difflib import SequenceMatcher
|
||||
from time import perf_counter
|
||||
|
||||
from app.modules.agent.code_qa_runtime.answer_policy import CodeQaAnswerPolicy
|
||||
@@ -50,6 +51,9 @@ class CodeQaRuntimeExecutor:
|
||||
def execute(self, *, user_query: str, rag_session_id: str, files_map: dict[str, dict] | None = None) -> CodeQaFinalResult:
|
||||
timings_ms: dict[str, int] = {}
|
||||
runtime_trace: list[dict] = []
|
||||
answer_policy_branch = ""
|
||||
decision_reason = ""
|
||||
post_gate_snapshot: dict = {}
|
||||
state = CodeQaExecutionState(
|
||||
user_query=user_query,
|
||||
rag_session_id=rag_session_id,
|
||||
@@ -76,9 +80,10 @@ class CodeQaRuntimeExecutor:
|
||||
started = perf_counter()
|
||||
raw_rows = self._retrieve(state)
|
||||
timings_ms["retrieval"] = self._elapsed_ms(started)
|
||||
retrieval_report = self._retrieval.consume_retrieval_report() or {}
|
||||
raw_rows, retrieval_report = self._hydrate_entrypoint_sources(state, raw_rows, retrieval_report)
|
||||
symbol_resolution = self._resolve_symbol(state.router_result.symbol_resolution.model_dump(), raw_rows)
|
||||
state.router_result = state.router_result.model_copy(update={"symbol_resolution": SymbolResolution(**symbol_resolution)})
|
||||
retrieval_report = self._retrieval.consume_retrieval_report()
|
||||
state.retrieval_result = build_retrieval_result(raw_rows, retrieval_report, symbol_resolution)
|
||||
if state.retrieval_request.sub_intent.upper() == "EXPLAIN" and symbol_resolution.get("status") in {"not_found", "ambiguous"}:
|
||||
state.retrieval_result = build_retrieval_result([], retrieval_report, symbol_resolution)
|
||||
@@ -101,8 +106,10 @@ class CodeQaRuntimeExecutor:
|
||||
if state.retrieval_request.sub_intent.upper() == "EXPLAIN" and symbol_resolution.get("status") in {"not_found", "ambiguous"}:
|
||||
state.evidence_pack.sufficient = False
|
||||
state.evidence_pack.failure_reasons = ["target_not_resolved"]
|
||||
pre_gate_input = self._build_pre_gate_input(state)
|
||||
started = perf_counter()
|
||||
gate_decision = evaluate_evidence(state.evidence_pack)
|
||||
timings_ms["pre_evidence_gate"] = 0
|
||||
timings_ms["pre_evidence_gate"] = self._elapsed_ms(started)
|
||||
state.answer_mode = "normal" if gate_decision.passed else "degraded"
|
||||
state.degraded_message = gate_decision.degraded_message
|
||||
runtime_trace.append(
|
||||
@@ -110,31 +117,51 @@ class CodeQaRuntimeExecutor:
|
||||
"step": "pre_evidence_gate",
|
||||
"status": "passed" if gate_decision.passed else "blocked",
|
||||
"timings_ms": {"pre_evidence_gate": timings_ms["pre_evidence_gate"]},
|
||||
"input": pre_gate_input,
|
||||
"output": {
|
||||
"passed": gate_decision.passed,
|
||||
"failure_reasons": list(gate_decision.failure_reasons),
|
||||
"degraded_message": gate_decision.degraded_message,
|
||||
"evidence_count": state.evidence_pack.evidence_count,
|
||||
},
|
||||
}
|
||||
)
|
||||
decision = self._answer_policy.decide(router_result=state.router_result, gate_decision=gate_decision)
|
||||
answer_policy_branch = decision.branch
|
||||
decision_reason = decision.reason
|
||||
if not decision.should_call_llm:
|
||||
state.answer_mode = decision.answer_mode
|
||||
started = perf_counter()
|
||||
runtime_trace.append(
|
||||
{
|
||||
"step": "llm",
|
||||
"status": "skipped",
|
||||
"timings_ms": {"llm": self._elapsed_ms(started)},
|
||||
"output": {
|
||||
"reason": "policy_short_circuit",
|
||||
"answer_mode": decision.answer_mode,
|
||||
"decision_reason": decision.reason,
|
||||
"answer_policy_branch": decision.branch,
|
||||
},
|
||||
}
|
||||
)
|
||||
started = perf_counter()
|
||||
timings_ms["post_evidence_gate"] = self._elapsed_ms(started)
|
||||
post_gate_snapshot = {
|
||||
"input": {
|
||||
"answer_mode": decision.answer_mode,
|
||||
"draft_present": False,
|
||||
"resolved_target": self._resolved_target(state),
|
||||
},
|
||||
"output": {"reason": "no_draft_answer"},
|
||||
}
|
||||
runtime_trace.append(
|
||||
{
|
||||
"step": "post_evidence_gate",
|
||||
"status": "skipped",
|
||||
"output": {"reason": "no_draft_answer"},
|
||||
"timings_ms": {"post_evidence_gate": timings_ms["post_evidence_gate"]},
|
||||
"input": post_gate_snapshot["input"],
|
||||
"output": post_gate_snapshot["output"],
|
||||
}
|
||||
)
|
||||
return self._finalize(
|
||||
@@ -145,13 +172,41 @@ class CodeQaRuntimeExecutor:
|
||||
llm_used=False,
|
||||
timings_ms=timings_ms,
|
||||
runtime_trace=runtime_trace,
|
||||
answer_policy_branch=answer_policy_branch,
|
||||
decision_reason=decision_reason,
|
||||
pre_gate_input=pre_gate_input,
|
||||
gate_decision=gate_decision,
|
||||
post_gate_snapshot=post_gate_snapshot,
|
||||
)
|
||||
if self._llm is None:
|
||||
answer_policy_branch = "llm_unavailable"
|
||||
decision_reason = "llm_service_missing"
|
||||
started = perf_counter()
|
||||
runtime_trace.append(
|
||||
{
|
||||
"step": "llm",
|
||||
"status": "skipped",
|
||||
"output": {"reason": "llm_unavailable"},
|
||||
"timings_ms": {"llm": self._elapsed_ms(started)},
|
||||
"output": {"reason": "llm_unavailable", "answer_policy_branch": answer_policy_branch, "decision_reason": decision_reason},
|
||||
}
|
||||
)
|
||||
started = perf_counter()
|
||||
timings_ms["post_evidence_gate"] = self._elapsed_ms(started)
|
||||
post_gate_snapshot = {
|
||||
"input": {
|
||||
"answer_mode": state.answer_mode,
|
||||
"draft_present": False,
|
||||
"resolved_target": self._resolved_target(state),
|
||||
},
|
||||
"output": {"reason": "no_draft_answer"},
|
||||
}
|
||||
runtime_trace.append(
|
||||
{
|
||||
"step": "post_evidence_gate",
|
||||
"status": "skipped",
|
||||
"timings_ms": {"post_evidence_gate": timings_ms["post_evidence_gate"]},
|
||||
"input": post_gate_snapshot["input"],
|
||||
"output": post_gate_snapshot["output"],
|
||||
}
|
||||
)
|
||||
return self._finalize(
|
||||
@@ -162,6 +217,11 @@ class CodeQaRuntimeExecutor:
|
||||
llm_used=False,
|
||||
timings_ms=timings_ms,
|
||||
runtime_trace=runtime_trace,
|
||||
answer_policy_branch=answer_policy_branch,
|
||||
decision_reason=decision_reason,
|
||||
pre_gate_input=pre_gate_input,
|
||||
gate_decision=gate_decision,
|
||||
post_gate_snapshot=post_gate_snapshot,
|
||||
)
|
||||
state.synthesis_input = build_answer_synthesis_input(user_query, state.evidence_pack)
|
||||
prompt_name = self._prompt_selector.select(sub_intent=state.retrieval_request.sub_intent, answer_mode=state.answer_mode)
|
||||
@@ -186,10 +246,29 @@ class CodeQaRuntimeExecutor:
|
||||
"output": {
|
||||
"prompt_name": prompt_name,
|
||||
"answer_preview": draft.answer[:300],
|
||||
"resolved_target": self._resolved_target(state),
|
||||
"answer_policy_branch": answer_policy_branch,
|
||||
"decision_reason": decision_reason,
|
||||
},
|
||||
}
|
||||
)
|
||||
validation = self._post_gate.validate(answer=draft.answer, answer_mode=state.answer_mode, degraded_message=state.degraded_message)
|
||||
post_gate_input = {
|
||||
"answer_mode": state.answer_mode,
|
||||
"degraded_message": state.degraded_message,
|
||||
"resolved_target": self._resolved_target(state),
|
||||
"draft_answer_preview": draft.answer[:300],
|
||||
"repair_candidate": bool(self._repair is not None),
|
||||
}
|
||||
started = perf_counter()
|
||||
validation = self._post_gate.validate(
|
||||
answer=draft.answer,
|
||||
answer_mode=state.answer_mode,
|
||||
degraded_message=state.degraded_message,
|
||||
sub_intent=state.retrieval_request.sub_intent,
|
||||
user_query=user_query,
|
||||
evidence_pack=state.evidence_pack,
|
||||
)
|
||||
timings_ms["post_evidence_gate"] = self._elapsed_ms(started)
|
||||
final_answer = draft.answer
|
||||
repair_used = False
|
||||
if not validation.passed and self._repair is not None:
|
||||
@@ -197,22 +276,40 @@ class CodeQaRuntimeExecutor:
|
||||
final_answer = self._repair.repair(draft_answer=draft.answer, validation=validation, prompt_payload=prompt_payload)
|
||||
repair_used = True
|
||||
timings_ms["repair"] = self._elapsed_ms(started)
|
||||
validation = self._post_gate.validate(answer=final_answer, answer_mode=state.answer_mode, degraded_message=state.degraded_message)
|
||||
if not validation.passed and state.degraded_message:
|
||||
final_answer = state.degraded_message
|
||||
started = perf_counter()
|
||||
validation = self._post_gate.validate(
|
||||
answer=final_answer,
|
||||
answer_mode=state.answer_mode,
|
||||
degraded_message=state.degraded_message,
|
||||
sub_intent=state.retrieval_request.sub_intent,
|
||||
user_query=user_query,
|
||||
evidence_pack=state.evidence_pack,
|
||||
)
|
||||
timings_ms["post_evidence_gate_recheck"] = self._elapsed_ms(started)
|
||||
if not validation.passed:
|
||||
final_answer = self._fallback_answer(state)
|
||||
state.answer_mode = self._fallback_mode(state)
|
||||
post_gate_snapshot = {
|
||||
"input": post_gate_input,
|
||||
"output": {
|
||||
"passed": validation.passed,
|
||||
"action": validation.action,
|
||||
"reasons": list(validation.reasons),
|
||||
"repair_used": repair_used,
|
||||
"final_answer_preview": final_answer[:300],
|
||||
},
|
||||
}
|
||||
runtime_trace.append(
|
||||
{
|
||||
"step": "post_evidence_gate",
|
||||
"status": "passed" if validation.passed else "failed",
|
||||
"timings_ms": {
|
||||
"post_evidence_gate": 0,
|
||||
"post_evidence_gate": timings_ms["post_evidence_gate"],
|
||||
"post_evidence_gate_recheck": timings_ms.get("post_evidence_gate_recheck", 0),
|
||||
"repair": timings_ms.get("repair", 0),
|
||||
},
|
||||
"output": {
|
||||
"passed": validation.passed,
|
||||
"reasons": list(validation.reasons),
|
||||
"repair_used": repair_used,
|
||||
},
|
||||
"input": post_gate_snapshot["input"],
|
||||
"output": post_gate_snapshot["output"],
|
||||
}
|
||||
)
|
||||
return self._finalize(
|
||||
@@ -224,6 +321,11 @@ class CodeQaRuntimeExecutor:
|
||||
validation=validation,
|
||||
timings_ms=timings_ms,
|
||||
runtime_trace=runtime_trace,
|
||||
answer_policy_branch=answer_policy_branch,
|
||||
decision_reason=decision_reason,
|
||||
pre_gate_input=pre_gate_input,
|
||||
gate_decision=gate_decision,
|
||||
post_gate_snapshot=post_gate_snapshot,
|
||||
)
|
||||
|
||||
def _retrieve(self, state: CodeQaExecutionState) -> list[dict]:
|
||||
@@ -257,9 +359,10 @@ class CodeQaRuntimeExecutor:
|
||||
exact = next((item for item in found if item in candidates), None)
|
||||
if exact:
|
||||
return {"status": "resolved", "resolved_symbol": exact, "alternatives": found[:5], "confidence": 0.99}
|
||||
if found:
|
||||
return {"status": "ambiguous", "resolved_symbol": None, "alternatives": found[:5], "confidence": 0.55}
|
||||
return {"status": "not_found", "resolved_symbol": None, "alternatives": [], "confidence": 0.0}
|
||||
close = self._close_matches(candidates, found)
|
||||
if close:
|
||||
return {"status": "ambiguous", "resolved_symbol": None, "alternatives": close[:5], "confidence": 0.55}
|
||||
return {"status": "not_found", "resolved_symbol": None, "alternatives": close[:5], "confidence": 0.0}
|
||||
|
||||
def _finalize(
|
||||
self,
|
||||
@@ -272,6 +375,11 @@ class CodeQaRuntimeExecutor:
|
||||
validation=None,
|
||||
timings_ms: dict[str, int] | None = None,
|
||||
runtime_trace: list[dict] | None = None,
|
||||
answer_policy_branch: str = "",
|
||||
decision_reason: str = "",
|
||||
pre_gate_input: dict | None = None,
|
||||
gate_decision=None,
|
||||
post_gate_snapshot: dict | None = None,
|
||||
) -> CodeQaFinalResult:
|
||||
diagnostics = build_diagnostics_report(
|
||||
router_result=state.router_result,
|
||||
@@ -280,6 +388,11 @@ class CodeQaRuntimeExecutor:
|
||||
evidence_bundle=state.evidence_pack,
|
||||
answer_mode=state.answer_mode,
|
||||
timings_ms=timings_ms or {},
|
||||
resolved_target=self._resolved_target(state),
|
||||
answer_policy_branch=answer_policy_branch,
|
||||
decision_reason=decision_reason,
|
||||
evidence_gate_input=pre_gate_input or {},
|
||||
post_evidence_gate=post_gate_snapshot or {},
|
||||
)
|
||||
result = CodeQaFinalResult(
|
||||
final_answer=final_answer.strip(),
|
||||
@@ -287,7 +400,15 @@ class CodeQaRuntimeExecutor:
|
||||
repair_used=repair_used,
|
||||
llm_used=llm_used,
|
||||
draft_answer=draft,
|
||||
validation=validation or self._post_gate.validate(answer=final_answer, answer_mode=state.answer_mode, degraded_message=state.degraded_message),
|
||||
validation=validation
|
||||
or self._post_gate.validate(
|
||||
answer=final_answer,
|
||||
answer_mode=state.answer_mode,
|
||||
degraded_message=state.degraded_message,
|
||||
sub_intent=state.retrieval_request.sub_intent if state.retrieval_request else "",
|
||||
user_query=state.user_query,
|
||||
evidence_pack=state.evidence_pack,
|
||||
),
|
||||
router_result=state.router_result,
|
||||
retrieval_request=state.retrieval_request,
|
||||
retrieval_result=state.retrieval_result,
|
||||
@@ -306,4 +427,119 @@ class CodeQaRuntimeExecutor:
|
||||
return result
|
||||
|
||||
def _elapsed_ms(self, started: float) -> int:
|
||||
return int((perf_counter() - started) * 1000)
|
||||
return max(1, round((perf_counter() - started) * 1000))
|
||||
|
||||
def _build_pre_gate_input(self, state: CodeQaExecutionState) -> dict:
|
||||
evidence = state.evidence_pack
|
||||
retrieval = state.retrieval_result
|
||||
return {
|
||||
"resolved_target": self._resolved_target(state),
|
||||
"sub_intent": state.retrieval_request.sub_intent if state.retrieval_request else None,
|
||||
"target_type": evidence.target_type if evidence else None,
|
||||
"evidence_count": evidence.evidence_count if evidence else 0,
|
||||
"code_chunk_count": len(evidence.code_chunks) if evidence else 0,
|
||||
"entrypoint_count": len(evidence.entrypoints) if evidence else 0,
|
||||
"relation_count": len(evidence.relations) if evidence else 0,
|
||||
"test_evidence_count": len(evidence.test_evidence) if evidence else 0,
|
||||
"symbol_resolution_status": retrieval.symbol_resolution_status if retrieval else None,
|
||||
"path_scope": list(state.retrieval_request.path_scope) if state.retrieval_request else [],
|
||||
}
|
||||
|
||||
def _resolved_target(self, state: CodeQaExecutionState) -> str | None:
|
||||
if state.evidence_pack and state.evidence_pack.resolved_target:
|
||||
return state.evidence_pack.resolved_target
|
||||
if state.retrieval_result and state.retrieval_result.resolved_symbol:
|
||||
return state.retrieval_result.resolved_symbol
|
||||
if state.retrieval_request and state.retrieval_request.path_scope:
|
||||
return state.retrieval_request.path_scope[0]
|
||||
return None
|
||||
|
||||
def _close_matches(self, candidates: list[str], found: list[str]) -> list[str]:
|
||||
ranked: list[tuple[float, str]] = []
|
||||
for candidate in candidates:
|
||||
for item in found:
|
||||
score = SequenceMatcher(None, candidate.lower(), item.lower()).ratio()
|
||||
if score >= 0.52:
|
||||
ranked.append((score, item))
|
||||
ranked.sort(key=lambda pair: (-pair[0], pair[1]))
|
||||
result: list[str] = []
|
||||
for _, item in ranked:
|
||||
if item not in result:
|
||||
result.append(item)
|
||||
return result
|
||||
|
||||
def _hydrate_entrypoint_sources(
|
||||
self,
|
||||
state: CodeQaExecutionState,
|
||||
raw_rows: list[dict],
|
||||
retrieval_report: dict,
|
||||
) -> tuple[list[dict], dict]:
|
||||
if not state.retrieval_request or state.retrieval_request.sub_intent.upper() != "FIND_ENTRYPOINTS":
|
||||
return raw_rows, retrieval_report
|
||||
entrypoint_paths = []
|
||||
for row in raw_rows:
|
||||
if str(row.get("layer") or "") != "C3_ENTRYPOINTS":
|
||||
continue
|
||||
path = str(row.get("path") or "").strip()
|
||||
if path and path not in entrypoint_paths:
|
||||
entrypoint_paths.append(path)
|
||||
if not entrypoint_paths:
|
||||
return raw_rows, retrieval_report
|
||||
extra_rows = self._retrieval.retrieve_exact_files(
|
||||
state.rag_session_id,
|
||||
paths=entrypoint_paths[:3],
|
||||
layers=["C0_SOURCE_CHUNKS"],
|
||||
limit=24,
|
||||
query=state.user_query,
|
||||
ranking_profile="entrypoint_source_hydration",
|
||||
)
|
||||
extra_report = self._retrieval.consume_retrieval_report() or {}
|
||||
return self._merge_rows(raw_rows, extra_rows), self._merge_reports(retrieval_report, extra_report)
|
||||
|
||||
def _merge_rows(self, base_rows: list[dict], extra_rows: list[dict]) -> list[dict]:
|
||||
merged: list[dict] = []
|
||||
seen: set[tuple[str, str, str, int | None, int | None]] = set()
|
||||
for row in [*base_rows, *extra_rows]:
|
||||
key = (
|
||||
str(row.get("layer") or ""),
|
||||
str(row.get("path") or ""),
|
||||
str(row.get("title") or ""),
|
||||
row.get("span_start"),
|
||||
row.get("span_end"),
|
||||
)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
merged.append(row)
|
||||
return merged
|
||||
|
||||
def _merge_reports(self, base: dict, extra: dict) -> dict:
|
||||
merged = dict(base or {})
|
||||
merged["executed_layers"] = list(dict.fromkeys([*(base.get("executed_layers") or []), *(extra.get("executed_layers") or [])]))
|
||||
merged["retrieval_mode_by_layer"] = {**dict(base.get("retrieval_mode_by_layer") or {}), **dict(extra.get("retrieval_mode_by_layer") or {})}
|
||||
merged["top_k_by_layer"] = {**dict(base.get("top_k_by_layer") or {}), **dict(extra.get("top_k_by_layer") or {})}
|
||||
merged["filters_by_layer"] = {**dict(base.get("filters_by_layer") or {}), **dict(extra.get("filters_by_layer") or {})}
|
||||
merged["retrieval_by_layer_ms"] = {**dict(base.get("retrieval_by_layer_ms") or {}), **dict(extra.get("retrieval_by_layer_ms") or {})}
|
||||
merged["fallback"] = dict(extra.get("fallback") or base.get("fallback") or {"used": False, "reason": None})
|
||||
merged["supplemental_requests"] = [*(base.get("supplemental_requests") or []), *(extra.get("requests") or [])]
|
||||
return merged
|
||||
|
||||
def _fallback_mode(self, state: CodeQaExecutionState) -> str:
|
||||
status = str(state.router_result.symbol_resolution.status if state.router_result and state.router_result.symbol_resolution else "")
|
||||
if status == "ambiguous":
|
||||
return "ambiguous"
|
||||
if status == "not_found":
|
||||
return "not_found"
|
||||
return "degraded"
|
||||
|
||||
def _fallback_answer(self, state: CodeQaExecutionState) -> str:
|
||||
symbol_resolution = state.router_result.symbol_resolution if state.router_result else None
|
||||
query_plan = state.router_result.query_plan if state.router_result else None
|
||||
target = next((item for item in list(query_plan.symbol_candidates or []) if item), "запрошенная сущность") if query_plan else "запрошенная сущность"
|
||||
if symbol_resolution and symbol_resolution.status == "ambiguous":
|
||||
return self._answer_policy.decide(router_result=state.router_result, gate_decision=evaluate_evidence(state.evidence_pack)).answer
|
||||
if symbol_resolution and symbol_resolution.status == "not_found":
|
||||
return self._answer_policy.decide(router_result=state.router_result, gate_decision=evaluate_evidence(state.evidence_pack)).answer
|
||||
if state.degraded_message:
|
||||
return state.degraded_message
|
||||
return f"Недостаточно подтверждённых данных для уверенного ответа по {target}."
|
||||
|
||||
@@ -1,6 +1,25 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from app.modules.agent.code_qa_runtime.models import CodeQaValidationResult
|
||||
from app.modules.rag.code_qa_pipeline.answer_fact_curator import build_curated_answer_facts
|
||||
from app.modules.rag.code_qa_pipeline.contracts import EvidenceBundle
|
||||
|
||||
_TOKEN_RE = re.compile(r"[a-zA-Zа-яА-Я0-9_/]+")
|
||||
_VAGUE_PHRASES = (
|
||||
"ряд аргументов",
|
||||
"имеет responsibilities",
|
||||
"имеет responsibility",
|
||||
"ключевой компонент",
|
||||
"центральный компонент",
|
||||
"играет роль",
|
||||
"управляет системой",
|
||||
"этап пайплайна",
|
||||
"инициализация сервисов",
|
||||
"регистрация основных служб",
|
||||
)
|
||||
_OPTIMISTIC_TRACE_CLAIMS = ("полностью восстанавливается", "полный поток выполнения", "полностью прослеживается")
|
||||
|
||||
|
||||
class CodeQaPostEvidenceGate:
|
||||
@@ -10,6 +29,9 @@ class CodeQaPostEvidenceGate:
|
||||
answer: str,
|
||||
answer_mode: str,
|
||||
degraded_message: str,
|
||||
sub_intent: str,
|
||||
user_query: str,
|
||||
evidence_pack: EvidenceBundle | None,
|
||||
) -> CodeQaValidationResult:
|
||||
normalized = (answer or "").strip()
|
||||
if not normalized:
|
||||
@@ -18,6 +40,189 @@ class CodeQaPostEvidenceGate:
|
||||
return CodeQaValidationResult(passed=False, action="repair", reasons=["degraded_answer_missing_guardrail"])
|
||||
if answer_mode == "not_found" and "не найден" not in normalized.lower():
|
||||
return CodeQaValidationResult(passed=False, action="repair", reasons=["not_found_answer_missing_phrase"])
|
||||
if answer_mode == "ambiguous" and "не удалось однозначно разрешить" not in normalized.lower():
|
||||
return CodeQaValidationResult(passed=False, action="repair", reasons=["ambiguous_answer_missing_phrase"])
|
||||
if degraded_message and answer_mode != "normal" and len(normalized) < 24:
|
||||
return CodeQaValidationResult(passed=False, action="repair", reasons=["answer_too_short"])
|
||||
if answer_mode != "normal" or evidence_pack is None:
|
||||
return CodeQaValidationResult(passed=True, action="return")
|
||||
|
||||
reasons = self._normal_answer_reasons(normalized.lower(), sub_intent.upper(), user_query, evidence_pack)
|
||||
if reasons:
|
||||
return CodeQaValidationResult(passed=False, action="repair", reasons=_dedupe(reasons))
|
||||
return CodeQaValidationResult(passed=True, action="return")
|
||||
|
||||
def _normal_answer_reasons(self, answer: str, sub_intent: str, user_query: str, evidence_pack: EvidenceBundle) -> list[str]:
|
||||
reasons: list[str] = []
|
||||
if sub_intent == "FIND_ENTRYPOINTS":
|
||||
reasons.extend(self._validate_entrypoints(answer, user_query, evidence_pack))
|
||||
elif sub_intent == "EXPLAIN":
|
||||
reasons.extend(self._validate_explain(answer, evidence_pack))
|
||||
elif sub_intent == "ARCHITECTURE":
|
||||
reasons.extend(self._validate_architecture(answer, evidence_pack))
|
||||
elif sub_intent == "TRACE_FLOW":
|
||||
reasons.extend(self._validate_trace_flow(answer, evidence_pack))
|
||||
return reasons
|
||||
|
||||
def _validate_entrypoints(self, answer: str, user_query: str, evidence_pack: EvidenceBundle) -> list[str]:
|
||||
confirmed = [item for item in list(evidence_pack.entrypoints or []) if item.get("http_method") and item.get("route_path")]
|
||||
if not confirmed:
|
||||
return []
|
||||
query_tokens = self._tokens(user_query)
|
||||
matched = [item for item in confirmed if self._tokens(str(item.get("route_path") or "")) & query_tokens]
|
||||
required = matched or confirmed[:1]
|
||||
reasons = []
|
||||
if any(f"{item['http_method']} {item['route_path']}".lower() not in answer for item in required):
|
||||
reasons.append("missing_confirmed_route")
|
||||
if matched and any(token in query_tokens for token in {"health", "/health"}) and "не обнаруж" in answer:
|
||||
reasons.append("contradicts_confirmed_route")
|
||||
return reasons
|
||||
|
||||
def _validate_explain(self, answer: str, evidence_pack: EvidenceBundle) -> list[str]:
    """Check that an EXPLAIN answer is anchored in concrete code facts.

    Starts from the target-focus and vagueness checks, then compares the
    answer with the curated ``explain`` facts. A required group of methods,
    calls or dependencies that the answer does not mention yields a
    ``missing_concrete_*`` reason. If facts were required but none of the
    groups (fields included) is mentioned, ``too_vague_for_explain`` is
    added, and semantic role labels used without any such support yield
    ``semantic_labels_without_code_edges``.
    """
    facts = build_curated_answer_facts(evidence_pack)
    explain_facts = dict(facts.get("explain") or {})
    problems = self._validate_target_focus(answer, evidence_pack)
    problems.extend(self._vagueness_reasons(answer, "explain"))

    # (facts key, reason reported when the group is required but absent).
    # Fields only ever count as support; their absence is not reported.
    fact_groups = (
        ("required_methods", "missing_concrete_methods"),
        ("required_calls", "missing_concrete_calls"),
        ("required_dependencies", "missing_concrete_dependencies"),
        ("required_fields", None),
    )
    supported_groups = 0
    anything_required = False
    for facts_key, missing_reason in fact_groups:
        expected = list(explain_facts.get(facts_key) or [])
        if not expected:
            continue
        anything_required = True
        if self._mentions_fact_group(answer, expected):
            supported_groups += 1
        elif missing_reason is not None:
            problems.append(missing_reason)

    if anything_required and supported_groups == 0:
        problems.append("too_vague_for_explain")
    if self._semantic_leakage(answer, facts, has_concrete_support=supported_groups > 0):
        problems.append("semantic_labels_without_code_edges")
    return problems
|
||||
|
||||
def _validate_architecture(self, answer: str, evidence_pack: EvidenceBundle) -> list[str]:
    """Check that an ARCHITECTURE answer names real components and relations.

    Starts from the target-focus and vagueness checks, then compares the
    answer with the curated ``architecture`` facts: required components,
    relations and relation verbs must be mentioned, forbidden labels must
    not appear, and method-like names must not stand in for components.
    Missing relations or verbs also yield ``too_vague_for_architecture``;
    semantic role labels used without a mentioned relation yield
    ``semantic_labels_without_code_edges``.
    """
    facts = build_curated_answer_facts(evidence_pack)
    arch_facts = dict(facts.get("architecture") or {})
    problems = self._validate_target_focus(answer, evidence_pack)
    problems.extend(self._vagueness_reasons(answer, "architecture"))

    components = list(arch_facts.get("required_components") or [])
    relations = list(arch_facts.get("required_relations") or [])
    verbs = list(arch_facts.get("required_relation_verbs") or [])
    # Both lookups depend only on their arguments, so they are evaluated once.
    relations_named = self._mentions_relations(answer, relations)
    verbs_named = self._mentions_fact_group(answer, verbs)

    if components and not self._mentions_fact_group(answer, components):
        problems.append("missing_concrete_components")
    if relations and not relations_named:
        problems.append("missing_concrete_relations")
    if verbs and not verbs_named:
        problems.append("missing_relation_verbs")
    for label in arch_facts.get("forbidden_labels") or []:
        if label in answer:
            problems.append("contains_retrieval_artifacts")
            break
    if self._methods_dominate_components(answer, components):
        problems.append("methods_as_primary_components")
    if relations and not (relations_named and verbs_named):
        problems.append("too_vague_for_architecture")
    if self._semantic_leakage(answer, facts, has_concrete_support=relations_named):
        problems.append("semantic_labels_without_code_edges")
    return problems
|
||||
|
||||
def _validate_trace_flow(self, answer: str, evidence_pack: EvidenceBundle) -> list[str]:
    """Check that a TRACE_FLOW answer follows the evidenced call sequence.

    Starts from the target-focus and vagueness checks, then compares the
    answer with the curated ``trace_flow`` facts: required steps must be
    presented as a sequence and connected by mentioned edges, required calls
    must be named, and the answer must not claim the trace is complete.
    When steps are required and either check fails,
    ``too_vague_for_trace_flow`` is added as well.
    """
    facts = build_curated_answer_facts(evidence_pack)
    trace_facts = dict(facts.get("trace_flow") or {})
    problems = self._validate_target_focus(answer, evidence_pack)
    problems.extend(self._vagueness_reasons(answer, "trace_flow"))

    steps = list(trace_facts.get("required_flow_steps") or [])
    calls = list(trace_facts.get("required_calls") or [])
    has_steps = bool(steps)
    # The step checks only run when steps are required.
    steps_named = has_steps and self._mentions_steps(answer, steps)
    edges_named = has_steps and self._mentions_relations(answer, steps)

    if has_steps and not steps_named:
        problems.append("missing_flow_steps")
    if calls and not self._mentions_fact_group(answer, calls):
        problems.append("missing_concrete_calls")
    if has_steps and not edges_named:
        problems.append("missing_sequence_edges")
    for claim in _OPTIMISTIC_TRACE_CLAIMS:
        if claim in answer:
            problems.append("overclaims_trace_completeness")
            break
    if has_steps and not (steps_named and edges_named):
        problems.append("too_vague_for_trace_flow")
    return problems
|
||||
|
||||
def _validate_target_focus(self, answer: str, evidence_pack: EvidenceBundle) -> list[str]:
|
||||
target = str(evidence_pack.resolved_target or "").strip().lower()
|
||||
return [] if not target or target in answer else ["missing_resolved_target"]
|
||||
|
||||
def _mentions_fact_group(self, answer: str, values: list[str]) -> bool:
|
||||
return any(alias in answer for value in values for alias in _aliases(value))
|
||||
|
||||
def _mentions_relations(self, answer: str, relations: list[dict]) -> bool:
|
||||
for relation in relations:
|
||||
source = str(relation.get("source") or "").lower()
|
||||
target = str(relation.get("target") or "").lower()
|
||||
verb = str(relation.get("verb") or "").lower()
|
||||
if source and target and source in answer and target in answer:
|
||||
return True
|
||||
if source and verb and target and source in answer and verb in answer and target in answer:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _mentions_steps(self, answer: str, steps: list[dict]) -> bool:
|
||||
if "сначала" in answer and "затем" in answer:
|
||||
return True
|
||||
numeric_steps = sum(1 for marker in ("1.", "2.", "3.") if marker in answer)
|
||||
if numeric_steps >= 2:
|
||||
return True
|
||||
mentioned = sum(1 for step in steps[:3] if self._mentions_relations(answer, [step]))
|
||||
return mentioned >= min(2, len(steps[:3]))
|
||||
|
||||
def _methods_dominate_components(self, answer: str, components: list[str]) -> bool:
|
||||
method_like = re.findall(r"\b[a-z_]+\(\)", answer)
|
||||
component_hits = sum(1 for component in components if component.lower() in answer)
|
||||
return bool(method_like) and component_hits == 0
|
||||
|
||||
def _semantic_leakage(self, answer: str, facts: dict, *, has_concrete_support: bool) -> bool:
|
||||
if has_concrete_support:
|
||||
return False
|
||||
semantic_roles = [str(item.get("role") or "").strip().lower() for item in facts.get("semantic_hints") or [] if str(item.get("role") or "").strip()]
|
||||
return bool(semantic_roles) and any(role in answer for role in semantic_roles)
|
||||
|
||||
def _vagueness_reasons(self, answer: str, scenario: str) -> list[str]:
    """Return ``["too_vague_for_<scenario>"]`` if any vague phrase occurs."""
    for phrase in _VAGUE_PHRASES:
        if phrase in answer:
            return [f"too_vague_for_{scenario}"]
    return []
|
||||
|
||||
def _tokens(self, value: str) -> set[str]:
    """Split ``value`` into lower-cased tokens with edge slashes removed.

    Tokens that consist only of slashes are dropped.
    """
    tokens: set[str] = set()
    for raw in _TOKEN_RE.findall(value or ""):
        trimmed = raw.strip("/")
        if trimmed:
            tokens.add(trimmed.lower())
    return tokens
|
||||
|
||||
|
||||
def _aliases(value: str) -> list[str]:
|
||||
text = str(value or "").strip().lower()
|
||||
if not text:
|
||||
return []
|
||||
aliases = [text]
|
||||
bare = text.removesuffix("()")
|
||||
if bare != text:
|
||||
aliases.append(bare)
|
||||
tail = bare.rsplit(".", 1)[-1]
|
||||
if tail and tail not in aliases:
|
||||
aliases.append(tail)
|
||||
if tail and f"{tail}()" not in aliases:
|
||||
aliases.append(f"{tail}()")
|
||||
return aliases
|
||||
|
||||
|
||||
def _dedupe(values: list[str]) -> list[str]:
|
||||
seen: set[str] = set()
|
||||
result: list[str] = []
|
||||
for value in values:
|
||||
if value in seen:
|
||||
continue
|
||||
seen.add(value)
|
||||
result.append(value)
|
||||
return result
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
from app.modules.rag.code_qa_pipeline.contracts import AnswerSynthesisInput, EvidenceBundle
|
||||
|
||||
@@ -11,6 +12,7 @@ _LAYER_GUIDE = (
|
||||
"- C3_ENTRYPOINTS: подтвержденные точки входа.\n"
|
||||
"- C4_SEMANTIC_ROLES: вспомогательная роль компонента, использовать осторожно."
|
||||
)
|
||||
_TOKEN_RE = re.compile(r"[a-zA-Zа-яА-Я0-9_/]+")
|
||||
|
||||
|
||||
class CodeQaPromptPayloadBuilder:
|
||||
@@ -30,8 +32,82 @@ class CodeQaPromptPayloadBuilder:
|
||||
"fast_context": synthesis_input.fast_context,
|
||||
"deep_context": synthesis_input.deep_context,
|
||||
"evidence_summary": synthesis_input.evidence_summary,
|
||||
"semantic_hints": synthesis_input.semantic_hints,
|
||||
"diagnostic_hints": synthesis_input.diagnostic_hints,
|
||||
"retrieval_summary": evidence_pack.retrieval_summary,
|
||||
"confirmed_entrypoints": self._entrypoints(user_query, evidence_pack),
|
||||
"required_entrypoints": self._required_entrypoints(user_query, evidence_pack),
|
||||
"layer_guide": _LAYER_GUIDE,
|
||||
}
|
||||
payload.update(self._scenario_payload(synthesis_input))
|
||||
return json.dumps(payload, ensure_ascii=False, indent=2)
|
||||
|
||||
def _entrypoints(self, user_query: str, evidence_pack: EvidenceBundle) -> list[dict]:
    """Build the de-duplicated, ranked list of entrypoints with an HTTP method.

    Entries without ``http_method`` are skipped. Each kept entry is
    flagged with ``query_match`` when its route path shares a token with
    the user query. Ordering: query matches first, then entries that have
    a route path, then shorter route paths, then ``display`` text.
    """
    query_tokens = self._tokens(user_query)
    collected: list[dict] = []
    for raw in list(evidence_pack.entrypoints or []):
        route_path = str(raw.get("route_path") or "").strip()
        http_method = str(raw.get("http_method") or "").strip().upper()
        if not http_method:
            continue
        entry = {
            "http_method": http_method,
            "route_path": route_path,
            "display": f"{http_method} {route_path}".strip(),
            "handler_symbol": str(raw.get("handler_symbol") or "").strip(),
            "path": str(raw.get("path") or "").strip(),
            "start_line": raw.get("start_line"),
            "end_line": raw.get("end_line"),
            "query_match": self._matches_query(route_path, query_tokens),
        }
        if entry in collected:
            continue
        collected.append(entry)

    def rank(entry: dict) -> tuple:
        return (
            -int(bool(entry["query_match"])),
            -int(bool(entry["route_path"])),
            len(entry["route_path"] or "/"),
            entry["display"],
        )

    collected.sort(key=rank)
    return collected
|
||||
|
||||
def _required_entrypoints(self, user_query: str, evidence_pack: EvidenceBundle) -> list[str]:
    """Return display strings of query-matching entrypoints with a route path."""
    required: list[str] = []
    for entry in self._entrypoints(user_query, evidence_pack):
        if entry["query_match"] and entry["route_path"]:
            required.append(entry["display"])
    return required
|
||||
|
||||
def _scenario_payload(self, synthesis_input: AnswerSynthesisInput) -> dict:
|
||||
scenario = (synthesis_input.resolved_scenario or "").upper()
|
||||
curated = dict(synthesis_input.curated_facts or {})
|
||||
if scenario == "EXPLAIN":
|
||||
facts = dict(curated.get("explain") or {})
|
||||
return {
|
||||
"must_mention_methods": facts.get("required_methods", []),
|
||||
"must_mention_fields": facts.get("required_fields", []),
|
||||
"must_mention_calls": facts.get("required_calls", []),
|
||||
"must_mention_dependencies": facts.get("required_dependencies", []),
|
||||
"must_mention_constructor_args": facts.get("required_constructor_args", []),
|
||||
"must_mention_files": facts.get("required_files", []),
|
||||
"must_not_infer_missing_details": True,
|
||||
"fact_gaps": facts.get("fact_gaps", []),
|
||||
}
|
||||
if scenario == "ARCHITECTURE":
|
||||
facts = dict(curated.get("architecture") or {})
|
||||
return {
|
||||
"must_mention_components": facts.get("required_components", []),
|
||||
"must_mention_relations": facts.get("required_relations", []),
|
||||
"must_use_relation_verbs": facts.get("required_relation_verbs", []),
|
||||
"must_avoid_semantic_labels_as_primary_claims": True,
|
||||
"must_not_use_retrieval_labels": facts.get("forbidden_labels", []),
|
||||
"fact_gaps": facts.get("fact_gaps", []),
|
||||
}
|
||||
if scenario == "TRACE_FLOW":
|
||||
facts = dict(curated.get("trace_flow") or {})
|
||||
return {
|
||||
"must_mention_flow_steps": facts.get("required_flow_steps", []),
|
||||
"must_mention_calls": facts.get("required_calls", []),
|
||||
"must_mention_sequence_edges": facts.get("required_sequence_edges", []),
|
||||
"must_avoid_overclaiming_full_flow": True,
|
||||
"fact_gaps": facts.get("fact_gaps", []),
|
||||
}
|
||||
return {}
|
||||
|
||||
def _matches_query(self, route_path: str, query_tokens: set[str]) -> bool:
|
||||
if not route_path:
|
||||
return False
|
||||
path_tokens = self._tokens(route_path)
|
||||
return bool(path_tokens & query_tokens)
|
||||
|
||||
def _tokens(self, value: str) -> set[str]:
    """Tokenize ``value`` into a set of lower-cased tokens without edge slashes.

    Matches that are nothing but slashes are discarded.
    """
    found = (match.strip("/") for match in _TOKEN_RE.findall(value or ""))
    return {token.lower() for token in found if token}
|
||||
|
||||
@@ -17,10 +17,12 @@ class CodeQaAnswerRepairService:
|
||||
validation: CodeQaValidationResult,
|
||||
prompt_payload: str,
|
||||
) -> str:
|
||||
repair_focus = self._repair_focus(validation.reasons)
|
||||
repair_input = json.dumps(
|
||||
{
|
||||
"draft_answer": draft_answer,
|
||||
"validation_reasons": validation.reasons,
|
||||
"repair_focus": repair_focus,
|
||||
"prompt_payload": prompt_payload,
|
||||
},
|
||||
ensure_ascii=False,
|
||||
@@ -31,3 +33,24 @@ class CodeQaAnswerRepairService:
|
||||
repair_input,
|
||||
log_context="graph.project_qa.code_qa.repair",
|
||||
).strip()
|
||||
|
||||
def _repair_focus(self, reasons: list[str]) -> list[str]:
|
||||
mapping = {
|
||||
"missing_concrete_methods": "missing_concrete_methods",
|
||||
"missing_concrete_calls": "missing_concrete_calls",
|
||||
"missing_concrete_dependencies": "missing_concrete_dependencies",
|
||||
"missing_concrete_components": "missing_concrete_components",
|
||||
"missing_concrete_relations": "missing_concrete_relations",
|
||||
"missing_relation_verbs": "missing_relation_verbs",
|
||||
"missing_flow_steps": "missing_flow_steps",
|
||||
"missing_sequence_edges": "missing_sequence_edges",
|
||||
"too_vague_for_explain": "too_vague_for_explain",
|
||||
"too_vague_for_architecture": "too_vague_for_architecture",
|
||||
"too_vague_for_trace_flow": "too_vague_for_trace_flow",
|
||||
"semantic_labels_without_code_edges": "semantic_labels_without_code_edges",
|
||||
"contains_retrieval_artifacts": "contains_retrieval_artifacts",
|
||||
"methods_as_primary_components": "methods_as_primary_components",
|
||||
"overclaims_trace_completeness": "overclaims_trace_completeness",
|
||||
}
|
||||
result = [mapping[reason] for reason in reasons if reason in mapping]
|
||||
return result or ["tighten_to_evidence"]
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class CodeQaShortAnswerFormatter:
    """Formats the short fixed-template answers used instead of an LLM reply."""

    def open_file_not_found(self, target: str) -> str:
        """Message for a requested file that was not found."""
        return f"Файл {target} не найден."

    def entity_not_found(self, target: str, alternatives: list[str]) -> str:
        """Message for an entity that was not found, with optional close variants."""
        return self._with_alternatives(f"Сущность {target} не найдена в доступном коде.", alternatives)

    def entity_ambiguous(self, target: str, alternatives: list[str]) -> str:
        """Message for an entity that resolves to several candidates."""
        return self._with_alternatives(f"Сущность {target} не удалось однозначно разрешить.", alternatives)

    def insufficient(self, message: str) -> str:
        """Return the stripped ``message``, or a default when it is empty."""
        normalized = (message or "").strip()
        return normalized or "Недостаточно подтверждённых данных для уверенного ответа."

    def related_only(self, target: str, alternatives: list[str]) -> str:
        """Message for the case where no direct evidence for the entity exists."""
        return self._with_alternatives(f"Прямых подтверждений для сущности {target} не найдено.", alternatives)

    def _with_alternatives(self, base: str, alternatives: list[str]) -> str:
        """Append the alternatives sentence to ``base`` and trim outer whitespace."""
        suffix = self._alternatives(alternatives)
        return f"{base} {suffix}".strip()

    def _alternatives(self, alternatives: list[str]) -> str:
        """Render up to three non-blank alternatives, or "" when there are none."""
        cleaned: list[str] = []
        for item in alternatives:
            if item and item.strip():
                cleaned.append(item.strip())
        if cleaned:
            return f"Близкие варианты: {', '.join(cleaned[:3])}."
        return ""
|
||||
@@ -17,9 +17,15 @@
|
||||
Не выводи пустые разделы, пустые списки и формулировки вида "кандидатов нет", если это не помогает ответу.
|
||||
|
||||
Дай архитектурное объяснение без лишней теории.
|
||||
Назови подтверждённые компоненты и конкретные связи между ними: создаёт, вызывает, регистрирует, читает, пишет, передаёт, оборачивает.
|
||||
Строй ответ вокруг concrete facts из payload: `must_mention_components`, `must_mention_relations`, `must_use_relation_verbs`.
|
||||
Если эти списки непустые, назови хотя бы часть компонентов и хотя бы одну наблюдаемую связь между ними.
|
||||
Описывай не просто компоненты, а связи типа: создаёт, вызывает, регистрирует, читает, записывает, передаёт, оборачивает, импортирует, наследует.
|
||||
Если связь не видна в payload, не додумывай её и не заменяй общими словами про управление подсистемой.
|
||||
Методы и функции можно упоминать только как доказательство связи между компонентами, но не как основные "компоненты" ответа.
|
||||
Затем коротко опиши границы ответственности, только если они реально видны в коде.
|
||||
Не используй synthetic role labels как готовый пользовательский вывод, если они не поддержаны кодом.
|
||||
Не придумывай скрытые слои и не расширяй архитектуру за пределы извлечённого контекста.
|
||||
Не используй обязательные markdown-секции.
|
||||
Не используй абстрактные формулы вроде "главный компонент", "центральный управляющий компонент", "управляет потоками данных и состоянием системы", если конкретная связь не раскрыта через наблюдаемые методы, поля или вызовы.
|
||||
Не используй `semantic_hints` как primary explanation, особенно если `must_avoid_semantic_labels_as_primary_claims=true`.
|
||||
Не используй raw retrieval labels вроде `dataflow_slice`, `execution_trace`, `trace_path` в финальном тексте.
|
||||
Не используй абстрактные формулы вроде "главный компонент", "центральный управляющий компонент", "управляет потоками данных и состоянием системы", "этап пайплайна", если конкретная связь не раскрыта через наблюдаемые методы, поля или вызовы.
|
||||
|
||||
@@ -18,9 +18,15 @@
|
||||
|
||||
Объясни, как работает сущность из вопроса пользователя, обычным инженерным текстом.
|
||||
Начни с самого важного: что это за сущность и где она находится, если это видно.
|
||||
Затем кратко опиши подтверждённые зависимости, вызовы, аргументы, поля или шаги работы, только если они реально видны.
|
||||
Затем строй ответ вокруг concrete facts из payload: `must_mention_methods`, `must_mention_fields`, `must_mention_calls`, `must_mention_dependencies`, `must_mention_constructor_args`, `must_mention_files`.
|
||||
Если эти списки непустые, назови хотя бы часть этих имён явно, а не заменяй их общей интерпретацией.
|
||||
Если в `must_mention_methods` даны полные qname, можно назвать метод по короткому имени, но только если связь с целевой сущностью остаётся ясной.
|
||||
Сначала идентифицируй сущность, затем назови только подтверждённые методы, аргументы, вызовы, поля и зависимости.
|
||||
Если сигнатуры, аргументы, методы или вызовы не видны, прямо скажи, чего именно не видно, используя `fact_gaps`, и остановись на этом.
|
||||
Не используй общие формулы без конкретных имён.
|
||||
Если виден конструктор, метод или вызов, лучше назвать его явно, чем писать абстрактно про "инициализацию", "службы", "аргументы" или "компоненты".
|
||||
Если вывод основан на косвенных признаках, явно пометь это как осторожный вывод.
|
||||
Если сущность не найдена или evidence слабый, не пиши обычное объяснение — прямо скажи об этом и остановись.
|
||||
Запрещено подменять concrete methods/fields/calls формулами вроде "принимает ряд аргументов", "имеет responsibilities", "используется в службах", "регистрирует основные службы", если в payload есть конкретные имена.
|
||||
Не используй `semantic_hints` как основной каркас ответа. Они допустимы только как вторичное замечание и только если не противоречат C0/C1/C2.
|
||||
Не используй обязательные секции и подзаголовки.
|
||||
|
||||
@@ -19,6 +19,8 @@
|
||||
Найди точки входа, обработчики запуска или важные entrypoints.
|
||||
Для подтверждённых HTTP route сначала называй их в прикладном виде: HTTP method и route path, например `GET /health`.
|
||||
Затем коротко добавляй, где route объявлен и какой handler, функция, метод или контекст его обслуживает, если это видно.
|
||||
Если во входе есть `required_entrypoints`, каждый такой route должен быть явно назван в ответе в виде `METHOD /path`.
|
||||
Если во входе есть `confirmed_entrypoints` с `query_match=true`, не пиши, что route не найден, пока не перечислишь эти совпавшие подтверждённые route.
|
||||
Подтверждённые entrypoints перечисляй первыми.
|
||||
Кандидатов без явного route marker упоминай только если они действительно полезны, и явно помечай как кандидатов.
|
||||
Не своди ответ к обсуждению декораторов вроде `@app.get`; пользователю важнее method, path и контекст.
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
Ты исправляешь черновой ответ по коду после проверки groundedness.
|
||||
Сделай ответ короче, точнее и строже по evidence payload.
|
||||
Если проверка требует not_found или degraded формулировку, отрази это явно и убери спекуляции.
|
||||
Если в `repair_focus` есть причины для `EXPLAIN`, перепиши ответ так, чтобы он назвал concrete methods, calls, fields, constructor args или dependencies из payload, а не общие responsibilities.
|
||||
Если в `repair_focus` есть причины для `ARCHITECTURE`, перепиши ответ так, чтобы он назвал concrete components и связи с relation verbs из payload: создает, вызывает, читает, записывает, импортирует, наследует.
|
||||
Если в `repair_focus` есть причины для `TRACE_FLOW`, перепиши ответ как последовательность concrete steps с явными methods/calls/edges из payload. Если виден только partial flow, так и скажи.
|
||||
Если в `repair_focus` есть `semantic_labels_without_code_edges`, убери semantic role labels из основной формулировки, если они не подкреплены concrete code edges.
|
||||
Если в `repair_focus` есть `contains_retrieval_artifacts` или `methods_as_primary_components`, убери raw retrieval labels и не выдавай методы за компоненты.
|
||||
Если в `repair_focus` есть `overclaims_trace_completeness`, убери фразы про полный/полностью восстановленный flow, если payload не подтверждает это явно.
|
||||
|
||||
@@ -17,7 +17,9 @@
|
||||
Не выводи пустые разделы, пустые списки и формулировки вида "кандидатов нет", если это не помогает ответу.
|
||||
|
||||
Проследи поток выполнения или поток данных по найденным артефактам.
|
||||
Старайся описывать шаги последовательно и коротко, без лишних подзаголовков.
|
||||
Строй ответ вокруг `must_mention_flow_steps`, `must_mention_calls` и `must_mention_sequence_edges` из payload.
|
||||
Старайся описывать шаги последовательно и коротко, без лишних подзаголовков: сначала, затем, после этого, в конце.
|
||||
Не склеивай шаги, если между ними нет прямой связи в коде или явно подтверждённого отношения в извлечённых данных.
|
||||
Если поток восстанавливается только частично, так и скажи.
|
||||
Если поток восстанавливается только частично, так и скажи, опираясь на `fact_gaps`, и не заявляй, что flow восстановлен полностью.
|
||||
Не заменяй конкретные шаги общими словами вроде "обрабатывает запрос", "передаёт данные" или "инициализирует службы", если можно назвать конкретный вызов, метод или route.
|
||||
Не используй сильные формулировки вроде "полностью восстанавливается", "полный поток виден", если payload показывает только часть цепочки.
|
||||
|
||||
@@ -0,0 +1,300 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from app.modules.rag.code_qa_pipeline.contracts import CodeChunkItem, EvidenceBundle
|
||||
|
||||
_CALL_RE = re.compile(r"([A-Za-z_][\w\.]*)\s*\(")
|
||||
_FIELD_RE = re.compile(r"self\.(\w+)")
|
||||
_SIGNATURE_RE = re.compile(r"(?P<name>[A-Za-z_][\w\.]*)\((?P<args>[^)]*)\)")
|
||||
_RETRIEVAL_LABELS = ("dataflow_slice", "execution_trace", "trace_path")
|
||||
_RELATION_VERBS = {
|
||||
"calls": "вызывает",
|
||||
"instantiates": "создает",
|
||||
"inherits": "наследует",
|
||||
"imports": "импортирует",
|
||||
"reads_attr": "читает",
|
||||
"writes_attr": "записывает",
|
||||
}
|
||||
|
||||
|
||||
def build_curated_answer_facts(bundle: EvidenceBundle) -> dict[str, Any]:
    """Derive the curated fact sets for every answer scenario from a bundle.

    Chunks on the ``C4_SEMANTIC_ROLES`` layer are reported only as semantic
    hints; all other chunks count as primary evidence. Relations are
    normalized, and only those tied to the resolved target feed the
    per-scenario fact builders.
    """
    target = str(bundle.resolved_target or "").strip()
    hints = _semantic_hints(bundle.code_chunks)
    primary = [chunk for chunk in bundle.code_chunks if chunk.layer != "C4_SEMANTIC_ROLES"]
    all_relations = _normalized_relations(bundle.relations)
    scoped = [relation for relation in all_relations if _is_target_relation(relation, target)]

    facts: dict[str, Any] = {
        "scenario": (bundle.resolved_sub_intent or "EXPLAIN").upper(),
        "semantic_hints": hints,
        "primary_chunk_count": len(primary),
        "relation_count": len(all_relations),
    }
    facts["explain"] = _explain_facts(bundle, primary, scoped)
    facts["architecture"] = _architecture_facts(bundle, primary, scoped)
    facts["trace_flow"] = _trace_flow_facts(bundle, primary, scoped)
    return facts
|
||||
|
||||
|
||||
def _explain_facts(bundle: EvidenceBundle, chunks: list[CodeChunkItem], relations: list[dict[str, Any]]) -> dict[str, Any]:
    """Collect the concrete names an EXPLAIN answer is expected to mention.

    Methods and constructor arguments come from ``C1_SYMBOL_CATALOG``
    signatures belonging to the target; calls, fields and dependencies
    come from relations, with calls and fields falling back to regex
    extraction over the target's chunks when no relation supplies them.
    Every list is capped, and ``fact_gaps`` records what is unconfirmed.
    """
    target = str(bundle.resolved_target or "").strip()

    target_signatures = []
    for chunk in chunks:
        if chunk.layer != "C1_SYMBOL_CATALOG":
            continue
        payload = _signature_payload(chunk)
        if _is_target_symbol(payload["name"], target):
            target_signatures.append(payload)

    methods = _unique(sig["name"] for sig in target_signatures if sig["kind"] == "method")

    raw_ctor_args = []
    for sig in target_signatures:
        if sig["name"].endswith(".__init__"):
            raw_ctor_args.extend(arg for arg in sig["args"] if arg not in {"self", "cls"})
    constructor_args = _unique(raw_ctor_args)

    calls = _unique(
        _display_call_target(edge["target"])
        for edge in relations
        if edge["edge_type"] in {"calls", "instantiates"}
    ) or _fallback_calls(chunks, target)

    fields = _unique(
        edge["target"].split(".", 1)[-1]
        for edge in relations
        if edge["edge_type"] in {"reads_attr", "writes_attr"}
    )
    if not fields:
        regex_fields = []
        for chunk in chunks:
            if _chunk_matches_target(chunk, target):
                regex_fields.extend(_FIELD_RE.findall(chunk.content or ""))
        fields = _unique(regex_fields)

    dependencies = _unique(
        _display_dependency_target(edge["target"])
        for edge in relations
        if edge["edge_type"] in {"imports", "instantiates"}
    )
    required_files = _unique(
        chunk.path for chunk in chunks if _chunk_matches_target(chunk, target) and chunk.path
    )

    required_symbols = _unique(sig["name"] for sig in target_signatures if sig["name"])
    if not required_symbols:
        required_symbols = _unique([target] if target else bundle.target_symbol_candidates)

    fact_gaps: list[str] = []
    for confirmed, gap in (
        (methods, "Методы целевой сущности не подтверждены в извлеченных C0/C1 данных."),
        (calls, "Конкретные вызовы целевой сущности не подтверждены в C2/C0."),
        (dependencies, "Явные зависимости целевой сущности не подтверждены."),
    ):
        if not confirmed:
            fact_gaps.append(gap)

    return {
        "required_symbols": required_symbols[:8],
        "required_methods": methods[:6],
        "required_calls": calls[:6],
        "required_fields": fields[:6],
        "required_constructor_args": constructor_args[:6],
        "required_dependencies": dependencies[:6],
        "required_files": required_files[:4],
        "fact_gaps": fact_gaps,
    }
|
||||
|
||||
|
||||
def _architecture_facts(bundle: EvidenceBundle, chunks: list[CodeChunkItem], relations: list[dict[str, Any]]) -> dict[str, Any]:
    """Collect components and component-level edges for an ARCHITECTURE answer.

    Relation endpoints are reduced to component names. Edges with an empty
    endpoint, or whose endpoints collapse to the same component, are
    dropped. The result also groups edges by edge type and lists the
    retrieval labels that must not appear in the answer.
    """
    target = str(bundle.resolved_target or "").strip()

    # Resolve both endpoints once per relation and reuse them below.
    resolved = [
        (_component_name(edge["source"]), _component_name(edge["target"]), edge)
        for edge in relations
    ]

    names = [target] if target else []
    names.extend(source for source, _, _ in resolved)
    names.extend(destination for _, destination, _ in resolved)
    names.extend(_component_name(chunk.title) for chunk in chunks if _chunk_matches_target(chunk, target))
    components = _unique(names)

    relation_rows = []
    for source, destination, edge in resolved:
        if not source or not destination or source == destination:
            continue
        relation_rows.append(
            {
                "source": source,
                "verb": edge["verb"],
                "target": destination,
                "source_method": edge["source"],
                "target_method": edge["target"],
                "path": edge["path"],
                "line_span": edge["line_span"],
                "edge_type": edge["edge_type"],
            }
        )

    relation_verbs = _unique(row["verb"] for row in relation_rows if row["verb"])

    fact_gaps: list[str] = []
    if not relation_rows:
        fact_gaps.append("Concrete code edges между компонентами не подтверждены.")

    def rows_of(*edge_types: str) -> list[dict[str, Any]]:
        return [row for row in relation_rows if row["edge_type"] in edge_types][:4]

    return {
        "required_components": components[:8],
        "required_relations": relation_rows[:8],
        "required_relation_verbs": relation_verbs[:6],
        "required_creation_edges": rows_of("instantiates"),
        "required_call_edges": rows_of("calls"),
        "required_registration_edges": rows_of("imports"),
        "required_data_edges": rows_of("reads_attr", "writes_attr"),
        "fact_gaps": fact_gaps,
        "forbidden_labels": list(_RETRIEVAL_LABELS),
    }
|
||||
|
||||
|
||||
def _trace_flow_facts(bundle: EvidenceBundle, chunks: list[CodeChunkItem], relations: list[dict[str, Any]]) -> dict[str, Any]:
    """Turn relations into numbered flow steps for a TRACE_FLOW answer.

    Relations are ordered by path, start line, source and target and
    numbered from 1. Steps and sequence edges are capped at eight entries;
    ``required_calls`` is derived from all steps. ``fact_gaps`` notes when
    fewer than two steps, or none at all, are available.
    """
    target = str(bundle.resolved_target or "").strip()

    def position(edge: dict[str, Any]) -> tuple:
        return (edge["path"], edge["sort_line"], edge["source"], edge["target"])

    flow_steps = []
    for number, edge in enumerate(sorted(relations, key=position), start=1):
        flow_steps.append(
            {
                "step": number,
                "source": edge["source"],
                "verb": edge["verb"],
                "target": edge["target"],
                "path": edge["path"],
                "line_span": edge["line_span"],
            }
        )

    fact_gaps: list[str] = []
    if len(flow_steps) < 2:
        fact_gaps.append("Полная последовательность шагов не подтверждена; виден только частичный flow.")
    if not flow_steps:
        fact_gaps.append("Конкретные sequence edges для flow не подтверждены.")

    target_files = _unique(
        chunk.path for chunk in chunks if _chunk_matches_target(chunk, target) and chunk.path
    )
    return {
        "required_flow_steps": flow_steps[:8],
        "required_calls": _unique(_display_call_target(step["target"]) for step in flow_steps),
        "required_sequence_edges": flow_steps[:8],
        "required_files": target_files[:4],
        "fact_gaps": fact_gaps,
    }
|
||||
|
||||
|
||||
def _semantic_hints(chunks: list[CodeChunkItem]) -> list[dict[str, Any]]:
|
||||
hints: list[dict[str, Any]] = []
|
||||
for chunk in chunks:
|
||||
if chunk.layer != "C4_SEMANTIC_ROLES":
|
||||
continue
|
||||
meta = dict(chunk.metadata or {})
|
||||
hints.append(
|
||||
{
|
||||
"symbol": meta.get("symbol_name") or chunk.title,
|
||||
"role": meta.get("role"),
|
||||
"path": chunk.path,
|
||||
"confidence": meta.get("confidence"),
|
||||
}
|
||||
)
|
||||
return hints[:8]
|
||||
|
||||
|
||||
def _normalized_relations(relations: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Convert raw relation dicts into a uniform edge record.

    ``metadata`` values (``edge_type``, ``src_qname``, ``dst_ref``) take
    precedence over the top-level ``edge_type`` / ``source`` / ``target``.
    Relations that lack an edge type or either endpoint after cleaning
    are skipped.
    """
    rows: list[dict[str, Any]] = []
    for raw in relations:
        meta = dict(raw.get("metadata") or {})
        edge_type = str(meta.get("edge_type") or raw.get("edge_type") or "").strip()
        source = _clean_endpoint(str(meta.get("src_qname") or raw.get("source") or "").strip())
        target = _clean_endpoint(str(meta.get("dst_ref") or raw.get("target") or "").strip())
        if not (edge_type and source and target):
            continue
        row = {
            "edge_type": edge_type,
            "source": source,
            "target": target,
            # Unknown edge types keep their raw name as the verb.
            "verb": _RELATION_VERBS.get(edge_type, edge_type),
            "path": str(raw.get("path") or ""),
            "line_span": _line_span(raw),
            "sort_line": int(raw.get("start_line") or 0),
        }
        rows.append(row)
    return rows
|
||||
|
||||
|
||||
def _signature_payload(chunk: CodeChunkItem) -> dict[str, Any]:
    """Extract name, kind and argument list from a symbol chunk.

    The signature text is the metadata ``signature`` or, failing that, the
    chunk content. Arguments are the comma-separated, stripped, non-empty
    parts inside the first ``name(...)`` match; without a match the list
    is empty.
    """
    meta = dict(chunk.metadata or {})
    signature = str(meta.get("signature") or chunk.content or "").strip()

    args: list[str] = []
    found = _SIGNATURE_RE.search(signature)
    if found:
        for piece in found.group("args").split(","):
            piece = piece.strip()
            if piece:
                args.append(piece)

    name = _clean_endpoint(str(meta.get("qname") or chunk.title or "").strip())
    kind = str(meta.get("kind") or "").strip()
    return {"name": name, "kind": kind, "args": args}
|
||||
|
||||
|
||||
def _fallback_calls(chunks: list[CodeChunkItem], target: str) -> list[str]:
    """Regex-extract unique call names from the chunks that match the target."""
    found: list[str] = []
    for chunk in chunks:
        if _chunk_matches_target(chunk, target):
            for callee in _CALL_RE.findall(chunk.content or ""):
                found.append(_display_call_target(callee))
    return _unique(found)
|
||||
|
||||
|
||||
def _chunk_matches_target(chunk: CodeChunkItem, target: str) -> bool:
    """Decide whether a chunk belongs to the target symbol.

    An empty target matches every chunk. Otherwise the chunk matches when
    its title or metadata ``qname`` is the target or one of its members,
    or when the target occurs case-insensitively in the chunk content.
    """
    if not target:
        return True
    title = _clean_endpoint(chunk.title)
    symbol = _clean_endpoint(str(dict(chunk.metadata or {}).get("qname") or ""))
    if _is_target_symbol(title, target):
        return True
    if _is_target_symbol(symbol, target):
        return True
    return target.lower() in (chunk.content or "").lower()
|
||||
|
||||
|
||||
def _is_target_relation(relation: dict[str, Any], target: str) -> bool:
    """Decide whether a normalized relation involves the target.

    An empty target accepts every relation. Otherwise the source must be
    the target or one of its members, or the component name of either
    endpoint must equal the target.
    """
    if not target:
        return True
    if _is_target_symbol(relation["source"], target):
        return True
    if _component_name(relation["source"]) == target:
        return True
    return _component_name(relation["target"]) == target
|
||||
|
||||
|
||||
def _is_target_symbol(name: str, target: str) -> bool:
    """Return True when ``name`` is the target or a dotted member of it.

    Both values are cleaned first; an empty target matches any name.
    """
    clean_name = _clean_endpoint(name)
    clean_target = _clean_endpoint(target)
    if not clean_target:
        return True
    if clean_name == clean_target:
        return True
    return clean_name.startswith(f"{clean_target}.")
|
||||
|
||||
|
||||
def _component_name(value: str) -> str:
    """Reduce a dotted endpoint name to the component that owns it.

    * no dot: the cleaned name itself;
    * two segments: the full name when the second segment starts with an
      uppercase letter, otherwise the first segment;
    * three or more segments: everything except the last segment.
    """
    clean = _clean_endpoint(value)
    if not clean:
        return ""
    parts = clean.split(".")
    if len(parts) >= 3:
        return ".".join(parts[:-1])
    if len(parts) == 2:
        if parts[1][:1].isupper():
            return clean
        return parts[0]
    return clean
|
||||
|
||||
|
||||
def _display_call_target(value: str) -> str:
    """Render a call target as ``tail()`` for dotted names.

    Names without a dot are returned cleaned and without parentheses; an
    empty value gives an empty string.
    """
    clean = _clean_endpoint(value)
    if not clean:
        return ""
    tail = clean.rsplit(".", 1)[-1]
    if not tail or tail == clean:
        return clean
    return f"{tail}()"
|
||||
|
||||
|
||||
def _display_dependency_target(value: str) -> str:
    """Return the component name of a dependency, or the cleaned value itself."""
    clean = _clean_endpoint(value)
    if not clean:
        return ""
    component = _component_name(clean)
    return component if component else clean
|
||||
|
||||
|
||||
def _clean_endpoint(value: str) -> str:
    """Strip retrieval labels and edge punctuation from an endpoint name.

    Every occurrence of a retrieval label (with or without a leading dot)
    is removed, then leading and trailing ".", ":" and spaces are trimmed.
    """
    clean = str(value or "").strip()
    if not clean:
        return ""
    for label in _RETRIEVAL_LABELS:
        # Remove the dotted form first so no stray dot is left behind.
        clean = clean.replace(f".{label}", "").replace(label, "")
    return clean.strip(".: ")
|
||||
|
||||
|
||||
def _line_span(item: dict[str, Any]) -> str:
|
||||
start = item.get("start_line")
|
||||
end = item.get("end_line")
|
||||
if start is None and end is None:
|
||||
return "?:?"
|
||||
return f"{start or '?'}-{end or '?'}"
|
||||
|
||||
|
||||
def _unique(values: Any) -> list[str]:
|
||||
seen: set[str] = set()
|
||||
result: list[str] = []
|
||||
for value in values:
|
||||
text = str(value or "").strip()
|
||||
if not text or text in seen:
|
||||
continue
|
||||
seen.add(text)
|
||||
result.append(text)
|
||||
return result
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.code_qa_pipeline.answer_fact_curator import build_curated_answer_facts
|
||||
from app.modules.rag.code_qa_pipeline.contracts import AnswerSynthesisInput, EvidenceBundle
|
||||
|
||||
|
||||
@@ -13,11 +14,16 @@ def build_answer_synthesis_input(
|
||||
scenario = bundle.resolved_sub_intent or "EXPLAIN"
|
||||
target = bundle.resolved_target
|
||||
sufficient = bundle.sufficient
|
||||
curated = build_curated_answer_facts(bundle)
|
||||
semantic_hints = list(curated.get("semantic_hints") or [])
|
||||
|
||||
fast_lines = [
|
||||
f"Scenario: {scenario}",
|
||||
f"Target: {target or 'none'}",
|
||||
f"Evidence chunks: {bundle.evidence_count}",
|
||||
f"Primary chunks: {curated.get('primary_chunk_count', 0)}",
|
||||
f"Semantic hints: {len(semantic_hints)}",
|
||||
f"Relations: {curated.get('relation_count', 0)}",
|
||||
f"Sufficient: {sufficient}",
|
||||
]
|
||||
if bundle.failure_reasons:
|
||||
@@ -25,18 +31,21 @@ def build_answer_synthesis_input(
|
||||
fast_context = "\n".join(fast_lines)
|
||||
|
||||
deep_parts = []
|
||||
for i, c in enumerate(bundle.code_chunks[:30], 1):
|
||||
primary_chunks = [chunk for chunk in bundle.code_chunks if scenario.upper() not in {"EXPLAIN", "ARCHITECTURE"} or chunk.layer != "C4_SEMANTIC_ROLES"]
|
||||
for i, c in enumerate(primary_chunks[:30], 1):
|
||||
deep_parts.append(f"[{i}] {c.path}:{c.start_line or '?'}-{c.end_line or '?'}\n{c.content[:800]}")
|
||||
deep_context = "\n\n---\n\n".join(deep_parts) if deep_parts else "(no code chunks)"
|
||||
|
||||
evidence_summary = []
|
||||
for c in bundle.code_chunks[:20]:
|
||||
for c in primary_chunks[:20]:
|
||||
evidence_summary.append({
|
||||
"layer": c.layer,
|
||||
"path": c.path,
|
||||
"title": c.title,
|
||||
"lines": f"{c.start_line or '?'}-{c.end_line or '?'}",
|
||||
})
|
||||
if semantic_hints:
|
||||
evidence_summary.append({"kind": "semantic_hints", "count": len(semantic_hints)})
|
||||
if bundle.entrypoints:
|
||||
evidence_summary.append({"kind": "entrypoints", "count": len(bundle.entrypoints)})
|
||||
if bundle.test_evidence:
|
||||
@@ -45,6 +54,9 @@ def build_answer_synthesis_input(
|
||||
hints = list(bundle.failure_reasons) if bundle.failure_reasons else []
|
||||
if not sufficient and bundle.retrieval_summary.get("missing_layers"):
|
||||
hints.append(f"Missing layers: {', '.join(bundle.retrieval_summary['missing_layers'])}")
|
||||
scenario_key = scenario.lower()
|
||||
scenario_facts = dict(curated.get(scenario_key) or {})
|
||||
hints.extend(list(scenario_facts.get("fact_gaps") or []))
|
||||
|
||||
return AnswerSynthesisInput(
|
||||
user_question=user_question,
|
||||
@@ -53,6 +65,8 @@ def build_answer_synthesis_input(
|
||||
fast_context=fast_context,
|
||||
deep_context=deep_context,
|
||||
evidence_summary=evidence_summary,
|
||||
semantic_hints=semantic_hints,
|
||||
curated_facts=curated,
|
||||
evidence_sufficient=sufficient,
|
||||
diagnostic_hints=hints,
|
||||
)
|
||||
|
||||
@@ -98,6 +98,7 @@ class RetrievalResult(BaseModel):
|
||||
file_candidates: list[str] = Field(default_factory=list)
|
||||
code_chunks: list[CodeChunkItem] = Field(default_factory=list)
|
||||
relations: list[dict[str, Any]] = Field(default_factory=list)
|
||||
semantic_hints: list[dict[str, Any]] = Field(default_factory=list)
|
||||
entrypoints: list[dict[str, Any]] = Field(default_factory=list)
|
||||
test_candidates: list[dict[str, Any]] = Field(default_factory=list)
|
||||
layer_outcomes: list[LayerOutcome] = Field(default_factory=list)
|
||||
@@ -138,6 +139,8 @@ class AnswerSynthesisInput(BaseModel):
|
||||
fast_context: str = ""
|
||||
deep_context: str = ""
|
||||
evidence_summary: list[dict[str, Any]] = Field(default_factory=list)
|
||||
semantic_hints: list[dict[str, Any]] = Field(default_factory=list)
|
||||
curated_facts: dict[str, Any] = Field(default_factory=dict)
|
||||
evidence_sufficient: bool = False
|
||||
diagnostic_hints: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
@@ -28,6 +28,7 @@ def build_evidence_bundle(
|
||||
summary = {
|
||||
"chunk_count": len(retrieval_result.code_chunks),
|
||||
"relation_count": len(retrieval_result.relations),
|
||||
"semantic_hint_count": len(retrieval_result.semantic_hints),
|
||||
"entrypoint_count": len(retrieval_result.entrypoints),
|
||||
"test_candidate_count": len(retrieval_result.test_candidates),
|
||||
"layers_with_hits": [
|
||||
|
||||
@@ -2,13 +2,14 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.modules.rag.code_qa_pipeline.contracts import (
|
||||
CodeChunkItem,
|
||||
LayerOutcome,
|
||||
RetrievalResult,
|
||||
)
|
||||
import re
|
||||
|
||||
from app.modules.rag.code_qa_pipeline.contracts import CodeChunkItem, LayerOutcome, RetrievalResult
|
||||
from app.modules.rag.retrieval.test_filter import is_test_path
|
||||
|
||||
# Matches a decorator call of the form ``@<dotted.name>.<verb>("<route>")``.
# Group 1 is the HTTP verb (get/post/put/delete/patch/options/head),
# group 2 is the quoted route string passed as the first argument.
_ROUTE_RE = re.compile(r'@[\w\.]+\.(get|post|put|delete|patch|options|head)\(\s*["\']([^"\']+)["\']')
|
||||
# Matches a function definition header: group 1 holds the name for
# ``async def``, group 2 holds the name for a plain ``def``.
_DEF_RE = re.compile(r"async\s+def\s+(\w+)|def\s+(\w+)")
|
||||
|
||||
|
||||
def build_retrieval_result(
|
||||
raw_rows: list[dict],
|
||||
@@ -21,52 +22,210 @@ def build_retrieval_result(
|
||||
layers_seen: set[str] = set()
|
||||
code_chunks: list[CodeChunkItem] = []
|
||||
relations: list[dict] = []
|
||||
entrypoints: list[dict] = []
|
||||
semantic_hints: list[dict] = []
|
||||
entrypoint_rows: list[CodeChunkItem] = []
|
||||
test_candidates: list[dict] = []
|
||||
file_candidates: list[str] = []
|
||||
target_symbols: list[str] = []
|
||||
|
||||
for row in raw_rows:
|
||||
layer = str(row.get("layer") or "")
|
||||
path = str(row.get("path") or "")
|
||||
chunk = _to_chunk(row)
|
||||
layer = chunk.layer
|
||||
path = chunk.path
|
||||
if layer:
|
||||
layers_seen.add(layer)
|
||||
if path and path not in file_candidates:
|
||||
file_candidates.append(path)
|
||||
|
||||
chunk = CodeChunkItem(
|
||||
layer=layer,
|
||||
path=path,
|
||||
title=str(row.get("title") or ""),
|
||||
content=str(row.get("content") or ""),
|
||||
start_line=row.get("span_start"),
|
||||
end_line=row.get("span_end"),
|
||||
metadata=dict(row.get("metadata") or {}),
|
||||
)
|
||||
code_chunks.append(chunk)
|
||||
|
||||
if layer == "C1_SYMBOL_CATALOG":
|
||||
title = str(row.get("title") or "").strip()
|
||||
title = chunk.title.strip()
|
||||
if title and title not in target_symbols:
|
||||
target_symbols.append(title)
|
||||
elif layer == "C2_DEPENDENCY_GRAPH":
|
||||
relations.append({"path": path, "metadata": chunk.metadata, "content": chunk.content[:500]})
|
||||
elif layer == "C3_ENTRYPOINTS":
|
||||
entrypoints.append({"path": path, "title": chunk.title, "metadata": chunk.metadata})
|
||||
elif is_test_path(path):
|
||||
continue
|
||||
if layer == "C2_DEPENDENCY_GRAPH":
|
||||
relations.append(_relation_payload(chunk))
|
||||
continue
|
||||
if layer == "C4_SEMANTIC_ROLES":
|
||||
semantic_hints.append(_semantic_hint_payload(chunk))
|
||||
continue
|
||||
if layer == "C3_ENTRYPOINTS":
|
||||
entrypoint_rows.append(chunk)
|
||||
continue
|
||||
if is_test_path(path):
|
||||
test_candidates.append({"path": path, "title": chunk.title, "content": chunk.content[:300]})
|
||||
|
||||
entrypoints = _normalize_entrypoints(entrypoint_rows, code_chunks)
|
||||
executed = list(report.get("executed_layers") or [])
|
||||
missing_layers = [lid for lid in executed if lid not in layers_seen]
|
||||
layer_outcomes = _layer_outcomes(raw_rows, executed, layers_seen, report)
|
||||
|
||||
resolved = str(sym.get("resolved_symbol") or "").strip() or None
|
||||
status = str(sym.get("status") or "not_requested")
|
||||
|
||||
return RetrievalResult(
|
||||
target_symbol_candidates=target_symbols,
|
||||
resolved_symbol=resolved,
|
||||
symbol_resolution_status=status,
|
||||
file_candidates=file_candidates,
|
||||
code_chunks=code_chunks,
|
||||
relations=relations,
|
||||
semantic_hints=semantic_hints,
|
||||
entrypoints=entrypoints,
|
||||
test_candidates=test_candidates,
|
||||
layer_outcomes=layer_outcomes,
|
||||
missing_layers=missing_layers,
|
||||
raw_rows=raw_rows,
|
||||
retrieval_report=report,
|
||||
)
|
||||
|
||||
|
||||
def _to_chunk(row: dict) -> CodeChunkItem:
    """Convert one raw retrieval row into a ``CodeChunkItem``.

    Text fields (``layer``, ``path``, ``title``, ``content``) that are missing
    or falsy become empty strings. ``span_start`` / ``span_end`` are passed
    through unchanged as the chunk's start and end lines, and the row's
    metadata is shallow-copied (an empty dict when absent).
    """

    def _text(key: str) -> str:
        # Falsy values (None, "", 0) collapse to the empty string.
        return str(row.get(key) or "")

    fields = {
        "layer": _text("layer"),
        "path": _text("path"),
        "title": _text("title"),
        "content": _text("content"),
        "start_line": row.get("span_start"),
        "end_line": row.get("span_end"),
        "metadata": dict(row.get("metadata") or {}),
    }
    return CodeChunkItem(**fields)
|
||||
|
||||
|
||||
def _normalize_entrypoints(entrypoint_rows: list[CodeChunkItem], code_chunks: list[CodeChunkItem]) -> list[dict]:
    """Build de-duplicated, sorted entrypoint payloads.

    Each entrypoint row is expanded with ``_entrypoint_payload``, which is
    given every chunk from ``code_chunks`` that shares the row's path.
    Payloads repeating an already seen combination of HTTP method, route
    path, file path, start line and end line are dropped (first one wins).
    The surviving payloads are ordered by ``_entrypoint_sort_key``.
    """
    # Group all chunks by file path so each entrypoint can inspect its siblings.
    siblings_by_path: dict[str, list[CodeChunkItem]] = {}
    for candidate in code_chunks:
        if candidate.path not in siblings_by_path:
            siblings_by_path[candidate.path] = []
        siblings_by_path[candidate.path].append(candidate)

    unique: list[dict] = []
    emitted: set[tuple[str, str, str, int | None, int | None]] = set()
    for row in entrypoint_rows:
        payload = _entrypoint_payload(row, siblings_by_path.get(row.path, []))
        identity = (
            str(payload.get("http_method") or ""),
            str(payload.get("route_path") or ""),
            str(payload.get("path") or ""),
            row.start_line,
            row.end_line,
        )
        if identity not in emitted:
            emitted.add(identity)
            unique.append(payload)

    unique.sort(key=_entrypoint_sort_key)
    return unique
|
||||
|
||||
|
||||
def _relation_payload(chunk: CodeChunkItem) -> dict:
    """Describe a dependency-graph chunk as a flat relation dict.

    Reads ``edge_type``, ``src_qname`` and ``dst_ref`` from the chunk's
    metadata (each coerced to a stripped string, empty when missing) and
    derives component names for both ends via ``_component_name``. The
    ``has_retrieval_label`` flag is true when either end contains the
    substring ``dataflow_slice``. Content is truncated to 500 characters.
    """
    meta = dict(chunk.metadata or {})
    kind, origin, destination = (
        str(meta.get(field) or "").strip()
        for field in ("edge_type", "src_qname", "dst_ref")
    )
    labelled = any("dataflow_slice" in ref for ref in (destination, origin))

    payload = {
        "path": chunk.path,
        "start_line": chunk.start_line,
        "end_line": chunk.end_line,
        "edge_type": kind,
        "source": origin,
        "target": destination,
        "source_component": _component_name(origin),
        "target_component": _component_name(destination),
        "has_retrieval_label": labelled,
        "metadata": meta,
        "content": chunk.content[:500],
    }
    return payload
|
||||
|
||||
|
||||
def _semantic_hint_payload(chunk: CodeChunkItem) -> dict:
|
||||
metadata = dict(chunk.metadata or {})
|
||||
return {
|
||||
"path": chunk.path,
|
||||
"title": chunk.title,
|
||||
"symbol": metadata.get("symbol_name") or chunk.title,
|
||||
"role": metadata.get("role"),
|
||||
"confidence": metadata.get("confidence"),
|
||||
"content": chunk.content[:300],
|
||||
}
|
||||
|
||||
|
||||
def _component_name(value: str) -> str:
|
||||
cleaned = str(value or "").replace(".dataflow_slice", "").strip(". ")
|
||||
if "." not in cleaned:
|
||||
return cleaned
|
||||
head, tail = cleaned.rsplit(".", 1)
|
||||
return head if tail and not tail[:1].isupper() else cleaned
|
||||
|
||||
|
||||
def _entrypoint_payload(chunk: CodeChunkItem, siblings: list[CodeChunkItem]) -> dict:
    """Build the normalized payload dict for one entrypoint chunk.

    Args:
        chunk: The entrypoint chunk. Its metadata may carry
            ``lang_payload.methods``, ``handler_symbol_id`` and
            ``route_or_command``.
        siblings: Chunks sharing the entrypoint's path; they are handed to
            ``_infer_route_path`` and ``_infer_handler_symbol``.

    Returns:
        A dict with the chunk location, the first declared HTTP method
        (upper-cased, ``""`` when none), the inferred route path and handler
        symbol, and the chunk's original metadata object.
    """
    # Tolerate missing metadata, matching the other payload builders here.
    metadata = dict(chunk.metadata or {})
    # ``.get(key, default)`` only covers absent keys; ``or`` also covers
    # keys that are present but explicitly None.
    lang_payload = metadata.get("lang_payload") or {}
    raw_methods = lang_payload.get("methods") or []
    if isinstance(raw_methods, str):
        # A bare string would otherwise be iterated character by character.
        raw_methods = [raw_methods]

    methods = [
        name
        for name in (str(item).strip().upper() for item in raw_methods)
        if name
    ]
    method = methods[0] if methods else ""
    route_path = _infer_route_path(chunk, siblings)
    handler_symbol = _infer_handler_symbol(chunk, siblings)
    return {
        "path": chunk.path,
        "title": chunk.title,
        "start_line": chunk.start_line,
        "end_line": chunk.end_line,
        "http_method": method,
        "route_path": route_path,
        "handler_symbol": handler_symbol,
        "handler_symbol_id": metadata.get("handler_symbol_id"),
        "route_or_command": metadata.get("route_or_command"),
        "metadata": chunk.metadata,
    }
|
||||
|
||||
|
||||
def _infer_route_path(chunk: CodeChunkItem, siblings: list[CodeChunkItem]) -> str:
    """Return the route string of the first route decorator found near the entrypoint.

    Only siblings from the ``C0_SOURCE_CHUNKS`` layer are searched, in order.
    For each one, ``_ROUTE_RE`` is applied to the text window that
    ``_window_text`` selects around ``chunk.start_line``. The stripped route
    of the first match is returned, or ``""`` when nothing matches.
    """
    source_chunks = (item for item in siblings if item.layer == "C0_SOURCE_CHUNKS")
    for source in source_chunks:
        window = _window_text(source, chunk.start_line)
        found = _ROUTE_RE.search(window)
        if found is not None:
            return found.group(2).strip()
    return ""
|
||||
|
||||
|
||||
def _infer_handler_symbol(chunk: CodeChunkItem, siblings: list[CodeChunkItem]) -> str:
    """Return the name of the first function defined near the entrypoint.

    Only siblings from the ``C0_SOURCE_CHUNKS`` layer are searched, in order.
    For each one, ``_DEF_RE`` is applied to a window of 2 lines before and
    12 lines after ``chunk.start_line`` as selected by ``_window_text``.
    The name from the first match is returned, or ``""`` when nothing matches.
    """
    source_chunks = (item for item in siblings if item.layer == "C0_SOURCE_CHUNKS")
    for source in source_chunks:
        window = _window_text(source, chunk.start_line, before=2, after=12)
        found = _DEF_RE.search(window)
        if found is None:
            continue
        # Group 1 is the ``async def`` name, group 2 the plain ``def`` name.
        async_name, sync_name = found.group(1), found.group(2)
        return async_name or sync_name or ""
    return ""
|
||||
|
||||
|
||||
def _window_text(chunk: CodeChunkItem, target_line: int | None, *, before: int = 6, after: int = 8) -> str:
|
||||
text = chunk.content or ""
|
||||
if not text or not target_line or not chunk.start_line:
|
||||
return text
|
||||
lines = text.splitlines()
|
||||
offset = max(0, target_line - chunk.start_line - before)
|
||||
end = min(len(lines), target_line - chunk.start_line + after)
|
||||
if offset >= end:
|
||||
return text
|
||||
return "\n".join(lines[offset:end])
|
||||
|
||||
|
||||
def _entrypoint_sort_key(item: dict) -> tuple[int, int, str, str, int]:
|
||||
route_path = str(item.get("route_path") or "")
|
||||
method = str(item.get("http_method") or "")
|
||||
score = 0
|
||||
if route_path:
|
||||
score += 3
|
||||
if route_path and route_path != "/":
|
||||
score += 3
|
||||
if method:
|
||||
score += 1
|
||||
return (-score, len(route_path or "zzzz"), method, str(item.get("path") or ""), int(item.get("start_line") or 0))
|
||||
|
||||
|
||||
def _layer_outcomes(raw_rows: list[dict], executed: list[str], layers_seen: set[str], report: dict) -> list[LayerOutcome]:
|
||||
layer_outcomes = [
|
||||
LayerOutcome(
|
||||
layer_id=layer_id,
|
||||
hit_count=sum(1 for r in raw_rows if str(r.get("layer") or "") == layer_id),
|
||||
empty=layer_id not in layers_seen,
|
||||
fallback_used=bool(
|
||||
(report.get("fallback") or {}).get("used")
|
||||
and report.get("retrieval_mode_by_layer", {}).get(layer_id)
|
||||
),
|
||||
fallback_used=bool((report.get("fallback") or {}).get("used") and report.get("retrieval_mode_by_layer", {}).get(layer_id)),
|
||||
)
|
||||
for layer_id in executed
|
||||
]
|
||||
@@ -80,21 +239,4 @@ def build_retrieval_result(
|
||||
fallback_used=False,
|
||||
)
|
||||
)
|
||||
|
||||
resolved = str(sym.get("resolved_symbol") or "").strip() or None
|
||||
status = str(sym.get("status") or "not_requested")
|
||||
|
||||
return RetrievalResult(
|
||||
target_symbol_candidates=target_symbols,
|
||||
resolved_symbol=resolved,
|
||||
symbol_resolution_status=status,
|
||||
file_candidates=file_candidates,
|
||||
code_chunks=code_chunks,
|
||||
relations=relations,
|
||||
entrypoints=entrypoints,
|
||||
test_candidates=test_candidates,
|
||||
layer_outcomes=layer_outcomes,
|
||||
missing_layers=missing_layers,
|
||||
raw_rows=raw_rows,
|
||||
retrieval_report=report,
|
||||
)
|
||||
return layer_outcomes
|
||||
|
||||
@@ -20,6 +20,6 @@ class GigaChatSettings:
|
||||
scope=os.getenv("GIGACHAT_SCOPE", "GIGACHAT_API_PERS"),
|
||||
credentials=os.getenv("GIGACHAT_TOKEN", "").strip(),
|
||||
ssl_verify=os.getenv("GIGACHAT_SSL_VERIFY", "true").lower() in {"1", "true", "yes"},
|
||||
model=os.getenv("GIGACHAT_MODEL", "GigaChat"),
|
||||
model=os.getenv("GIGACHAT_MODEL", "GigaChat-Pro"),
|
||||
embedding_model=os.getenv("GIGACHAT_EMBEDDING_MODEL", "Embeddings"),
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user