ййй
This commit is contained in:
@@ -0,0 +1 @@
|
||||
# Docs QA golden evaluation harness.
|
||||
@@ -0,0 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@dataclass(slots=True, frozen=True)
|
||||
class DocsEvalConfig:
|
||||
golden_cases_path: Path = Path("tests/golden/docs_qa/cases.yaml")
|
||||
pipeline_mode: str = "pre_llm_only"
|
||||
@@ -0,0 +1,108 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class InMemoryDocsRetrievalAdapter:
    """Retrieval adapter that serves docs rows from an in-memory list.

    Rows are plain dicts. The adapter reads their ``layer``, ``path``,
    ``title``, ``span_start``, ``span_end`` and ``metadata`` keys; missing or
    falsy values are treated as empty. Every ``retrieve_with_plan`` call also
    records a report (planned/executed layers, per-layer diagnostics, fallback
    usage) that ``consume_retrieval_report`` returns.
    """

    _CHUNK_LAYER = "D0_DOC_CHUNKS"
    _FACT_LAYER = "D2_FACT_INDEX"
    _RELATION_LAYER = "D5_RELATION_GRAPH"
    # Layers whose row paths seed the targeted chunk fallback.
    _SUPPORT_LAYERS = frozenset(
        {
            "D1_DOCUMENT_CATALOG",
            "D2_FACT_INDEX",
            "D3_ENTITY_CATALOG",
            "D4_WORKFLOW_INDEX",
            "D5_RELATION_GRAPH",
        }
    )
    # Layers skipped for RELATED_DOCS_EXPLAIN once enough relation rows exist.
    _RELATION_SKIPPABLE_LAYERS = frozenset({"D1_DOCUMENT_CATALOG", "D0_DOC_CHUNKS"})
    _RELATION_SUFFICIENT_HITS = 2

    def __init__(self, rows: list[dict]) -> None:
        """Keep a shallow copy of *rows* and start with an empty report."""
        self._rows = list(rows)
        self._report: dict = {}

    def retrieve_with_plan(self, rag_session_id: str, query: str, retrieval_spec, retrieval_constraints=None, *, query_plan=None) -> list[dict]:
        """Return the stored rows for every planned layer and record a report.

        Args:
            rag_session_id: Unused by this in-memory implementation.
            query: Unused by this in-memory implementation.
            retrieval_spec: Object whose ``layer_queries`` items expose a
                ``layer_id``; those ids are the planned layers.
            retrieval_constraints: Unused by this in-memory implementation.
            query_plan: Optional object; only its ``sub_intent`` attribute is
                read (absent or falsy counts as ``""``).

        Returns:
            De-duplicated rows, in planned-layer order, followed by any rows
            added by the targeted chunk fallback.
        """
        # Materialise once so a one-shot ``layer_queries`` iterable still works.
        planned_layers = [str(item.layer_id) for item in retrieval_spec.layer_queries]
        sub_intent = str(getattr(query_plan, "sub_intent", "") or "")
        relation_hits = sum(1 for row in self._rows if self._layer_of(row) == self._RELATION_LAYER)
        # A set gives O(1) membership checks in the fallback filter below.
        support_paths = self._collect_support_paths()

        result: list[dict] = []
        executed_layers: list[str] = []
        layer_diagnostics: dict[str, dict] = {}
        # Published before the loop: the report shares these containers, so it
        # reflects progress even if a later step raises.
        self._report = {
            "planned_layers": planned_layers,
            "executed_layers": executed_layers,
            "retrieval_mode_by_layer": {},
            "fallback": {"used": False},
            "layer_diagnostics": layer_diagnostics,
        }

        relation_sufficient = (
            sub_intent == "RELATED_DOCS_EXPLAIN"
            and relation_hits >= self._RELATION_SUFFICIENT_HITS
        )
        for layer_id in planned_layers:
            if relation_sufficient and layer_id in self._RELATION_SKIPPABLE_LAYERS:
                layer_diagnostics[layer_id] = {"hits": 0, "top_ids": [], "skipped": True, "reason": "relation_primary_sufficient"}
                continue
            executed_layers.append(layer_id)
            layer_rows = [row for row in self._rows if self._layer_of(row) == layer_id]
            result.extend(layer_rows)
            layer_diagnostics[layer_id] = self._layer_diagnostics(layer_rows)

        chunks_planned = self._CHUNK_LAYER in planned_layers
        facts_empty = (
            self._FACT_LAYER in planned_layers
            and self._hits(layer_diagnostics, self._FACT_LAYER) == 0
        )
        chunks_empty = chunks_planned and self._hits(layer_diagnostics, self._CHUNK_LAYER) == 0

        fallback_used = False
        fallback_reason = None
        if support_paths and chunks_planned and (facts_empty or chunks_empty):
            # Pull chunk rows that belong to documents seen in support layers.
            targeted = [
                row
                for row in self._rows
                if self._layer_of(row) == self._CHUNK_LAYER and self._path_of(row) in support_paths
            ]
            merged = self._dedupe([*result, *targeted])
            new_targeted = self._subtract_rows(merged, result)
            result = merged
            layer_diagnostics[self._CHUNK_LAYER] = self._layer_diagnostics(
                [row for row in result if self._layer_of(row) == self._CHUNK_LAYER]
            )
            # Only count as a fallback when it contributed rows not already present.
            if new_targeted:
                fallback_used = True
                fallback_reason = "targeted_chunk_retrieval"

        self._report["fallback"] = {"used": fallback_used, "reason": fallback_reason}
        return self._dedupe(result)

    def retrieve_exact_files(self, rag_session_id: str, *, repo_id=None, paths: list[str], layers=None, limit: int = 200, query: str = "", ranking_profile: str = "") -> list[dict]:
        """Return an empty list; exact-file retrieval is not simulated here."""
        return []

    def hydrate_resolved_symbol_sources(self, rag_session_id: str, base_query: str, rag_rows: list[dict], symbol_resolution: dict, retrieval_spec, retrieval_constraints=None) -> list[dict]:
        """Return a shallow copy of *rag_rows* unchanged."""
        return list(rag_rows)

    def force_symbol_context_c0(self, rag_session_id: str, *, rag_rows: list[dict], symbol_resolution: dict, limit: int = 20) -> list[dict]:
        """Return a shallow copy of *rag_rows* unchanged."""
        return list(rag_rows)

    def consume_retrieval_report(self) -> dict:
        """Return a shallow copy of the report from the latest retrieval."""
        return dict(self._report)

    @staticmethod
    def _layer_of(row: dict) -> str:
        """Return the row's layer id, or ``""`` when missing or falsy."""
        return str(row.get("layer") or "")

    @staticmethod
    def _path_of(row: dict) -> str:
        """Return the row's whitespace-stripped path, or ``""`` when missing."""
        return str(row.get("path") or "").strip()

    @staticmethod
    def _hits(layer_diagnostics: dict[str, dict], layer_id: str) -> int:
        """Return the recorded hit count for *layer_id* (``0`` when absent)."""
        return int((layer_diagnostics.get(layer_id) or {}).get("hits") or 0)

    def _collect_support_paths(self) -> set[str]:
        """Return the non-empty paths of all stored rows in support layers."""
        paths: set[str] = set()
        for row in self._rows:
            if self._layer_of(row) not in self._SUPPORT_LAYERS:
                continue
            path = self._path_of(row)
            if path:
                paths.add(path)
        return paths

    def _layer_diagnostics(self, rows: list[dict]) -> dict:
        """Summarise *rows*: total hits plus unique ids/titles of the first five."""
        top_ids: list[str] = []
        top_sections: list[str] = []
        for row in rows[:5]:
            metadata = dict(row.get("metadata") or {})
            # First available identifier wins; the path is the last resort.
            candidate = (
                metadata.get("document_id")
                or metadata.get("doc_id")
                or metadata.get("fact_id")
                or metadata.get("relation_id")
                or row.get("path")
            )
            value = str(candidate or "").strip()
            if value and value not in top_ids:
                top_ids.append(value)
            title = str(row.get("title") or "").strip()
            if title and title not in top_sections:
                top_sections.append(title)
        return {"hits": len(rows), "top_ids": top_ids, "top_sections": top_sections}

    def _dedupe(self, rows: list[dict]) -> list[dict]:
        """Return *rows* without repeated row keys, keeping first occurrences."""
        result: list[dict] = []
        seen: set[tuple[str, str, str, int | None, int | None]] = set()
        for row in rows:
            key = self._row_key(row)
            if key in seen:
                continue
            seen.add(key)
            result.append(row)
        return result

    def _subtract_rows(self, rows: list[dict], baseline: list[dict]) -> list[dict]:
        """Return the rows of *rows* whose key does not occur in *baseline*."""
        baseline_keys = {self._row_key(row) for row in baseline}
        return [row for row in rows if self._row_key(row) not in baseline_keys]

    def _row_key(self, row: dict) -> tuple[str, str, str, int | None, int | None]:
        """Return the identity tuple (layer, path, title, span start, span end)."""
        return (
            str(row.get("layer") or ""),
            str(row.get("path") or ""),
            str(row.get("title") or ""),
            row.get("span_start"),
            row.get("span_end"),
        )
|
||||
@@ -0,0 +1,32 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
@dataclass(slots=True, frozen=True)
|
||||
class DocsGoldenCase:
|
||||
case_id: str
|
||||
query: str
|
||||
expected_intent: str
|
||||
expected_sub_intent: str
|
||||
expected_layers: tuple[str, ...]
|
||||
rows: tuple[dict, ...]
|
||||
|
||||
|
||||
def load_cases(path: Path) -> list[DocsGoldenCase]:
    """Load the golden cases listed under the ``cases`` key of a YAML file.

    An empty document or a missing/empty ``cases`` entry yields an empty list.
    Each entry must provide ``id``, ``query``, ``expected_intent`` and
    ``expected_sub_intent``; ``expected_layers`` and ``rows`` are optional.

    Raises:
        KeyError: If an entry lacks one of the required keys.
    """
    document = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    loaded: list[DocsGoldenCase] = []
    for entry in document.get("cases") or []:
        loaded.append(
            DocsGoldenCase(
                case_id=str(entry["id"]),
                query=str(entry["query"]),
                expected_intent=str(entry["expected_intent"]),
                expected_sub_intent=str(entry["expected_sub_intent"]),
                expected_layers=tuple(entry.get("expected_layers") or ()),
                rows=tuple(entry.get("rows") or ()),
            )
        )
    return loaded
|
||||
@@ -0,0 +1,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from tests.docs_qa_eval.config import DocsEvalConfig
|
||||
from tests.docs_qa_eval.runner import run_eval
|
||||
|
||||
|
||||
def main() -> int:
    """Run the docs QA golden evaluation and return a process exit code.

    A case counts as failed when any of its recorded checks is false,
    including ``gate_ok`` and ``openapi_output_ok``. The ids of failed cases
    are written to stderr.

    Returns:
        ``0`` when no case failed, ``1`` otherwise.
    """
    import sys  # local import: only needed to report failures

    results = run_eval(DocsEvalConfig())
    failed = [
        item.case.case_id
        for item in results
        if not (
            item.intent_ok
            and item.layers_ok
            and item.retrieval_non_empty
            and item.openapi_complete
            and item.gate_ok
            and item.openapi_output_ok
        )
    ]
    if failed:
        print("docs QA eval failed cases: " + ", ".join(failed), file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,51 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from app.modules.agent.intent_router_v2 import IntentRouterV2
|
||||
from app.modules.agent.runtime.docs_qa_pipeline import DocsQAPipelineRunner
|
||||
from tests.docs_qa_eval.config import DocsEvalConfig
|
||||
from tests.docs_qa_eval.fixture_adapter import InMemoryDocsRetrievalAdapter
|
||||
from tests.docs_qa_eval.golden_loader import DocsGoldenCase, load_cases
|
||||
from tests.unit_tests.rag.intent_router_testkit import repo_context
|
||||
|
||||
|
||||
@dataclass(slots=True, frozen=True)
|
||||
class DocsEvalCaseResult:
|
||||
case: DocsGoldenCase
|
||||
intent_ok: bool
|
||||
layers_ok: bool
|
||||
retrieval_non_empty: bool
|
||||
openapi_complete: bool
|
||||
gate_ok: bool
|
||||
openapi_output_ok: bool
|
||||
|
||||
|
||||
def run_eval(config: DocsEvalConfig) -> list[DocsEvalCaseResult]:
    """Run every golden case through the docs QA pipeline and record its checks.

    One router is shared by all cases; each case gets its own pipeline whose
    retrieval adapter serves that case's fixture rows.

    Args:
        config: Supplies the golden cases path and the pipeline mode.

    Returns:
        One ``DocsEvalCaseResult`` per loaded case, in file order.
    """
    router = IntentRouterV2()
    results: list[DocsEvalCaseResult] = []
    for case in load_cases(config.golden_cases_path):
        pipeline = DocsQAPipelineRunner(
            router=router,
            retrieval_adapter=InMemoryDocsRetrievalAdapter(list(case.rows)),
            repo_context=repo_context(),
        )
        result = pipeline.run(case.query, rag_session_id="docs-test-session", mode=config.pipeline_mode)
        router_result = result.router_result
        diagnostics = result.diagnostics
        actual_layers = tuple(item.layer_id for item in router_result.retrieval_spec.layer_queries)

        # The gate check applies to every case regardless of intent.
        gate_ok = diagnostics.gate_decision in {"allow", "partial"}
        # OpenAPI checks pass by default and are evaluated only for OpenAPI cases.
        openapi_complete = True
        openapi_output_ok = True
        if case.expected_intent == "OPENAPI_GENERATION":
            status = diagnostics.openapi_status
            # bool(): the result field is typed ``bool``; don't store raw truthy values.
            openapi_complete = bool(status["has_path"] and status["has_method"])
            answer = result.answer
            openapi_output_ok = bool(answer.strip()) and ("paths:" in answer or "type: object" in answer)

        results.append(
            DocsEvalCaseResult(
                case=case,
                intent_ok=router_result.intent == case.expected_intent
                and router_result.query_plan.sub_intent == case.expected_sub_intent,
                layers_ok=actual_layers == case.expected_layers,
                retrieval_non_empty=bool(result.raw_rows),
                openapi_complete=openapi_complete,
                gate_ok=gate_ok,
                openapi_output_ok=openapi_output_ok,
            )
        )
    return results
|
||||
@@ -0,0 +1,26 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.docs_qa_eval.config import DocsEvalConfig
|
||||
from tests.docs_qa_eval.golden_loader import load_cases
|
||||
from tests.docs_qa_eval.runner import run_eval
|
||||
|
||||
|
||||
# Mark every test in this module as part of the docs QA evaluation suite.
pytestmark = pytest.mark.docs_qa_eval


def test_load_cases_has_minimum_suite() -> None:
    """The default golden file must contain at least 35 cases."""
    default_path = DocsEvalConfig().golden_cases_path
    loaded = load_cases(default_path)
    assert len(loaded) >= 35
|
||||
|
||||
|
||||
def test_run_eval_all_cases_pass_core_checks() -> None:
    """Every case must pass the core checks; OpenAPI checks apply to OpenAPI cases only.

    Each assertion names the failing case ids so a red run points at the
    offending golden cases instead of a bare ``all(...)`` failure.
    """
    results = run_eval(DocsEvalConfig())
    assert results, "docs QA eval produced no results"

    def failing_ids(check: str, items) -> list[str]:
        # Ids of the cases whose ``check`` flag is false.
        return [item.case.case_id for item in items if not getattr(item, check)]

    for check in ("intent_ok", "layers_ok", "retrieval_non_empty", "gate_ok"):
        failed = failing_ids(check, results)
        assert not failed, f"{check} failed for cases: {failed}"

    openapi_results = [item for item in results if item.case.expected_intent == "OPENAPI_GENERATION"]
    for check in ("openapi_complete", "openapi_output_ok"):
        failed = failing_ids(check, openapi_results)
        assert not failed, f"{check} failed for OpenAPI cases: {failed}"
|
||||
Reference in New Issue
Block a user