Фикс состояния
This commit is contained in:
@@ -0,0 +1,35 @@
|
||||
# CODE_QA evaluation harness
|
||||
|
||||
Runs the canonical CODE_QA pipeline (IntentRouterV2 → retrieval → evidence gate → diagnostics) over golden cases and writes artifacts for calibration.
|
||||
|
||||
## Modes
|
||||
|
||||
- **Fixture (default):** Uses `tests/pipeline_setup/suite_01_synthetic/fixtures/code_qa_repo`. No env vars required.
|
||||
- **Local repo:** Set `CODE_QA_REPO_PATH` to a directory; optionally `CODE_QA_PROJECT_ID`.
|
||||
|
||||
## Run
|
||||
|
||||
From the **project root** (agent repo):
|
||||
|
||||
```bash
|
||||
python -m tests.pipeline_setup.suite_01_synthetic.code_qa_eval.run
|
||||
```
|
||||
|
||||
Requires a configured database (same as pipeline_intent_rag router_rag tests). Outputs:
|
||||
|
||||
- `tests/pipeline_setup/suite_01_synthetic/test_results/code_qa_eval/<run_id>/*.md` and `*.json` per case
|
||||
- `tests/pipeline_setup/suite_01_synthetic/test_results/code_qa_eval/summary_<run_id>.md` batch summary
|
||||
|
||||
Exit code 0 if all golden cases pass, 1 otherwise.
|
||||
|
||||
## Golden cases
|
||||
|
||||
Edit `tests/pipeline_setup/suite_01_synthetic/golden/code_qa/cases.yaml` to add or change cases. See `tests/pipeline_setup/suite_01_synthetic/golden/code_qa/README.md` for the field format.
|
||||
|
||||
## Tests
|
||||
|
||||
```bash
|
||||
pytest tests/pipeline_setup/suite_01_synthetic/code_qa_eval/ -v
|
||||
```
|
||||
|
||||
The fixture-mode integration test (`test_run_eval_fixture_mode_structure`) is skipped if the DB or dependencies are not available.
|
||||
@@ -0,0 +1 @@
|
||||
"""CODE_QA pipeline calibration: golden runner, diagnostics artifacts, fixture and real-repo support."""
|
||||
@@ -0,0 +1,152 @@
|
||||
"""Write diagnostics artifacts and batch summary for CODE_QA evaluation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from tests.pipeline_setup.suite_01_synthetic.code_qa_eval.runner import EvalCaseResult
|
||||
|
||||
|
||||
def dump_run_artifact(
    result: EvalCaseResult,
    out_dir: Path,
    *,
    run_id: str = "",
) -> None:
    """Persist one evaluated case as a markdown summary plus a JSON detail file.

    Both files are written into ``out_dir`` (created on demand) and share one
    stem: the case id, prefixed with ``<run_id>_`` when ``run_id`` is non-empty.

    Args:
        result: Evaluated case to dump.
        out_dir: Target directory for the two artifact files.
        run_id: Optional run identifier used as a filename prefix.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    stem = f"{run_id}_{result.case.id}" if run_id else result.case.id

    # Human-readable report first, then the machine-readable detail.
    markdown_text = _run_markdown(result)
    (out_dir / f"{stem}.md").write_text(markdown_text, encoding="utf-8")

    json_text = json.dumps(_run_json(result), ensure_ascii=False, indent=2)
    (out_dir / f"{stem}.json").write_text(json_text, encoding="utf-8")
|
||||
|
||||
|
||||
def _run_markdown(result: EvalCaseResult) -> str:
    """Render one evaluated case as a markdown report for manual review.

    Sections, in order: title, query, expected values, actual values, the
    PASS/FAIL verdict, an optional mismatch list, router details, retrieval
    details, evidence-gate failure reasons and timings.

    Returns:
        The report as a single newline-joined string.
    """
    case = result.case
    pipeline = result.pipeline_result
    report = pipeline.diagnostics_report
    router = pipeline.router_result

    plan = router.query_plan
    actual_sub_intent = plan.sub_intent if plan else "—"
    verdict = "PASS" if result.passed else "FAIL"

    out: list[str] = [f"# {case.id}", ""]
    out += ["## Query", case.query, ""]
    out += [
        "## Expected",
        f"- intent: {case.expected_intent}, sub_intent: {case.expected_sub_intent}",
        f"- answer_mode: {case.expected_answer_mode}",
        "",
    ]
    out += [
        "## Actual",
        f"- intent: {router.intent}, sub_intent: {actual_sub_intent}",
        f"- answer_mode: {pipeline.answer_mode}",
        f"- evidence_gate_passed: {pipeline.evidence_gate_passed}",
        f"- evidence_count: {pipeline.evidence_bundle.evidence_count}",
        "",
    ]
    out += ["## Result", verdict, ""]

    # The mismatch section is emitted only when there is something to list.
    if result.mismatches:
        out.append("## Mismatches")
        out.extend(f"- {item}" for item in result.mismatches)
        out.append("")

    spec = router.retrieval_spec
    path_scope = list(getattr(spec.filters, "path_scope", []) or [])
    layer_ids = [query.layer_id for query in (spec.layer_queries or [])]
    out += [
        "## Router",
        f"- path_scope: {path_scope}",
        f"- layers: {layer_ids}",
        "",
    ]

    retrieval = pipeline.retrieval_result
    outcomes = [(outcome.layer_id, outcome.hit_count) for outcome in retrieval.layer_outcomes]
    out += [
        "## Retrieval",
        f"- requested_layers: {pipeline.retrieval_request.requested_layers}",
        f"- chunk_count: {len(retrieval.code_chunks)}",
        f"- layer_outcomes: {outcomes}",
        "",
    ]

    # Without a diagnostics report the failure list is shown as empty.
    failure_reasons = report.failure_reasons if report else []
    out += ["## Evidence gate", f"- failure_reasons: {failure_reasons}", ""]
    out += ["## Timings (ms)", f"{pipeline.timings_ms}", ""]
    return "\n".join(out)
|
||||
|
||||
|
||||
def _run_json(result: EvalCaseResult) -> dict:
    """Build the detail record for one evaluated case, ready for ``json.dumps``.

    The record holds the case id and query, the expected and actual values,
    the verdict with its mismatches, the fields copied from the diagnostics
    report (empty placeholders when no report is attached) and the timings.
    """
    case = result.case
    pipeline = result.pipeline_result
    report = pipeline.diagnostics_report
    router = pipeline.router_result
    plan = router.query_plan

    expected = {
        "intent": case.expected_intent,
        "sub_intent": case.expected_sub_intent,
        "answer_mode": case.expected_answer_mode,
    }
    actual = {
        "intent": router.intent,
        "sub_intent": plan.sub_intent if plan else None,
        "answer_mode": pipeline.answer_mode,
        "evidence_gate_passed": pipeline.evidence_gate_passed,
        "evidence_count": pipeline.evidence_bundle.evidence_count,
    }

    if report:
        diagnostics = {
            "router_result": report.router_result,
            "retrieval_request": report.retrieval_request,
            "per_layer_outcome": report.per_layer_outcome,
            "failure_reasons": report.failure_reasons,
        }
    else:
        diagnostics = {
            "router_result": {},
            "retrieval_request": {},
            "per_layer_outcome": [],
            "failure_reasons": [],
        }

    # Key order is kept stable so the dumped JSON files stay diff-friendly.
    return {
        "case_id": case.id,
        "query": case.query,
        "expected": expected,
        "actual": actual,
        "passed": result.passed,
        "mismatches": result.mismatches,
        **diagnostics,
        "timings_ms": pipeline.timings_ms,
    }
|
||||
|
||||
|
||||
def _summary_cell(value: object, max_len: int | None = None) -> str:
    """Format ``value`` as the content of one markdown table cell.

    The text is optionally truncated to ``max_len`` characters (an ellipsis
    marks the cut), line breaks are collapsed to spaces and ``|`` is escaped,
    so arbitrary query or target text cannot break the table row.
    """
    text = f"{value}"
    if max_len is not None and len(text) > max_len:
        text = text[:max_len] + "…"
    # Truncate first, escape second: the escape sequence is never cut in half.
    return " ".join(text.splitlines()).replace("|", "\\|")


def write_batch_summary(
    results: list[EvalCaseResult],
    out_dir: Path,
    *,
    run_id: str = "",
) -> Path:
    """Write a single readable batch summary; returns path to the file.

    The summary contains the pass count, one table row per case and a
    ``## Failures`` section listing the mismatches of every failed case.

    Args:
        results: Evaluated cases, in the order they should appear.
        out_dir: Directory for the summary file (created if missing).
        run_id: Used in the file name ``summary_<run_id>.md``; when empty, a
            local timestamp (``YYYYmmdd_HHMMSS``) is used instead.

    Returns:
        Path of the written summary file.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    passed = sum(1 for r in results if r.passed)
    total = len(results)
    stamp = run_id or datetime.now().strftime("%Y%m%d_%H%M%S")
    path = out_dir / f"summary_{stamp}.md"

    lines = [
        "# CODE_QA evaluation summary",
        "",
        f"**{passed}/{total}** cases passed.",
        "",
        "| Case ID | Query | Expected scenario | Actual scenario | Target | Evidence | Answer mode | Pass |",
        "|---------|-------|------------------|-----------------|--------|----------|-------------|------|",
    ]
    for r in results:
        c = r.case
        p = r.pipeline_result
        plan = p.router_result.query_plan
        sub = plan.sub_intent if plan else "—"
        resolved_target = p.evidence_bundle.resolved_target
        target = _summary_cell(resolved_target, 40) if resolved_target else "—"
        ev = "✓" if p.evidence_gate_passed else "✗"
        pass_mark = "✓" if r.passed else "✗"
        lines.append(
            f"| {_summary_cell(c.id)} | {_summary_cell(c.query, 40)} "
            f"| {_summary_cell(c.expected_sub_intent)} | {_summary_cell(sub)} "
            f"| {target} | {ev} | {_summary_cell(p.answer_mode)} | {pass_mark} |"
        )

    lines.append("")
    lines.append("## Failures")
    # Bullets sit outside the table, so their text needs no cell escaping.
    lines.extend(
        f"- **{r.case.id}**: {'; '.join(r.mismatches)}"
        for r in results
        if not r.passed and r.mismatches
    )
    path.write_text("\n".join(lines), encoding="utf-8")
    return path
|
||||
@@ -0,0 +1,41 @@
|
||||
"""Eval harness config: fixture vs user-provided repo path, artifact output."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class EvalConfig:
|
||||
"""Configuration for CODE_QA evaluation runs."""
|
||||
|
||||
repo_path: Path
|
||||
test_results_dir: Path
|
||||
golden_cases_path: Path
|
||||
project_id: str | None = None
|
||||
use_fixture: bool = True
|
||||
|
||||
@classmethod
|
||||
def from_env(cls, project_root: Path | None = None) -> "EvalConfig":
|
||||
"""Build config: fixture repo by default; optional CODE_QA_REPO_PATH for local calibration."""
|
||||
root = project_root or Path(__file__).resolve().parent.parent
|
||||
raw_repo = os.getenv("CODE_QA_REPO_PATH", "").strip()
|
||||
if raw_repo:
|
||||
repo_path = Path(raw_repo).expanduser().resolve()
|
||||
use_fixture = False
|
||||
project_id = os.getenv("CODE_QA_PROJECT_ID", "").strip() or repo_path.name
|
||||
else:
|
||||
repo_path = root / "fixtures" / "code_qa_repo"
|
||||
use_fixture = True
|
||||
project_id = "code_qa_repo"
|
||||
test_results_dir = root / "test_results" / "code_qa_eval"
|
||||
golden_cases_path = root / "golden" / "code_qa" / "cases.yaml"
|
||||
return cls(
|
||||
repo_path=repo_path,
|
||||
test_results_dir=test_results_dir,
|
||||
golden_cases_path=golden_cases_path,
|
||||
project_id=project_id,
|
||||
use_fixture=use_fixture,
|
||||
)
|
||||
@@ -0,0 +1,51 @@
|
||||
"""Load golden cases from YAML for CODE_QA evaluation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class GoldenCase:
|
||||
"""Single golden case for pipeline evaluation."""
|
||||
|
||||
id: str
|
||||
query: str
|
||||
expected_intent: str
|
||||
expected_sub_intent: str
|
||||
expected_answer_mode: str = "normal"
|
||||
expected_target_hint: str | None = None
|
||||
expected_path_scope_contains: list[str] = field(default_factory=list)
|
||||
expected_symbol_candidates_contain: list[str] = field(default_factory=list)
|
||||
expected_layers: list[str] = field(default_factory=list)
|
||||
notes: str = ""
|
||||
|
||||
|
||||
def load_golden_cases(path: Path) -> list[GoldenCase]:
    """Load and parse golden cases from YAML.

    The file is expected to be a mapping with a ``cases`` list. Loading is
    lenient: a missing file, an empty file, a non-mapping document or a
    non-list ``cases`` value all yield an empty list. Entries that are not
    mappings, or that lack a non-empty ``id`` or ``query``, are skipped.

    Args:
        path: Location of the golden cases YAML file.

    Returns:
        Parsed cases in file order.
    """
    if not path.exists():
        return []
    with path.open(encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # A document whose root is a list or scalar has no "cases" key to read.
    if not isinstance(data, dict):
        return []
    cases_raw = data.get("cases")
    if not isinstance(cases_raw, list):
        return []

    out: list[GoldenCase] = []
    for c in cases_raw:
        if not isinstance(c, dict) or not c.get("id") or not c.get("query"):
            continue
        out.append(
            GoldenCase(
                id=str(c["id"]),
                query=str(c["query"]),
                expected_intent=str(c.get("expected_intent", "CODE_QA")),
                expected_sub_intent=str(c.get("expected_sub_intent", "EXPLAIN")),
                expected_answer_mode=str(c.get("expected_answer_mode", "normal")),
                expected_target_hint=c.get("expected_target_hint"),
                expected_path_scope_contains=list(c.get("expected_path_scope_contains") or []),
                expected_symbol_candidates_contain=list(c.get("expected_symbol_candidates_contain") or []),
                expected_layers=list(c.get("expected_layers") or []),
                notes=str(c.get("notes") or ""),
            )
        )
    return out
|
||||
@@ -0,0 +1,111 @@
|
||||
"""Entrypoint: run CODE_QA golden evaluation and write artifacts + summary."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Make the agent repo root (four directories above this file's folder) and its
# optional ``src`` directory importable before the project imports below run.
_agent_root = Path(__file__).resolve().parents[4]
_src = _agent_root / "src"

if str(_agent_root) not in sys.path:
    sys.path.insert(0, str(_agent_root))
# ``src`` is inserted last so it ends up ahead of the repo root on sys.path.
if _src.exists() and str(_src) not in sys.path:
    sys.path.insert(0, str(_src))
|
||||
|
||||
# Load .env from project root so DATABASE_URL is available
|
||||
from app.modules.shared.env_loader import load_workspace_env
|
||||
|
||||
from tests.pipeline_setup.suite_01_synthetic.code_qa_eval.artifacts import dump_run_artifact, write_batch_summary
|
||||
from tests.pipeline_setup.suite_01_synthetic.code_qa_eval.config import EvalConfig
|
||||
from tests.pipeline_setup.suite_01_synthetic.code_qa_eval.runner import run_eval
|
||||
|
||||
|
||||
class _TeeStream:
    """Write-only stream that duplicates everything written to several streams.

    Used to replace ``sys.stdout`` / ``sys.stderr`` so console output also
    lands in a log file. Attributes other than ``write`` and ``flush`` (for
    example ``isatty``, ``encoding`` or ``fileno``) are delegated to the first
    wrapped stream, so code that inspects the standard streams keeps working
    while the tee is installed.
    """

    def __init__(self, *streams) -> None:
        self._streams = streams

    def write(self, data: str) -> int:
        """Write ``data`` to every wrapped stream; return ``len(data)``."""
        for stream in self._streams:
            stream.write(data)
        return len(data)

    def flush(self) -> None:
        """Flush every wrapped stream."""
        for stream in self._streams:
            stream.flush()

    def __getattr__(self, name: str):
        """Delegate unknown attributes to the first wrapped stream."""
        # Read through __dict__ so a lookup before __init__ cannot recurse.
        streams = self.__dict__.get("_streams", ())
        if not streams:
            raise AttributeError(name)
        return getattr(streams[0], name)
|
||||
|
||||
|
||||
def _check_db_available() -> bool:
    """Probe the database with ``SELECT 1``.

    Returns:
        True when a connection could be opened and the probe query ran;
        False otherwise. On failure the reason is printed to stderr so a
        missing dependency can be told apart from an unreachable server.
    """
    try:
        # Imported lazily so a missing dependency counts as "unavailable".
        from sqlalchemy import text

        from app.modules.shared.db import get_engine

        with get_engine().connect() as conn:
            conn.execute(text("SELECT 1"))
    except Exception as exc:
        # Deliberately broad: this is a best-effort availability probe.
        print(f"Database check failed: {type(exc).__name__}: {exc}", file=sys.stderr)
        return False
    return True
|
||||
|
||||
|
||||
def main() -> None:
    """Run the evaluation with console output mirrored into the run directory.

    Loads the workspace ``.env``, builds the config, creates
    ``<test_results_dir>/<run_id>/`` and tees stdout and stderr into
    ``console_output.txt`` there while the evaluation runs. The original
    streams are always restored, including when the run ends via
    ``sys.exit``.
    """
    here = Path(__file__).resolve()
    # Agent repo root for .env: the same level as the module-level
    # ``_agent_root`` (parents[4]); parents[3] is only the ``tests`` directory.
    workspace_root = here.parents[4]
    # Suite root that holds fixtures/, golden/ and test_results/.
    pipeline_root = here.parents[1]
    load_workspace_env(workspace_root)
    config = EvalConfig.from_env(project_root=pipeline_root)

    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_run_dir = config.test_results_dir / run_id
    results_run_dir.mkdir(parents=True, exist_ok=True)
    console_log_path = results_run_dir / "console_output.txt"

    original_stdout = sys.stdout
    original_stderr = sys.stderr
    with console_log_path.open("w", encoding="utf-8") as log_file:
        sys.stdout = _TeeStream(original_stdout, log_file)
        sys.stderr = _TeeStream(original_stderr, log_file)
        try:
            _run_with_logging(config, results_run_dir, run_id)
        finally:
            # Flush the tees before the log file closes, then restore.
            sys.stdout.flush()
            sys.stderr.flush()
            sys.stdout = original_stdout
            sys.stderr = original_stderr
|
||||
|
||||
|
||||
def _run_with_logging(config: EvalConfig, results_run_dir: Path, run_id: str) -> None:
    """Validate prerequisites, run the evaluation, write artifacts and exit.

    Always terminates through ``sys.exit``: status 1 when the repo path, the
    golden cases file or the database is unavailable, or when any case
    failed; status 0 otherwise.

    Args:
        config: Evaluation settings.
        results_run_dir: Directory for this run's per-case artifacts.
        run_id: Identifier used in artifact and summary file names.
    """
    print(f"Console log: {results_run_dir / 'console_output.txt'}")

    # Both inputs must exist before any indexing work starts.
    required_inputs = (
        ("Repo path", config.repo_path),
        ("Golden cases", config.golden_cases_path),
    )
    for label, location in required_inputs:
        if not location.exists():
            print(f"{label} not found: {location}", file=sys.stderr)
            sys.exit(1)

    if not _check_db_available():
        print(
            "Database is not available. Evaluation requires a running PostgreSQL instance.\n"
            "Set DATABASE_URL (e.g. postgresql+psycopg://user:pass@localhost:5432/db) or start the DB (e.g. docker-compose up -d db).",
            file=sys.stderr,
        )
        sys.exit(1)

    print(f"Running evaluation: repo={config.repo_path}, fixture={config.use_fixture}")
    print(f"Results: {results_run_dir}")

    results = run_eval(config)
    for outcome in results:
        dump_run_artifact(outcome, results_run_dir, run_id=run_id)
    summary_path = write_batch_summary(results, config.test_results_dir, run_id=run_id)

    failures = [outcome for outcome in results if not outcome.passed]
    total = len(results)
    passed = total - len(failures)
    print(f"\n{passed}/{total} cases passed. Summary: {summary_path}")
    for outcome in failures:
        print(f"  FAIL {outcome.case.id}: {'; '.join(outcome.mismatches)}")
    sys.exit(1 if failures else 0)
|
||||
|
||||
|
||||
# Run the evaluation only when this module is executed as a script or with
# ``python -m``; importing it has no such effect.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,102 @@
|
||||
"""Run golden cases through CodeQAPipelineRunner and compare to expected."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from app.modules.rag.code_qa_pipeline import CodeQAPipelineResult, CodeQAPipelineRunner
|
||||
from app.modules.rag.contracts.enums import RagLayer
|
||||
from app.modules.rag.intent_router_v2 import ConversationState, IntentRouterV2, RepoContext
|
||||
|
||||
from tests.pipeline_setup.suite_01_synthetic.code_qa_eval.config import EvalConfig
|
||||
from tests.pipeline_setup.suite_01_synthetic.code_qa_eval.golden_loader import GoldenCase, load_golden_cases
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class EvalCaseResult:
|
||||
"""Result of evaluating one golden case."""
|
||||
|
||||
case: GoldenCase
|
||||
pipeline_result: CodeQAPipelineResult
|
||||
passed: bool
|
||||
mismatches: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
def _repo_context() -> RepoContext:
    """Return the fixed repo context used for every evaluation run.

    Declares Python as the only language, the CODE and DOCS domains, and the
    code and docs layers listed below as available.
    """
    code_layers = [
        RagLayer.CODE_ENTRYPOINTS,
        RagLayer.CODE_SYMBOL_CATALOG,
        RagLayer.CODE_DEPENDENCY_GRAPH,
        RagLayer.CODE_SEMANTIC_ROLES,
        RagLayer.CODE_SOURCE_CHUNKS,
    ]
    docs_layers = [
        RagLayer.DOCS_MODULE_CATALOG,
        RagLayer.DOCS_FACT_INDEX,
        RagLayer.DOCS_SECTION_INDEX,
        RagLayer.DOCS_POLICY_INDEX,
    ]
    return RepoContext(
        languages=["python"],
        available_domains=["CODE", "DOCS"],
        available_layers=code_layers + docs_layers,
    )
|
||||
|
||||
|
||||
def run_eval(config: EvalConfig) -> list[EvalCaseResult]:
    """Index the repo, run every golden case and compare against expectations.

    Args:
        config: Supplies the repo path, project id and golden cases path.

    Returns:
        One ``EvalCaseResult`` per loaded golden case, in file order.
    """
    # Imported lazily: these modules pull in the database stack.
    from app.modules.rag.persistence.repository import RagRepository
    from tests.pipeline_setup.suite_02_pipeline.pipeline_intent_rag.helpers.rag_db_adapter import RagDbAdapter, SessionEmbeddingDimensions
    from tests.pipeline_setup.utils.rag_indexer import RagSessionIndexer

    repository = RagRepository()
    repository.ensure_tables()
    session_id = RagSessionIndexer(repository).index_repo(config.repo_path, project_id=config.project_id)

    retrieval_adapter = RagDbAdapter(repository=repository, dim_resolver=SessionEmbeddingDimensions())
    pipeline_runner = CodeQAPipelineRunner(
        router=IntentRouterV2(),
        retrieval_adapter=retrieval_adapter,
        repo_context=_repo_context(),
    )

    evaluated: list[EvalCaseResult] = []
    for golden in load_golden_cases(config.golden_cases_path):
        outcome = pipeline_runner.run(golden.query, session_id, run_retrieval=True, run_hydrate=True)
        ok, problems = _compare(golden, outcome)
        evaluated.append(
            EvalCaseResult(case=golden, pipeline_result=outcome, passed=ok, mismatches=problems)
        )
    return evaluated
|
||||
|
||||
|
||||
def _compare(case: GoldenCase, result: CodeQAPipelineResult) -> tuple[bool, list[str]]:
    """Check a pipeline result against a golden case.

    Intent, sub-intent and answer mode are always compared. Path scope,
    symbol candidates and layers are checked only when the case lists
    expectations for them.

    Returns:
        ``(passed, mismatches)``; ``passed`` is True when ``mismatches`` is
        empty.
    """
    router = result.router_result
    plan = router.query_plan
    # A missing plan or missing sub-intent is compared as the empty string.
    actual_sub_intent = (plan.sub_intent if plan else None) or ""

    problems: list[str] = []
    if router.intent != case.expected_intent:
        problems.append(f"intent: expected {case.expected_intent}, got {router.intent}")
    if actual_sub_intent != case.expected_sub_intent:
        problems.append(f"sub_intent: expected {case.expected_sub_intent}, got {actual_sub_intent}")
    if result.answer_mode != case.expected_answer_mode:
        problems.append(f"answer_mode: expected {case.expected_answer_mode}, got {result.answer_mode}")

    if case.expected_path_scope_contains:
        path_scope = list(getattr(router.retrieval_spec.filters, "path_scope", []) or [])
        # Substring match: the wanted fragment may sit inside a longer path.
        problems.extend(
            f"path_scope should contain '{want}', got {path_scope}"
            for want in case.expected_path_scope_contains
            if not any(want in entry for entry in path_scope)
        )

    if case.expected_symbol_candidates_contain:
        candidates = list(plan.symbol_candidates or []) if plan else []
        problems.extend(
            f"symbol_candidates should contain '{want}', got {candidates}"
            for want in case.expected_symbol_candidates_contain
            if want not in candidates
        )

    if case.expected_layers:
        layers = [str(query.layer_id) for query in (router.retrieval_spec.layer_queries or [])]
        problems.extend(
            f"layers should include '{want}', got {layers}"
            for want in case.expected_layers
            if want not in layers
        )

    return not problems, problems
|
||||
@@ -0,0 +1,189 @@
|
||||
"""Tests for CODE_QA evaluation harness: golden loader, compare logic, fixture-mode run."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from app.modules.rag.code_qa_pipeline import CodeQAPipelineResult
|
||||
from app.modules.rag.intent_router_v2.models import (
|
||||
CodeRetrievalFilters,
|
||||
EvidencePolicy,
|
||||
IntentRouterResult,
|
||||
QueryPlan,
|
||||
RetrievalSpec,
|
||||
SymbolResolution,
|
||||
)
|
||||
|
||||
from tests.pipeline_setup.suite_01_synthetic.code_qa_eval.config import EvalConfig
|
||||
from tests.pipeline_setup.suite_01_synthetic.code_qa_eval.golden_loader import GoldenCase, load_golden_cases
|
||||
from tests.pipeline_setup.suite_01_synthetic.code_qa_eval.runner import EvalCaseResult, _compare, run_eval
|
||||
|
||||
# Directory two levels above this test module; passed as ``project_root`` to
# EvalConfig and used to locate the golden cases file in the tests below.
_TEST_ROOT = Path(__file__).resolve().parent.parent
|
||||
|
||||
|
||||
def test_load_golden_cases_returns_list() -> None:
    """The golden file loads into a non-empty list whose first case is well-formed."""
    golden_file = _TEST_ROOT / "golden" / "code_qa" / "cases.yaml"
    if not golden_file.exists():
        pytest.skip("Golden cases file not found")

    loaded = load_golden_cases(golden_file)
    assert isinstance(loaded, list)
    assert len(loaded) >= 1

    # Only the first case is inspected in detail.
    first = loaded[0]
    allowed_intents = ("CODE_QA", "DOCS_QA")
    allowed_sub_intents = ("OPEN_FILE", "EXPLAIN", "FIND_TESTS", "FIND_ENTRYPOINTS", "GENERAL_QA")
    allowed_answer_modes = ("normal", "degraded", "insufficient")
    assert first.id
    assert first.query
    assert first.expected_intent in allowed_intents
    assert first.expected_sub_intent in allowed_sub_intents
    assert first.expected_answer_mode in allowed_answer_modes
|
||||
|
||||
|
||||
def test_compare_passed_when_all_match() -> None:
    """Matching intent, sub-intent and answer mode produce a pass with no mismatches."""
    golden = GoldenCase(
        id="test",
        query="Open app/main.py",
        expected_intent="CODE_QA",
        expected_sub_intent="OPEN_FILE",
        expected_answer_mode="normal",
    )
    pipeline_result = _make_pipeline_result(
        intent="CODE_QA",
        sub_intent="OPEN_FILE",
        answer_mode="normal",
    )

    ok, problems = _compare(golden, pipeline_result)

    assert ok is True
    assert len(problems) == 0
|
||||
|
||||
|
||||
def test_compare_fails_on_intent_mismatch() -> None:
    """A different routed intent fails the case and is reported as a mismatch."""
    golden = GoldenCase(
        id="test",
        query="Open app/main.py",
        expected_intent="CODE_QA",
        expected_sub_intent="OPEN_FILE",
        expected_answer_mode="normal",
    )
    # Only the intent deviates from the golden expectation.
    pipeline_result = _make_pipeline_result(
        intent="DOCS_QA",
        sub_intent="OPEN_FILE",
        answer_mode="normal",
    )

    ok, problems = _compare(golden, pipeline_result)

    assert ok is False
    assert any("intent" in message for message in problems)
|
||||
|
||||
|
||||
def test_compare_fails_on_answer_mode_mismatch() -> None:
    """A different answer mode fails the case and is reported as a mismatch."""
    golden = GoldenCase(
        id="test",
        query="Explain NonExistent",
        expected_intent="CODE_QA",
        expected_sub_intent="EXPLAIN",
        expected_answer_mode="degraded",
    )
    # The pipeline answers in "normal" mode although "degraded" was expected.
    pipeline_result = _make_pipeline_result(
        intent="CODE_QA",
        sub_intent="EXPLAIN",
        answer_mode="normal",
    )

    ok, problems = _compare(golden, pipeline_result)

    assert ok is False
    assert any("answer_mode" in message for message in problems)
|
||||
|
||||
|
||||
def test_compare_path_scope_contains() -> None:
    """Path-scope expectations pass when present and fail when the scope is empty."""
    # Positive: the expected path is part of the routed path scope.
    golden_hit = GoldenCase(
        id="test",
        query="Open app/main.py",
        expected_intent="CODE_QA",
        expected_sub_intent="OPEN_FILE",
        expected_path_scope_contains=["app/main.py"],
    )
    result_hit = _make_pipeline_result(
        intent="CODE_QA",
        sub_intent="OPEN_FILE",
        path_scope=["app/main.py"],
    )
    ok_hit, _ = _compare(golden_hit, result_hit)
    assert ok_hit

    # Negative: an empty path scope cannot contain the expected path.
    golden_miss = GoldenCase(
        id="test2",
        query="Open other",
        expected_intent="CODE_QA",
        expected_sub_intent="OPEN_FILE",
        expected_path_scope_contains=["app/main.py"],
    )
    result_miss = _make_pipeline_result(intent="CODE_QA", sub_intent="OPEN_FILE", path_scope=[])
    ok_miss, problems_miss = _compare(golden_miss, result_miss)
    assert not ok_miss
    assert any("path_scope" in message for message in problems_miss)
|
||||
|
||||
|
||||
def test_eval_config_fixture_mode_by_default() -> None:
    """Without a repo override, the config points at the fixture repo under the root."""
    cfg = EvalConfig.from_env(project_root=_TEST_ROOT)

    assert cfg.use_fixture is True
    assert "code_qa_repo" in str(cfg.repo_path)

    # Every derived location hangs off the supplied project root.
    fixture_repo = _TEST_ROOT / "fixtures" / "code_qa_repo"
    golden_file = _TEST_ROOT / "golden" / "code_qa" / "cases.yaml"
    results_dir = _TEST_ROOT / "test_results" / "code_qa_eval"
    assert cfg.repo_path == fixture_repo
    assert cfg.golden_cases_path == golden_file
    assert cfg.test_results_dir == results_dir
|
||||
|
||||
|
||||
def test_run_eval_fixture_mode_structure() -> None:
    """Run full eval on fixture repo; validates harness path. Skips if DB/deps unavailable.

    Only the shape of the results is asserted here, not whether individual
    golden cases pass.
    """
    config = EvalConfig.from_env(project_root=_TEST_ROOT)
    if not config.repo_path.exists():
        pytest.skip("Fixture repo not found")
    if not config.golden_cases_path.exists():
        pytest.skip("Golden cases not found")

    try:
        results = run_eval(config)
    except ModuleNotFoundError as e:
        # Matched by type: the message ("No module named ...") never contains
        # the exception class name, so a substring test cannot detect it.
        pytest.skip(f"DB or dependencies not available: {e}")
    except Exception as e:
        msg = str(e).lower()
        unavailable_markers = ("connect", "database", "engine", "sqlalchemy")
        if any(marker in msg for marker in unavailable_markers):
            pytest.skip(f"DB or dependencies not available: {e}")
        raise

    assert isinstance(results, list)
    assert len(results) >= 1
    for r in results:
        assert isinstance(r, EvalCaseResult)
        assert r.case is not None
        assert r.pipeline_result is not None
        assert isinstance(r.passed, bool)
        assert isinstance(r.mismatches, list)
|
||||
|
||||
|
||||
def _make_pipeline_result(
    *,
    intent: str = "CODE_QA",
    sub_intent: str = "EXPLAIN",
    answer_mode: str = "normal",
    path_scope: list[str] | None = None,
) -> CodeQAPipelineResult:
    """Build a minimal pipeline result for exercising ``_compare``.

    Only the intent, sub-intent, answer mode and path scope vary; everything
    else is a fixed placeholder. The evidence gate is marked as passed
    exactly when ``answer_mode`` is ``"normal"``, and no diagnostics report
    is attached.
    """
    from app.modules.rag.code_qa_pipeline.contracts import (
        EvidenceBundle,
        RetrievalRequest,
        RetrievalResult,
    )

    routed = IntentRouterResult(
        intent=intent,
        graph_id="CodeQAGraph",
        retrieval_profile="code",
        conversation_mode="START",
        query_plan=QueryPlan(raw="", normalized="", sub_intent=sub_intent),
        retrieval_spec=RetrievalSpec(filters=CodeRetrievalFilters(path_scope=path_scope or [])),
        symbol_resolution=SymbolResolution(),
        evidence_policy=EvidencePolicy(),
    )
    request = RetrievalRequest(
        rag_session_id="",
        query="",
        sub_intent=sub_intent,
        path_scope=path_scope or [],
    )
    retrieved = RetrievalResult()
    evidence = EvidenceBundle(resolved_sub_intent=sub_intent, evidence_count=1)

    return CodeQAPipelineResult(
        user_query="",
        rag_session_id="",
        router_result=routed,
        retrieval_request=request,
        retrieval_result=retrieved,
        evidence_bundle=evidence,
        evidence_gate_passed=(answer_mode == "normal"),
        answer_synthesis_input=None,
        diagnostics_report=None,
        answer_mode=answer_mode,
        timings_ms={},
    )
|
||||
Reference in New Issue
Block a user