ййй
This commit is contained in:
@@ -15,18 +15,9 @@ class ArtifactWriter:
|
||||
|
||||
def write_case(self, result: V3CaseResult) -> None:
|
||||
stem = f"{result.case.source_file.stem}_{result.case.case_id}"
|
||||
payload = {
|
||||
"case_id": result.case.case_id,
|
||||
"source_file": result.case.source_file.as_posix(),
|
||||
"runner": result.case.runner,
|
||||
"mode": result.case.mode,
|
||||
"query": result.case.query,
|
||||
"actual": result.actual,
|
||||
"passed": result.passed,
|
||||
"mismatches": result.mismatches,
|
||||
"details": result.details,
|
||||
}
|
||||
payload = self._json_payload(result)
|
||||
(self.run_dir / f"{stem}.json").write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
diagnostics = self._diagnostics_without_prompt(result.details)
|
||||
lines = [
|
||||
f"# {result.case.case_id}",
|
||||
"",
|
||||
@@ -41,17 +32,161 @@ class ArtifactWriter:
|
||||
"## Actual",
|
||||
json.dumps(result.actual, ensure_ascii=False, indent=2),
|
||||
"",
|
||||
"## Steps",
|
||||
json.dumps(result.details.get("steps") or [], ensure_ascii=False, indent=2),
|
||||
"## Pipeline Steps",
|
||||
*self._md_steps(result),
|
||||
"",
|
||||
"## Diagnostics",
|
||||
json.dumps(result.details.get("diagnostics") or {}, ensure_ascii=False, indent=2),
|
||||
json.dumps(diagnostics, ensure_ascii=False, indent=2),
|
||||
"",
|
||||
*self._llm_request_section(result.details),
|
||||
"",
|
||||
"## Mismatches",
|
||||
*([f"- {item}" for item in result.mismatches] or ["- none"]),
|
||||
]
|
||||
(self.run_dir / f"{stem}.md").write_text("\n".join(lines), encoding="utf-8")
|
||||
|
||||
def _json_payload(self, result: V3CaseResult) -> dict:
|
||||
return {
|
||||
"meta": {
|
||||
"case_id": result.case.case_id,
|
||||
"source_file": result.case.source_file.as_posix(),
|
||||
"runner": result.case.runner,
|
||||
"mode": result.case.mode,
|
||||
"passed": result.passed,
|
||||
"mismatches": result.mismatches,
|
||||
"actual": result.actual,
|
||||
},
|
||||
"pipeline_steps": list(result.details.get("pipeline_steps") or []),
|
||||
}
|
||||
|
||||
def _md_steps(self, result: V3CaseResult) -> list[str]:
|
||||
steps = list(result.details.get("pipeline_steps") or [])
|
||||
if not steps:
|
||||
return ["- none"]
|
||||
lines: list[str] = []
|
||||
for item in steps:
|
||||
step = str(item.get("step") or "").strip() or "unknown"
|
||||
status = str(item.get("status") or "").strip()
|
||||
lines.append(f"### {step}")
|
||||
if status:
|
||||
lines.append(f"- status: {status}")
|
||||
timings = item.get("timings_ms") or {}
|
||||
if timings:
|
||||
lines.append(f"- timings_ms: {json.dumps(timings, ensure_ascii=False)}")
|
||||
lines.append("```json")
|
||||
lines.append(json.dumps({"input": item.get("input") or {}, "output": item.get("output") or {}}, ensure_ascii=False, indent=2))
|
||||
lines.append("```")
|
||||
lines.append("")
|
||||
return lines[:-1] if lines and not lines[-1] else lines
|
||||
|
||||
def _diagnostics_without_prompt(self, details: dict) -> dict:
|
||||
diagnostics = dict(details.get("diagnostics") or {})
|
||||
diagnostics.pop("prompt", None)
|
||||
return diagnostics
|
||||
|
||||
def _llm_request_section(self, details: dict) -> list[str]:
|
||||
llm_request = dict(details.get("llm_request") or {})
|
||||
if not llm_request:
|
||||
llm_request = dict((details.get("diagnostics") or {}).get("prompt") or {})
|
||||
if not llm_request:
|
||||
return []
|
||||
lines = [
|
||||
"## LLM Request",
|
||||
f"- prompt_name: {llm_request.get('prompt_name') or '—'}",
|
||||
f"- log_context: {llm_request.get('log_context') or '—'}",
|
||||
]
|
||||
prompt_stats = dict(llm_request.get("prompt_stats") or {})
|
||||
if prompt_stats:
|
||||
lines.extend(
|
||||
[
|
||||
"",
|
||||
"### Prompt Stats",
|
||||
"```json",
|
||||
json.dumps(prompt_stats, ensure_ascii=False, indent=2),
|
||||
"```",
|
||||
]
|
||||
)
|
||||
system_prompt = str(llm_request.get("system_prompt") or "").strip()
|
||||
user_prompt = str(llm_request.get("user_prompt") or "").strip()
|
||||
lines.extend(
|
||||
[
|
||||
"",
|
||||
"### System Prompt",
|
||||
"```text",
|
||||
system_prompt or "—",
|
||||
"```",
|
||||
"",
|
||||
"### User Prompt",
|
||||
*self._render_user_prompt(user_prompt),
|
||||
]
|
||||
)
|
||||
return lines
|
||||
|
||||
def _render_user_prompt(self, user_prompt: str) -> list[str]:
|
||||
payload = self._parse_json(user_prompt)
|
||||
if payload is None:
|
||||
return ["```text", user_prompt or "—", "```"]
|
||||
lines = ["```json", json.dumps(payload, ensure_ascii=False, indent=2), "```"]
|
||||
lines.extend(self._prompt_overview(payload))
|
||||
return lines
|
||||
|
||||
def _prompt_overview(self, payload: dict) -> list[str]:
|
||||
lines = ["", "### User Prompt Overview"]
|
||||
for key in ("question", "intent", "sub_intent"):
|
||||
value = payload.get(key)
|
||||
if value is not None:
|
||||
lines.append(f"- {key}: {value}")
|
||||
lines.extend(self._prompt_collection_line("documents", payload.get("documents")))
|
||||
lines.extend(self._prompt_collection_line("facts", payload.get("facts")))
|
||||
lines.extend(self._prompt_collection_line("relations", payload.get("relations")))
|
||||
api_contract = payload.get("api_contract")
|
||||
if isinstance(api_contract, dict):
|
||||
lines.append("- api_contract:")
|
||||
lines.extend(self._api_contract_lines(api_contract))
|
||||
return lines
|
||||
|
||||
def _prompt_collection_line(self, name: str, value) -> list[str]:
|
||||
items = value if isinstance(value, list) else []
|
||||
if not items:
|
||||
return [f"- {name}: 0"]
|
||||
samples: list[str] = []
|
||||
for item in items[:3]:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
sample = str(
|
||||
item.get("title")
|
||||
or item.get("content")
|
||||
or item.get("path")
|
||||
or item.get("doc_id")
|
||||
or item.get("id")
|
||||
or ""
|
||||
).strip()
|
||||
if sample:
|
||||
samples.append(" ".join(sample.split()))
|
||||
suffix = f" | samples: {', '.join(samples)}" if samples else ""
|
||||
return [f"- {name}: {len(items)}{suffix}"]
|
||||
|
||||
def _api_contract_lines(self, api_contract: dict) -> list[str]:
|
||||
lines: list[str] = []
|
||||
path = str(api_contract.get("path") or "").strip() or "—"
|
||||
method = str(api_contract.get("method") or "").strip() or "—"
|
||||
request_schema = api_contract.get("request_schema")
|
||||
response_schema = api_contract.get("response_schema")
|
||||
lines.append(f" path: {path}")
|
||||
lines.append(f" method: {method}")
|
||||
lines.append(f" has_request_schema: {bool(request_schema)}")
|
||||
lines.append(f" has_response_schema: {bool(response_schema)}")
|
||||
return lines
|
||||
|
||||
def _parse_json(self, text: str) -> dict | None:
|
||||
if not text.strip():
|
||||
return None
|
||||
try:
|
||||
payload = json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
return payload if isinstance(payload, dict) else None
|
||||
|
||||
def write_summary(self, results: list[V3CaseResult]) -> Path:
|
||||
path = self.run_dir / "summary.md"
|
||||
path.write_text(SummaryComposer().compose(results), encoding="utf-8")
|
||||
|
||||
@@ -94,15 +94,21 @@ class CaseDirectoryLoader:
|
||||
else None
|
||||
),
|
||||
path_scope_contains=tuple(str(item) for item in retrieval.get("path_scope_contains") or []),
|
||||
doc_scope_contains=tuple(str(item) for item in retrieval.get("doc_scope_contains") or []),
|
||||
symbol_candidates_contain=tuple(str(item) for item in retrieval.get("symbol_candidates_contain") or []),
|
||||
entity_candidates_contain=tuple(str(item) for item in retrieval.get("entity_candidates_contain") or []),
|
||||
layers_include=tuple(str(item) for item in retrieval.get("layers_include") or []),
|
||||
filters_contain={str(k): str(v) for k, v in dict(retrieval.get("filters_contain") or {}).items()},
|
||||
),
|
||||
llm=LlmExpectation(
|
||||
non_empty=llm.get("non_empty"),
|
||||
contains_all=tuple(str(item) for item in llm.get("contains_all") or []),
|
||||
excludes=tuple(str(item) for item in llm.get("excludes") or []),
|
||||
answer_mode=str(llm.get("answer_mode") or "").strip() or None,
|
||||
),
|
||||
pipeline=PipelineExpectation(
|
||||
answer_mode=str(pipeline.get("answer_mode") or llm.get("answer_mode") or "").strip() or None
|
||||
),
|
||||
pipeline=PipelineExpectation(answer_mode=str(pipeline.get("answer_mode") or "").strip() or None),
|
||||
)
|
||||
|
||||
def _normalize_runner(self, value: str) -> str:
|
||||
|
||||
@@ -30,8 +30,11 @@ class RetrievalExpectation:
|
||||
min_rows: int | None = None
|
||||
direct_symbol_test_hits_max: int | None = None
|
||||
path_scope_contains: tuple[str, ...] = ()
|
||||
doc_scope_contains: tuple[str, ...] = ()
|
||||
symbol_candidates_contain: tuple[str, ...] = ()
|
||||
entity_candidates_contain: tuple[str, ...] = ()
|
||||
layers_include: tuple[str, ...] = ()
|
||||
filters_contain: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass(slots=True, frozen=True)
|
||||
@@ -39,6 +42,7 @@ class LlmExpectation:
|
||||
non_empty: bool | None = None
|
||||
contains_all: tuple[str, ...] = ()
|
||||
excludes: tuple[str, ...] = ()
|
||||
answer_mode: str | None = None
|
||||
|
||||
|
||||
@dataclass(slots=True, frozen=True)
|
||||
|
||||
@@ -11,8 +11,18 @@ from tests.pipeline_setup_v3.core.validators import CaseValidator
|
||||
|
||||
|
||||
class V3Runner:
|
||||
def __init__(self, cases_dir: Path, results_dir: Path, run_name: str) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
cases_dir: Path,
|
||||
results_dir: Path,
|
||||
run_name: str,
|
||||
*,
|
||||
pipeline_mode: str = "full",
|
||||
router_llm_mode: str = "deterministic",
|
||||
) -> None:
|
||||
self._cases_dir = cases_dir
|
||||
self._pipeline_mode = pipeline_mode
|
||||
self._router_llm_mode = router_llm_mode
|
||||
self._validator = CaseValidator()
|
||||
self._sessions = RagSessionProvider()
|
||||
self._agent_runtime = None
|
||||
@@ -48,5 +58,8 @@ class V3Runner:
|
||||
if self._agent_runtime is None:
|
||||
from tests.pipeline_setup_v3.runtime.agent_runtime_adapter import AgentRuntimeAdapter
|
||||
|
||||
self._agent_runtime = AgentRuntimeAdapter()
|
||||
self._agent_runtime = AgentRuntimeAdapter(
|
||||
pipeline_mode=self._pipeline_mode,
|
||||
router_llm_mode=self._router_llm_mode,
|
||||
)
|
||||
return self._agent_runtime
|
||||
|
||||
@@ -25,7 +25,7 @@ class RagSessionProvider:
|
||||
|
||||
def _build_indexer(self):
|
||||
from app.modules.rag.persistence.repository import RagRepository
|
||||
from tests.pipeline_setup.utils.rag_indexer import RagSessionIndexer
|
||||
from tests.pipeline_setup_v3.shared.rag_indexer import RagSessionIndexer
|
||||
|
||||
if self._repository is None:
|
||||
self._repository = RagRepository()
|
||||
|
||||
@@ -39,14 +39,26 @@ class CaseValidator:
|
||||
for path in retrieval.path_scope_contains:
|
||||
if not any(path in item for item in actual.get("path_scope") or ()):
|
||||
mismatches.append(f"path_scope should contain '{path}', got {list(actual.get('path_scope') or ())}")
|
||||
for path in retrieval.doc_scope_contains:
|
||||
if not any(path in item for item in actual.get("doc_scope") or ()):
|
||||
mismatches.append(f"doc_scope should contain '{path}', got {list(actual.get('doc_scope') or ())}")
|
||||
for symbol in retrieval.symbol_candidates_contain:
|
||||
if symbol not in (actual.get("symbol_candidates") or ()):
|
||||
mismatches.append(f"symbol_candidates should contain '{symbol}', got {list(actual.get('symbol_candidates') or ())}")
|
||||
for entity in retrieval.entity_candidates_contain:
|
||||
if entity not in (actual.get("entity_candidates") or ()):
|
||||
mismatches.append(f"entity_candidates should contain '{entity}', got {list(actual.get('entity_candidates') or ())}")
|
||||
for layer in retrieval.layers_include:
|
||||
if layer not in (actual.get("layers") or ()):
|
||||
mismatches.append(f"layers should include '{layer}', got {list(actual.get('layers') or ())}")
|
||||
for key, value in retrieval.filters_contain.items():
|
||||
actual_filters = dict(actual.get("filters") or {})
|
||||
if str(actual_filters.get(key) or "") != value:
|
||||
mismatches.append(f"filters[{key}] should be '{value}', got '{actual_filters.get(key)}'")
|
||||
|
||||
def _check_llm(self, case: V3Case, actual: dict, mismatches: list[str]) -> None:
|
||||
if str(actual.get("pipeline_mode") or "") == "pre_llm_only":
|
||||
return
|
||||
llm = case.expectations.llm
|
||||
answer = str(actual.get("llm_answer") or "").strip()
|
||||
if llm.non_empty and not answer:
|
||||
|
||||
Reference in New Issue
Block a user