фиксирую состояние

2026-04-07 21:41:27 +03:00
parent 7387e5cc51
commit f62fb678b8
52 changed files with 4073 additions and 316 deletions
@@ -0,0 +1,121 @@
+"""Run full `process v2` flow in the v4 harness.
+
+This module adapts the existing v3 `V2ProcessAdapter` so pipeline_setup_v4 can
+execute the real route -> retrieval -> evidence -> workflow LLM chain without
+duplicating runtime logic.
+"""
+
+from __future__ import annotations
+
+from tests.pipeline_setup_v3.core.models import CaseExpectations, CaseInput, V3Case
+from tests.pipeline_setup_v3.runtime.v2_process_adapter import V2ProcessAdapter
+from tests.pipeline_setup_v4.core.models import ExecutionPayload, V4Case
+
+
+class ProcessV2FullChainExecutor:
+    def __init__(self) -> None:
+        self._adapter = V2ProcessAdapter(workflow_llm_enabled=True)
+
+    def execute(self, case: V4Case) -> ExecutionPayload:
+        if not case.rag_session_id:
+            raise ValueError(f"Case '{case.case_id}' requires rag_session_id")
+        payload = self._adapter.execute(self._build_case(case), case.rag_session_id)
+        route = dict(payload.details.get("router_result") or {})
+        retrieval_plan = dict(payload.details.get("retrieval_plan") or {})
+        rows = list(payload.details.get("rows") or [])
+        rag_summary = _summarize_rows(rows)
+        pipeline_steps = list(payload.details.get("pipeline_steps") or [])
+        pipeline_summary = {
+            "answer_mode": str(payload.actual.get("answer_mode") or ""),
+            "workflow_llm_enabled": True,
+            "step_count": len(pipeline_steps),
+            "steps": [str(step.get("step") or "") for step in pipeline_steps if str(step.get("step") or "").strip()],
+        }
+        answer = str(payload.details.get("answer") or payload.actual.get("llm_answer") or "")
+        actual = {
+            "domain": payload.actual.get("domain"),
+            "intent": payload.actual.get("intent"),
+            "sub_intent": payload.actual.get("sub_intent"),
+            "profile": retrieval_plan.get("profile"),
+            "layers": list(retrieval_plan.get("layers") or []),
+            "limit": retrieval_plan.get("limit"),
+            "filters": dict(retrieval_plan.get("filters") or {}),
+            "answer_mode": payload.actual.get("answer_mode"),
+            "route": {
+                "routing_domain": route.get("routing_domain"),
+                "intent": route.get("intent"),
+                "subintent": route.get("subintent"),
+                "target_terms": list(route.get("target_terms") or []),
+                "anchors": dict(route.get("anchors") or {}),
+            },
+            "retrieval_plan": {
+                "profile": retrieval_plan.get("profile"),
+                "layers": list(retrieval_plan.get("layers") or []),
+                "limit": retrieval_plan.get("limit"),
+                "filters": dict(retrieval_plan.get("filters") or {}),
+            },
+            "rag": rag_summary,
+            "pipeline": pipeline_summary,
+            "llm": {
+                "answer": answer,
+                "non_empty": bool(answer.strip()),
+                "length": len(answer),
+            },
+        }
+        details = {
+            "query": case.query,
+            "rag_session_id": case.rag_session_id,
+            "route": route,
+            "retrieval_plan": actual["retrieval_plan"],
+            "rag": {
+                **rag_summary,
+                "rows": rows[:20],
+            },
+            "pipeline": pipeline_summary,
+            "answer": answer,
+            "pipeline_steps": pipeline_steps,
+            "logs": list(payload.details.get("logs") or []),
+            "evidence": dict(payload.details.get("evidence") or {}),
+        }
+        return ExecutionPayload(actual=actual, details=details)
+
+    def _build_case(self, case: V4Case) -> V3Case:
+        return V3Case(
+            case_id=case.case_id,
+            runner="process_v2",
+            mode="full_chain",
+            query=case.query,
+            source_file=case.source_file,
+            input=CaseInput(rag_session_id=case.rag_session_id),
+            expectations=CaseExpectations(),
+            notes=case.notes,
+            tags=case.tags,
+        )
+
+
+def _summarize_rows(rows: list[dict]) -> dict[str, object]:
+    paths: list[str] = []
+    layers: list[str] = []
+    metadata_domains: list[str] = []
+    metadata_subdomains: list[str] = []
+    for row in rows:
+        path = str(row.get("path") or "").strip()
+        layer = str(row.get("layer") or "").strip()
+        metadata = dict(row.get("metadata") or {})
+        domain = str(metadata.get("domain") or "").strip()
+        subdomain = str(metadata.get("subdomain") or "").strip()
+        if path and path not in paths:
+            paths.append(path)
+        if layer and layer not in layers:
+            layers.append(layer)
+        if domain and domain not in metadata_domains:
+            metadata_domains.append(domain)
+        if subdomain and subdomain not in metadata_subdomains:
+            metadata_subdomains.append(subdomain)
+    return {
+        "row_count": len(rows),
+        "paths": paths,
+        "layers": layers,
+        "metadata_domains": metadata_domains,
+        "metadata_subdomains": metadata_subdomains,
+    }
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from dataclasses import asdict
+
+from app.core.agent.processes.v2.models import V2RouteAnchors, V2RouteResult
+from app.core.agent.processes.v2.retrieval.policy_resolver import V2RetrievalPolicyResolver
+from tests.pipeline_setup_v4.core.models import ExecutionPayload, V4Case
+
+
+class ProcessV2RetrievalPolicyExecutor:
+    def __init__(self) -> None:
+        self._resolver = V2RetrievalPolicyResolver()
+
+    def execute(self, case: V4Case) -> ExecutionPayload:
+        route = self._build_route(case.route)
+        plan = self._resolver.resolve(route)
+        actual = {
+            "profile": plan.profile,
+            "layers": list(plan.layers),
+            "limit": plan.limit,
+            "filters": dict(plan.filters),
+        }
+        details = {
+            "route": asdict(route),
+            "plan": actual,
+        }
+        return ExecutionPayload(actual=actual, details=details)
+
+    def _build_route(self, raw: dict[str, object]) -> V2RouteResult:
+        anchors_raw = dict(raw.get("anchors") or {})
+        return V2RouteResult(
+            routing_domain=str(raw.get("routing_domain") or ""),
+            intent=str(raw.get("intent") or ""),
+            subintent=str(raw.get("subintent") or ""),
+            user_query=str(raw.get("user_query") or raw.get("normalized_query") or raw.get("name") or "resolver case"),
+            normalized_query=str(raw.get("normalized_query") or raw.get("user_query") or "resolver case"),
+            target_terms=[str(item) for item in raw.get("target_terms") or [] if str(item).strip()],
+            anchors=V2RouteAnchors(
+                entity_names=[str(item) for item in anchors_raw.get("entity_names") or [] if str(item).strip()],
+                file_names=[str(item) for item in anchors_raw.get("file_names") or [] if str(item).strip()],
+                endpoint_paths=[str(item) for item in anchors_raw.get("endpoint_paths") or [] if str(item).strip()],
+                target_doc_hints=[str(item) for item in anchors_raw.get("target_doc_hints") or [] if str(item).strip()],
+                matched_aliases=[str(item) for item in anchors_raw.get("matched_aliases") or [] if str(item).strip()],
+                process_domain=str(anchors_raw.get("process_domain") or "").strip() or None,
+                process_subdomain=str(anchors_raw.get("process_subdomain") or "").strip() or None,
+            ),
+            confidence=float(raw.get("confidence") or 1.0),
+            routing_mode=str(raw.get("routing_mode") or "test_fixture"),
+            llm_router_used=bool(raw.get("llm_router_used") or False),
+            reason_short=str(raw.get("reason_short") or "fixture route"),
+        )
@@ -22,13 +22,23 @@ class _KeywordLlm:
        "где находится",
        "найди файл",
        "найди файлы",
+        "show doc",
+        "show file",
+        "doc for",
+        "file with",
    )
    _DOC_MARKERS = (
        "документац",
        "endpoint",
        "эндпоинт",
        "архитект",
+        "architecture",
+        "overview архитектуры",
+        "arch overview",
        "процесс",
+        "process",
+        "flow",
+        "workflow",
        "сущност",
        "worker",
        "цикл отправки уведомлений",
@@ -43,6 +53,10 @@ class _KeywordLlm:
        "/health",
        "/send",
        "/actions/{action}",
+        "billing invoice process",
+        "billing invoice flow",
+        "billing invoice docs",
+        "notify app",
    )
    _GENERAL_MARKERS = (
        "что это за сервис",
@@ -67,7 +81,7 @@ class _KeywordLlm:
        return json.dumps(route, ensure_ascii=False)

    def _select(self, query: str) -> dict[str, object]:
-        if any(marker in query for marker in self._FILE_MARKERS) or ("дока" in query and "покажи" in query):
+        if any(marker in query for marker in self._FILE_MARKERS) or ("дока" in query and "покажи" in query) or ".md" in query:
            return self._route("DOCS", "DOC_EXPLAIN", "FIND_FILES", "file lookup")
        if any(marker in query for marker in self._GENERAL_MARKERS):
            return self._route("GENERAL", "GENERAL_QA", "SUMMARY", "general overview")
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from dataclasses import asdict
+
+from app.core.agent.processes.v2 import V2IntentRouter
+from app.core.agent.processes.v2.retrieval.policy_resolver import V2RetrievalPolicyResolver
+from tests.pipeline_setup_v4.core.models import ExecutionPayload, V4Case
+from tests.pipeline_setup_v4.executors.process_v2_router_executor import _KeywordLlm
+
+
+class ProcessV2RouterPlusPolicyExecutor:
+    def __init__(self) -> None:
+        self._router = V2IntentRouter(llm=_KeywordLlm(), enable_llm_disambiguation=True)
+        self._resolver = V2RetrievalPolicyResolver()
+
+    def execute(self, case: V4Case) -> ExecutionPayload:
+        route = self._router.route(case.query)
+        plan = self._resolver.resolve(route)
+        route_dump = asdict(route)
+        actual = {
+            "domain": route.routing_domain,
+            "intent": route.intent,
+            "sub_intent": route.subintent,
+            "routing_mode": route.routing_mode,
+            "llm_router_used": route.llm_router_used,
+            "confidence": route.confidence,
+            "profile": plan.profile,
+            "layers": list(plan.layers),
+            "limit": plan.limit,
+            "filters": dict(plan.filters),
+            "route": {
+                "routing_domain": route.routing_domain,
+                "intent": route.intent,
+                "subintent": route.subintent,
+                "target_terms": list(route.target_terms),
+                "anchors": route_dump.get("anchors") or {},
+            },
+            "retrieval_plan": {
+                "profile": plan.profile,
+                "layers": list(plan.layers),
+                "limit": plan.limit,
+                "filters": dict(plan.filters),
+            },
+        }
+        details = {
+            "query": case.query,
+            "route": route_dump,
+            "plan": {
+                "profile": plan.profile,
+                "layers": list(plan.layers),
+                "limit": plan.limit,
+                "filters": dict(plan.filters),
+            },
+            "pipeline_steps": [
+                {
+                    "step": "intent_router",
+                    "input": {"query": case.query},
+                    "output": {
+                        "domain": route.routing_domain,
+                        "intent": route.intent,
+                        "sub_intent": route.subintent,
+                        "reason_short": route.reason_short,
+                        "target_terms": list(route.target_terms),
+                        "anchors": route_dump.get("anchors") or {},
+                    },
+                },
+                {
+                    "step": "retrieval_policy_resolver",
+                    "input": {"route": route_dump},
+                    "output": {
+                        "profile": plan.profile,
+                        "layers": list(plan.layers),
+                        "limit": plan.limit,
+                        "filters": dict(plan.filters),
+                    },
+                },
+            ],
+        }
+        return ExecutionPayload(actual=actual, details=details)
@@ -0,0 +1,94 @@
+from __future__ import annotations
+
+import asyncio
+from dataclasses import asdict
+
+from app.core.agent.processes.v2 import V2IntentRouter
+from app.core.agent.processes.v2.retrieval.policy_resolver import V2RetrievalPolicyResolver
+from app.core.agent.processes.v2.retrieval.v2_rag_adapter import V2RagRetrievalAdapter
+from app.core.rag.persistence.repository import RagRepository
+from app.core.rag.retrieval.session_retriever import RagSessionRetriever
+from tests.pipeline_setup_v3.shared.rag_indexer import DeterministicEmbedder
+from tests.pipeline_setup_v4.core.models import ExecutionPayload, V4Case
+from tests.pipeline_setup_v4.executors.process_v2_router_executor import _KeywordLlm
+
+
+class ProcessV2RouterPlusPolicyRagExecutor:
+    def __init__(self) -> None:
+        self._router = V2IntentRouter(llm=_KeywordLlm(), enable_llm_disambiguation=True)
+        self._resolver = V2RetrievalPolicyResolver()
+        self._adapter = V2RagRetrievalAdapter(RagSessionRetriever(RagRepository(), DeterministicEmbedder()))
+
+    def execute(self, case: V4Case) -> ExecutionPayload:
+        if not case.rag_session_id:
+            raise ValueError(f"Case '{case.case_id}' requires rag_session_id")
+        return asyncio.run(self._execute_async(case))
+
+    async def _execute_async(self, case: V4Case) -> ExecutionPayload:
+        route = self._router.route(case.query)
+        plan = self._resolver.resolve(route)
+        rows = await self._adapter.fetch_rows(case.rag_session_id or "", route.normalized_query, plan)
+        route_dump = asdict(route)
+        rag_summary = _summarize_rows(rows)
+        actual = {
+            "domain": route.routing_domain,
+            "intent": route.intent,
+            "sub_intent": route.subintent,
+            "profile": plan.profile,
+            "layers": list(plan.layers),
+            "limit": plan.limit,
+            "filters": dict(plan.filters),
+            "route": {
+                "routing_domain": route.routing_domain,
+                "intent": route.intent,
+                "subintent": route.subintent,
+                "target_terms": list(route.target_terms),
+                "anchors": route_dump.get("anchors") or {},
+            },
+            "retrieval_plan": {
+                "profile": plan.profile,
+                "layers": list(plan.layers),
+                "limit": plan.limit,
+                "filters": dict(plan.filters),
+            },
+            "rag": rag_summary,
+        }
+        details = {
+            "query": case.query,
+            "rag_session_id": case.rag_session_id,
+            "route": route_dump,
+            "plan": actual["retrieval_plan"],
+            "rag": {
+                **rag_summary,
+                "rows": rows[:20],
+            },
+        }
+        return ExecutionPayload(actual=actual, details=details)
+
+
+def _summarize_rows(rows: list[dict]) -> dict[str, object]:
+    paths: list[str] = []
+    layers: list[str] = []
+    metadata_domains: list[str] = []
+    metadata_subdomains: list[str] = []
+    for row in rows:
+        path = str(row.get("path") or "").strip()
+        layer = str(row.get("layer") or "").strip()
+        metadata = dict(row.get("metadata") or {})
+        domain = str(metadata.get("domain") or "").strip()
+        subdomain = str(metadata.get("subdomain") or "").strip()
+        if path and path not in paths:
+            paths.append(path)
+        if layer and layer not in layers:
+            layers.append(layer)
+        if domain and domain not in metadata_domains:
+            metadata_domains.append(domain)
+        if subdomain and subdomain not in metadata_subdomains:
+            metadata_subdomains.append(subdomain)
+    return {
+        "row_count": len(rows),
+        "paths": paths,
+        "layers": layers,
+        "metadata_domains": metadata_domains,
+        "metadata_subdomains": metadata_subdomains,
+    }
@@ -1,18 +1,56 @@
 from __future__ import annotations

+from tests.pipeline_setup_v4.executors.process_v2_full_chain_executor import ProcessV2FullChainExecutor
+from tests.pipeline_setup_v4.executors.process_v2_retrieval_policy_executor import ProcessV2RetrievalPolicyExecutor
+from tests.pipeline_setup_v4.executors.process_v2_router_plus_policy_executor import ProcessV2RouterPlusPolicyExecutor
+from tests.pipeline_setup_v4.executors.process_v2_router_plus_policy_rag_executor import (
+    ProcessV2RouterPlusPolicyRagExecutor,
+)
 from tests.pipeline_setup_v4.executors.process_v2_router_executor import ProcessV2IntentRouterExecutor


 class ExecutorRegistry:
    def __init__(self) -> None:
        self._router_executor: ProcessV2IntentRouterExecutor | None = None
+        self._policy_executor: ProcessV2RetrievalPolicyExecutor | None = None
+        self._router_plus_policy_executor: ProcessV2RouterPlusPolicyExecutor | None = None
+        self._router_plus_policy_rag_executor: ProcessV2RouterPlusPolicyRagExecutor | None = None
+        self._full_chain_executor: ProcessV2FullChainExecutor | None = None

    def execute(self, component: str, case) -> object:
        if component == "process_v2_intent_router":
            return self._router().execute(case)
+        if component == "process_v2_retrieval_policy_resolver":
+            return self._policy().execute(case)
+        if component == "process_v2_router_plus_retrieval_policy":
+            return self._router_plus_policy().execute(case)
+        if component == "process_v2_router_plus_retrieval_policy_rag":
+            return self._router_plus_policy_rag().execute(case)
+        if component == "process_v2_full_chain":
+            return self._full_chain().execute(case)
        raise ValueError(f"Unsupported component: {component}")

    def _router(self) -> ProcessV2IntentRouterExecutor:
        if self._router_executor is None:
            self._router_executor = ProcessV2IntentRouterExecutor()
        return self._router_executor
+
+    def _policy(self) -> ProcessV2RetrievalPolicyExecutor:
+        if self._policy_executor is None:
+            self._policy_executor = ProcessV2RetrievalPolicyExecutor()
+        return self._policy_executor
+
+    def _router_plus_policy(self) -> ProcessV2RouterPlusPolicyExecutor:
+        if self._router_plus_policy_executor is None:
+            self._router_plus_policy_executor = ProcessV2RouterPlusPolicyExecutor()
+        return self._router_plus_policy_executor
+
+    def _router_plus_policy_rag(self) -> ProcessV2RouterPlusPolicyRagExecutor:
+        if self._router_plus_policy_rag_executor is None:
+            self._router_plus_policy_rag_executor = ProcessV2RouterPlusPolicyRagExecutor()
+        return self._router_plus_policy_rag_executor
+
+    def _full_chain(self) -> ProcessV2FullChainExecutor:
+        if self._full_chain_executor is None:
+            self._full_chain_executor = ProcessV2FullChainExecutor()
+        return self._full_chain_executor