ввв

2026-04-09 15:41:07 +03:00
parent f62fb678b8
commit 2352f91cd3
192 changed files with 6983 additions and 996 deletions
@@ -4,43 +4,42 @@ import asyncio
 from dataclasses import asdict, dataclass
 from pathlib import Path

-from app.core.agent.processes.v2.anchor_signals import route_anchor_summary
-from app.core.agent.processes.v2 import V2IntentRouter
-from app.core.agent.processes.v2.evidence.assembler import DocsEvidenceAssembler
-from app.core.agent.processes.v2.evidence.gate import DocsEvidenceGate
-from app.core.agent.processes.v2.models import RetrievedFile, RetrievedSummary, V2Intent, V2Subintent
-from app.core.agent.processes.v2.retrieval import DocsMetadataLookupIndex
-from app.core.agent.processes.v2.retrieval.policy_resolver import V2RetrievalPolicyResolver
-from app.core.agent.processes.v2.retrieval.v2_rag_adapter import V2RagRetrievalAdapter
-from app.core.agent.processes.v2.workflows.docs_explain_find_files.context import DocsExplainFindFilesContext
-from app.core.agent.processes.v2.workflows.docs_explain_find_files.graph import DocsExplainFindFilesGraph
-from app.core.agent.processes.v2.workflows.docs_explain_summary.context import DocsExplainSummaryContext
-from app.core.agent.processes.v2.workflows.docs_explain_summary.graph import DocsExplainSummaryGraph
-from app.core.agent.processes.v2.workflows.general_summary.context import GeneralSummaryContext
-from app.core.agent.processes.v2.workflows.general_summary.graph import GeneralSummaryGraph
+from app.core.agent.processes.v2 import V2IntentRouter, V2Process
 from app.core.agent.utils.llm import AgentLlmService, PromptLoader
-from app.core.rag.embedding.gigachat_embedder import GigaChatEmbedder
 from app.core.rag.persistence import RagRepository
 from app.core.rag.retrieval.session_retriever import RagSessionRetriever
 from app.core.shared.gigachat.client import GigaChatClient
 from app.core.shared.gigachat.settings import GigaChatSettings
 from app.core.shared.gigachat.token_provider import GigaChatTokenProvider
 from app.infra.observability.module_trace import RequestTraceContext
+from app.core.agent.utils.process_v2.anchor_signals import route_anchor_summary
+from app.core.agent.utils.process_v2.evidence.assembler import DocsEvidenceAssembler
+from app.core.agent.utils.process_v2.evidence.gate import DocsEvidenceGate
+from app.core.agent.utils.process_v2.models import V2Intent
+from app.core.agent.utils.process_v2.plan_resolver import V2RetrievalPolicyResolver
+from app.core.agent.utils.process_v2.rag_retrieval import DocsMetadataLookupIndex, V2RagRetrievalAdapter
 from tests.pipeline_setup_v3.core.models import ExecutionPayload, V3Case
+from tests.pipeline_setup_v3.shared.rag_indexer import DeterministicEmbedder
+from tests.pipeline_setup_v4.executors.process_v2_router_executor import _KeywordLlm


 class V2ProcessAdapter:
    def __init__(self, *, workflow_llm_enabled: bool = True) -> None:
        self._workflow_llm_enabled = workflow_llm_enabled
-        self._router = V2IntentRouter(llm=_build_v2_llm())
+        self._llm = _build_v2_llm()
+        self._router = V2IntentRouter(llm=_KeywordLlm(), enable_llm_disambiguation=True)
        self._policy = V2RetrievalPolicyResolver()
-        retriever = RagSessionRetriever(repository=RagRepository(), embedder=GigaChatEmbedder(_build_client()))
+        retriever = RagSessionRetriever(repository=RagRepository(), embedder=DeterministicEmbedder())
        self._retrieval = V2RagRetrievalAdapter(retriever)
-        self._evidence = DocsEvidenceAssembler()
-        self._gate = DocsEvidenceGate()
-        self._summary_graph = DocsExplainSummaryGraph(_build_v2_llm())
-        self._find_files_graph = DocsExplainFindFilesGraph()
-        self._general_graph = GeneralSummaryGraph(_build_v2_llm())
+        self._process = V2Process(
+            llm=self._llm,
+            policy_resolver=self._policy,
+            rag_adapter=self._retrieval,
+            evidence_assembler=DocsEvidenceAssembler(),
+            evidence_gate=DocsEvidenceGate(),
+            router=self._router,
+            workflow_llm_enabled=workflow_llm_enabled,
+        )

    def execute(self, case: V3Case, rag_session_id: str | None) -> ExecutionPayload:
        return asyncio.run(self._execute_async(case, rag_session_id))
@@ -81,6 +80,8 @@ class V2ProcessAdapter:
                actual=_actual_from_v2(route),
                details=_details(case.query, route=route, pipeline_steps=_build_pipeline_steps(runtime.logs)),
            )
+        if case.mode == "full_chain":
+            return await self._execute_full_chain(case, rag_session_id, route)
        plan = self._policy.resolve(route)
        _log_pipeline_step(
            runtime,
@@ -121,26 +122,7 @@ class V2ProcessAdapter:
                actual=_actual_from_v2(route, rows=rows, plan=plan, answer_mode="partial"),
                details=_details(case.query, route=route, plan=plan, rows=rows, pipeline_steps=_build_pipeline_steps(runtime.logs)),
            )
-        answer, evidence, gate = await self._run_workflow(runtime, route, rag_session_id, rows)
-        answer_mode = gate.answer_mode
-        _log_pipeline_step(
-            runtime,
-            "answer_generated",
-            {"answer_mode": answer_mode, "answer_length": len(answer)},
-        )
-        return ExecutionPayload(
-            actual=_actual_from_v2(route, rows=rows, plan=plan, answer=answer, answer_mode=answer_mode),
-            details=_details(
-                case.query,
-                route=route,
-                plan=plan,
-                rows=rows,
-                evidence=evidence,
-                answer=answer,
-                logs=runtime.logs,
-                pipeline_steps=_build_pipeline_steps(runtime.logs),
-            ),
-        )
+        raise ValueError(f"Unsupported process_v2 adapter mode: {case.mode}")

    async def _retrieve_rows(self, route, rag_session_id: str | None, plan) -> list[dict]:
        if not rag_session_id:
@@ -173,125 +155,54 @@ class V2ProcessAdapter:
                merged.append(row)
        return merged

-    async def _run_workflow(
-        self,
-        runtime: "_RuntimeStub",
-        route,
-        rag_session_id: str | None,
-        rows: list[dict],
-    ) -> tuple[str, dict, object]:
-        if route.intent == V2Intent.GENERAL_QA:
-            documents = self._evidence.assemble_summaries(rows, route)
-            gate = self._gate.check_summaries(route, documents)
-            _log_pipeline_step(
-                runtime,
-                "evidence_assembled",
-                {"mode": "summary", "primary_doc": documents[0].path if documents else None, "document_count": len(documents)},
-            )
-            self._log_ranking(runtime, documents)
-            _log_pipeline_step(
-                runtime,
-                "evidence_gate_checked",
-                {"passed": gate.passed, "reason": gate.reason, "answer_mode": gate.answer_mode},
-            )
-            context = GeneralSummaryContext(runtime=runtime, route=route, prompt_name="v2_general.summary_answer")
-            context.workflow_llm_enabled = self._workflow_llm_enabled
-            context.documents = documents
-            context.gate_decision = gate
-            final = await self._general_graph.run(context)
-            return final.answer, {"documents": [_serialize_summary(item) for item in documents], "files": []}, gate
-        if route.subintent == V2Subintent.FIND_FILES:
-            files = self._evidence.assemble_files(rows, route)
-            gate = self._gate.check_files(route, files)
-            _log_pipeline_step(
-                runtime,
-                "evidence_assembled",
-                {"mode": "find_files", "primary_file": files[0].path if files else None, "file_count": len(files)},
-            )
-            self._log_ranking(runtime, files)
-            _log_pipeline_step(
-                runtime,
-                "evidence_gate_checked",
-                {"passed": gate.passed, "reason": gate.reason, "answer_mode": gate.answer_mode},
-            )
-            context = DocsExplainFindFilesContext(
-                runtime=runtime,
-                route=route,
-                rag_session_id=rag_session_id or "",
-                files=files,
-                gate_decision=gate,
-            )
-            final = await self._find_files_graph.run(context)
-            return final.answer, {"documents": [], "files": [_serialize_file(item) for item in files]}, gate
-        documents = self._evidence.assemble_summaries(rows, route)
-        gate = self._gate.check_summaries(route, documents)
-        _log_pipeline_step(
-            runtime,
-            "evidence_assembled",
-            {"mode": "summary", "primary_doc": documents[0].path if documents else None, "document_count": len(documents)},
-        )
-        self._log_ranking(runtime, documents)
-        _log_pipeline_step(
-            runtime,
-            "evidence_gate_checked",
-            {"passed": gate.passed, "reason": gate.reason, "answer_mode": gate.answer_mode},
-        )
-        context = DocsExplainSummaryContext(
-            runtime=runtime,
-            route=route,
-            rag_session_id=rag_session_id or "",
-            prompt_name="v2_docs_explain.summary_answer",
-            workflow_llm_enabled=self._workflow_llm_enabled,
-            documents=documents,
-            gate_decision=gate,
-        )
-        final = await self._summary_graph.run(context)
-        return final.answer, {"documents": [_serialize_summary(item) for item in documents], "files": []}, gate
-
-    def _trace_row(self, row: dict) -> dict[str, object]:
-        metadata = dict(row.get("metadata") or {})
-        return {
-            "path": str(row.get("path") or ""),
-            "layer": str(row.get("layer") or ""),
-            "title": str(row.get("title") or ""),
-            "document_id": str(metadata.get("document_id") or metadata.get("doc_id") or ""),
-        }
-
-    def _log_ranking(self, runtime: "_RuntimeStub", items: list) -> None:
-        top_docs: list[dict[str, object]] = []
-        for item in items[:4]:
-            top_docs.append(
-                {
-                    "doc": getattr(item, "path", ""),
-                    "score": getattr(item, "score", 0),
-                    "match_reason": getattr(item, "match_reason", ""),
-                }
-            )
-            _log_pipeline_step(
-                runtime,
-                "ranking_explained",
-                {
-                    "doc": getattr(item, "path", ""),
-                    "score": getattr(item, "score", 0),
-                    "score_breakdown": getattr(item, "score_breakdown", {}),
-                    "match_reason": getattr(item, "match_reason", ""),
-                },
-            )
-        _log_pipeline_step(
-            runtime,
-            "ranking_explained",
-            {
-                "top_docs_after_ranking": top_docs,
-                "ranking_score_breakdown": [
-                    {
-                        "doc": getattr(item, "path", ""),
-                        "score_breakdown": getattr(item, "score_breakdown", {}),
-                    }
-                    for item in items[:4]
-                ],
+    async def _execute_full_chain(self, case: V3Case, rag_session_id: str | None, route) -> ExecutionPayload:
+        """Run ``self._process`` end to end for one case and package the outcome.
+
+        Args:
+            case: Case whose ``query`` is sent through the process.
+            rag_session_id: RAG session id handed to the runtime stub; may be ``None``.
+            route: Route computed by the caller; used here only to report
+                domain/intent/sub-intent and to fill ``router_result``.
+
+        Returns:
+            An ``ExecutionPayload`` whose retrieval plan, rows and answer mode are
+            read back from ``runtime.logs`` after the run.
+        """
+        # A fresh runtime stub starts with an empty ``logs`` list, so the lookups below see only this run.
+        runtime = _RuntimeStub(query=case.query, rag_session_id=rag_session_id)
+        result = await self._process.run(runtime)
+        # ``_event_payload`` yields ``{}`` when no matching event exists, so every lookup degrades to an empty value.
+        retrieval_plan = _event_payload(runtime.logs, "process.v2.retrieval_policy", "retrieval_plan_resolved")
+        rows = list(_event_payload(runtime.logs, "process.v2.rag_retrieval", "rag_rows_fetched").get("rows") or [])
+        answer_generated = _event_payload(runtime.logs, "process.v2.pipeline", "answer_generated")
+        # NOTE(review): ``route`` is the caller's routing result; whether ``self._process.run`` routes again
+        # internally (and could disagree with it) is not visible from here - confirm.
+        return ExecutionPayload(
+            actual={
+                "domain": route.routing_domain,
+                "intent": route.intent,
+                "sub_intent": route.subintent,
+                "rag_count": len(rows),
+                "llm_answer": result.answer,
+                "answer_mode": str(answer_generated.get("answer_mode") or ""),
+                # Path scope and symbol candidates are always reported empty in full-chain mode.
+                "path_scope": tuple(),
+                "symbol_candidates": tuple(),
+                "entity_candidates": tuple(_entity_candidates(rows)),
+                "doc_scope": tuple(_doc_scope(rows)),
+                "layers": tuple(retrieval_plan.get("layers") or []),
+                "filters": dict(retrieval_plan.get("filters") or {}),
+            },
+            details={
+                "query": case.query,
+                "router_result": asdict(route),
+                "retrieval_plan": retrieval_plan,
+                "rows": rows,
+                "answer": result.answer,
+                "logs": runtime.logs,
+                "pipeline_steps": _build_pipeline_steps(runtime.logs),
            },
        )

+    def _trace_row(self, row: dict) -> dict[str, object]:
+        """Flatten one retrieval row into a string-only record for tracing.
+
+        Missing or falsy values become ``""``. ``summary_text`` and the stripped
+        ``content`` preview are cut to 400 characters. The document id is taken
+        from ``metadata.document_id``, then ``metadata.doc_id``, then the row's
+        own ``document_id``.
+        """
+        meta = row.get("metadata") or {}
+
+        def text(value: object) -> str:
+            # Collapse None and other falsy values to an empty string.
+            return str(value or "")
+
+        preview = text(row.get("content")).strip()[:400]
+        document_id = meta.get("document_id") or meta.get("doc_id") or row.get("document_id")
+
+        trace: dict[str, object] = {key: text(row.get(key)) for key in ("layer", "path", "title")}
+        trace["document_id"] = text(document_id)
+        trace["entity_name"] = text(meta.get("entity_name"))
+        trace["summary_text"] = text(meta.get("summary_text"))[:400]
+        trace["section_path"] = text(meta.get("section_path"))
+        trace["metadata_domain"] = text(meta.get("domain"))
+        trace["metadata_subdomain"] = text(meta.get("subdomain"))
+        trace["content_preview"] = preview
+        return trace
+

@dataclass(slots=True)
 class _RequestStub:
@@ -320,10 +231,10 @@ class _TraceLoggerStub:


 class _RuntimeStub:
-    def __init__(self, *, query: str) -> None:
+    def __init__(self, *, query: str, rag_session_id: str | None = None) -> None:
        self.logs: list[dict] = []
        self.request = _RequestStub(request_id="pipeline_setup_v3", message=query)
-        self.session = _SessionStub()
+        self.session = _SessionStub(active_rag_session_id=rag_session_id)
        self.publisher = _PublisherStub()
        self.trace = RequestTraceContext(request_id=self.request.request_id, logger=_TraceLoggerStub(self.logs))

@@ -335,8 +246,10 @@ def _build_client() -> GigaChatClient:

 def _build_v2_llm() -> AgentLlmService:
+    """Build the ``AgentLlmService`` used by this adapter.
+
+    The service wraps the client from ``_build_client()`` and a ``PromptLoader``
+    over three prompt YAML files resolved relative to ``parents[3]`` of this file:
+    the doc-explain summary prompts, the general-QA summary prompts and the
+    intent-router prompts.
+    """
    prompt_paths = [
-        Path(__file__).resolve().parents[3] / "src/app/core/agent/processes/v2/prompts.yml",
-        Path(__file__).resolve().parents[3] / "src/app/core/agent/processes/v2/general_prompts.yml",
+        Path(__file__).resolve().parents[3]
+        / "src/app/core/agent/processes/v2/workflows/doc_explain_summary/steps/prompts/prompts.yml",
+        Path(__file__).resolve().parents[3]
+        / "src/app/core/agent/processes/v2/workflows/general_qa_summary/steps/prompts/prompts.yml",
        Path(__file__).resolve().parents[3] / "src/app/core/agent/processes/v2/intent_router/routers/prompts.yml",
    ]
    return AgentLlmService(client=_build_client(), prompts=PromptLoader(prompt_paths))
@@ -375,7 +288,12 @@ def _doc_scope(rows: list[dict]) -> list[str]:
    values: list[str] = []
    for row in rows:
        metadata = dict(row.get("metadata") or {})
-        for candidate in (metadata.get("document_id"), metadata.get("doc_id"), row.get("path")):
+        for candidate in (
+            row.get("document_id"),
+            metadata.get("document_id"),
+            metadata.get("doc_id"),
+            row.get("path"),
+        ):
            value = str(candidate or "").strip()
            if value and value not in values:
                values.append(value)
@@ -386,20 +304,12 @@ def _entity_candidates(rows: list[dict]) -> list[str]:
    values: list[str] = []
    for row in rows:
        metadata = dict(row.get("metadata") or {})
-        value = str(metadata.get("entity_name") or row.get("title") or "").strip()
+        value = str(row.get("entity_name") or metadata.get("entity_name") or row.get("title") or "").strip()
        if value and value not in values and str(row.get("layer") or "") == "D3_ENTITY_CATALOG":
            values.append(value)
    return values


-def _serialize_summary(item: RetrievedSummary) -> dict:
-    return asdict(item)
-
-
-def _serialize_file(item: RetrievedFile) -> dict:
-    return asdict(item)
-
-
 def _build_pipeline_steps(logs: list[dict]) -> list[dict]:
    steps: list[dict] = []
    for item in logs:
@@ -409,6 +319,16 @@ def _build_pipeline_steps(logs: list[dict]) -> list[dict]:
    return steps


+def _event_payload(logs: list[dict], module: str, event: str) -> dict[str, object]:
+    """Return a shallow copy of the payload of the first log entry matching ``module`` and ``event``.
+
+    Only the first matching entry is considered. An empty dict is returned when
+    nothing matches, or when that entry's payload is missing or not a dict.
+    """
+    first_match = next(
+        (entry for entry in logs if entry.get("module") == module and entry.get("event") == event),
+        None,
+    )
+    if first_match is None:
+        return {}
+    body = first_match.get("payload") or {}
+    return dict(body) if isinstance(body, dict) else {}
+
+
 def _log_pipeline_step(runtime: _RuntimeStub, step: str, payload: dict[str, object]) -> None:
    runtime.logs.append(
        {
@@ -14,7 +14,7 @@ from tests.pipeline_setup_v4.core.models import ExecutionPayload, V4Case

 class ProcessV2FullChainExecutor:
    def __init__(self) -> None:
-        self._adapter = V2ProcessAdapter(workflow_llm_enabled=True)
+        self._adapter = V2ProcessAdapter(workflow_llm_enabled=False)

    def execute(self, case: V4Case) -> ExecutionPayload:
        if not case.rag_session_id:
@@ -27,7 +27,7 @@ class ProcessV2FullChainExecutor:
        pipeline_steps = list(payload.details.get("pipeline_steps") or [])
        pipeline_summary = {
            "answer_mode": str(payload.actual.get("answer_mode") or ""),
-            "workflow_llm_enabled": True,
+            "workflow_llm_enabled": False,
            "step_count": len(pipeline_steps),
            "steps": [str(step.get("step") or "") for step in pipeline_steps if str(step.get("step") or "").strip()],
        }
@@ -102,8 +102,8 @@ def _summarize_rows(rows: list[dict]) -> dict[str, object]:
        path = str(row.get("path") or "").strip()
        layer = str(row.get("layer") or "").strip()
        metadata = dict(row.get("metadata") or {})
-        domain = str(metadata.get("domain") or "").strip()
-        subdomain = str(metadata.get("subdomain") or "").strip()
+        domain = str(row.get("metadata_domain") or metadata.get("domain") or "").strip()
+        subdomain = str(row.get("metadata_subdomain") or metadata.get("subdomain") or "").strip()
        if path and path not in paths:
            paths.append(path)
        if layer and layer not in layers:
@@ -2,8 +2,8 @@ from __future__ import annotations

 from dataclasses import asdict

-from app.core.agent.processes.v2.models import V2RouteAnchors, V2RouteResult
-from app.core.agent.processes.v2.retrieval.policy_resolver import V2RetrievalPolicyResolver
+from app.core.agent.utils.process_v2.models import V2RouteAnchors, V2RouteResult
+from app.core.agent.utils.process_v2.plan_resolver import V2RetrievalPolicyResolver
 from tests.pipeline_setup_v4.core.models import ExecutionPayload, V4Case


@@ -3,7 +3,7 @@ from __future__ import annotations
 from dataclasses import asdict

 from app.core.agent.processes.v2 import V2IntentRouter
-from app.core.agent.processes.v2.retrieval.policy_resolver import V2RetrievalPolicyResolver
+from app.core.agent.utils.process_v2.plan_resolver import V2RetrievalPolicyResolver
 from tests.pipeline_setup_v4.core.models import ExecutionPayload, V4Case
 from tests.pipeline_setup_v4.executors.process_v2_router_executor import _KeywordLlm

@@ -4,10 +4,10 @@ import asyncio
 from dataclasses import asdict

 from app.core.agent.processes.v2 import V2IntentRouter
-from app.core.agent.processes.v2.retrieval.policy_resolver import V2RetrievalPolicyResolver
-from app.core.agent.processes.v2.retrieval.v2_rag_adapter import V2RagRetrievalAdapter
 from app.core.rag.persistence.repository import RagRepository
 from app.core.rag.retrieval.session_retriever import RagSessionRetriever
+from app.core.agent.utils.process_v2.plan_resolver import V2RetrievalPolicyResolver
+from app.core.agent.utils.process_v2.rag_retrieval import V2RagRetrievalAdapter
 from tests.pipeline_setup_v3.shared.rag_indexer import DeterministicEmbedder
 from tests.pipeline_setup_v4.core.models import ExecutionPayload, V4Case
 from tests.pipeline_setup_v4.executors.process_v2_router_executor import _KeywordLlm
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+from app.core.agent.processes.v2.workflows.doc_explain_api_exposed.steps.retrieval.api_endpoint_collector import (
+    ApiEndpointCollector,
+)
+
+
+def test_collector_returns_method_and_path_lines() -> None:
+    """A pipe-separated method list in ``endpoint`` metadata yields one entry per method."""
+    control_actions_row = {
+        "metadata": {
+            "endpoint": "GET|POST /actions/{action}",
+            "summary_text": "Endpoint for runtime control actions",
+        },
+        "title": "HTTP API /actions/{action}",
+        "path": "docs/api/control-actions-endpoint.md",
+        "content": "",
+    }
+    health_row = {
+        "metadata": {
+            "endpoint": "GET /health",
+        },
+        "title": "HTTP API /health",
+        "path": "docs/api/health-endpoint.md",
+        "content": "",
+    }
+
+    collected = ApiEndpointCollector().collect([control_actions_row, health_row])
+
+    for expected_line in ("GET /actions/{action}", "POST /actions/{action}", "GET /health"):
+        assert expected_line in collected
+
+
+def test_collector_ignores_file_paths_from_content() -> None:
+    """Source-file paths mentioned in summary text or content are not reported as endpoints."""
+    health_row = {
+        "metadata": {
+            "endpoint": "GET /health",
+            "summary_text": "Uses src/telegram_notify_app/control_api.py",
+        },
+        "title": "Health endpoint",
+        "path": "docs/api/health-endpoint.md",
+        "content": "See /telegram_notify_app/control_api.py and /telegram_notify_app/worker.py",
+    }
+
+    collected = ApiEndpointCollector().collect([health_row])
+
+    assert collected == ["GET /health"]
+
+
+def test_collector_uses_title_path_fallback_when_endpoint_metadata_missing() -> None:
+    """Without ``endpoint`` metadata, the path in the title is reported as a GET endpoint."""
+    row_without_endpoint = {
+        "metadata": {
+            "summary_text": "Control actions endpoint",
+        },
+        "title": "HTTP API /actions/{action}",
+        "path": "docs/api/control-actions-endpoint.md",
+        "content": "",
+    }
+
+    collected = ApiEndpointCollector().collect([row_without_endpoint])
+
+    assert collected == ["GET /actions/{action}"]
@@ -1,7 +1,7 @@
 from __future__ import annotations

-from app.core.agent.processes.v2.evidence.assembler import DocsEvidenceAssembler
-from app.core.agent.processes.v2.models import V2Domain, V2Intent, V2RouteAnchors, V2RouteResult, V2Subintent
+from app.core.agent.utils.process_v2.evidence.assembler import DocsEvidenceAssembler
+from app.core.agent.utils.process_v2.models import V2Domain, V2Intent, V2RouteAnchors, V2RouteResult, V2Subintent


 def _route(*, hints: list[str], terms: list[str], subintent: str = V2Subintent.SUMMARY) -> V2RouteResult:
@@ -4,11 +4,11 @@ import asyncio
 from dataclasses import dataclass

 from app.core.agent.processes.v2 import V2IntentRouter, V2Process
-from app.core.agent.processes.v2.retrieval.target_doc_seeding import normalize_doc_path
-from app.core.agent.processes.v2.evidence.assembler import DocsEvidenceAssembler
-from app.core.agent.processes.v2.evidence.gate import DocsEvidenceGate
-from app.core.agent.processes.v2.retrieval.policy_resolver import V2RetrievalPolicyResolver
 from app.core.agent.runtime.execution_context import RuntimeExecutionContext
+from app.core.agent.utils.process_v2.evidence.assembler import DocsEvidenceAssembler
+from app.core.agent.utils.process_v2.evidence.gate import DocsEvidenceGate
+from app.core.agent.utils.process_v2.plan_resolver import V2RetrievalPolicyResolver
+from app.core.agent.utils.process_v2.rag_retrieval.target_doc_seeding import normalize_doc_path
 from app.core.api.domain.models.agent_request import AgentRequest
 from app.core.api.domain.models.agent_session import AgentSession
 from app.schemas.orchestration import RequestExecutionStatus
@@ -2,8 +2,8 @@ from __future__ import annotations

 import asyncio

-from app.core.agent.processes.v2.retrieval.v2_rag_adapter import V2RagRetrievalAdapter
 from app.core.rag.retrieval.session_retriever import RetrievalPlan
+from app.core.agent.utils.process_v2.rag_retrieval import V2RagRetrievalAdapter


 class FakeRetriever:
@@ -50,6 +50,24 @@ class FakeRetriever:
        self.calls.append(("substring", list(path_needles)))
        return []

+    async def list_docs_scope_rows(self, _rag_session_id: str, *, limit: int = 8000) -> list[dict]:
+        """Record the call and return two canned D1 ``api_method`` catalog rows; ``limit`` is ignored."""
+        del limit
+        self.calls.append(("scope_rows", None))
+
+        def api_row(path: str, title: str, endpoint: str) -> dict:
+            # Every canned row shares the same layer and metadata type.
+            return {
+                "path": path,
+                "layer": "D1_DOCUMENT_CATALOG",
+                "title": title,
+                "metadata": {"type": "api_method", "endpoint": endpoint},
+            }
+
+        return [
+            api_row("docs/api/health-endpoint.md", "HTTP API /health", "GET /health"),
+            api_row("docs/api/send-message-endpoint.md", "HTTP API /send", "GET /send"),
+        ]
+

 def test_v2_rag_adapter_seeds_exact_rows_from_plan_hints() -> None:
    adapter = V2RagRetrievalAdapter(FakeRetriever())
@@ -79,3 +97,21 @@ def test_v2_rag_adapter_uses_substring_fallback_for_missing_hint() -> None:
    asyncio.run(adapter.fetch_rows("rag-1", "find file", plan))

    assert ("substring", ["missing-health-endpoint.md"]) in retriever.calls
+
+
+def test_v2_rag_adapter_applies_query_signal_filter_for_api_exposed() -> None:
+    """With ``query_signals=["health"]`` only the health endpoint row is returned."""
+    signal_filters = {
+        "metadata.type": "api_method",
+        "query_signals": ["health"],
+    }
+    plan = RetrievalPlan(
+        profile="api_exposed",
+        layers=["D1_DOCUMENT_CATALOG"],
+        limit=50,
+        filters=signal_filters,
+    )
+    adapter = V2RagRetrievalAdapter(FakeRetriever())
+
+    fetched = asyncio.run(adapter.fetch_rows("rag-1", "health endpoints", plan))
+
+    assert len(fetched) == 1
+    assert fetched[0]["path"] == "docs/api/health-endpoint.md"
@@ -1,7 +1,7 @@
 from __future__ import annotations

-from app.core.agent.processes.v2.models import V2Domain, V2Intent, V2RouteAnchors, V2RouteResult, V2Subintent
-from app.core.agent.processes.v2.retrieval.policy_resolver import V2RetrievalPolicyResolver
+from app.core.agent.utils.process_v2.models import V2Domain, V2Intent, V2RouteAnchors, V2RouteResult, V2Subintent
+from app.core.agent.utils.process_v2.plan_resolver import V2RetrievalPolicyResolver


 def _route(
@@ -0,0 +1,138 @@
+"""Tests for pre-LLM scope grounding from D1/D3 catalog rows (no extra RAG layer)."""
+
+from __future__ import annotations
+
+import json
+
+from app.core.agent.processes.v2 import V2IntentRouter
+from app.core.agent.utils.process_v2.models import V2ScopeType
+
+
+class FakeLlm:
+    """LLM stand-in whose ``generate`` always returns one preconfigured string."""
+
+    def __init__(self, response: str) -> None:
+        # Canned text handed back by every ``generate`` call.
+        self.response = response
+
+    def generate(self, prompt_name: str, user_input: str, **_kwargs) -> str:
+        """Return the canned response; the prompt name, input and extra kwargs are ignored."""
+        del user_input, prompt_name
+        canned = self.response
+        return canned
+
+
+def _llm_ok() -> str:
+    """Return the JSON text of a confident DOCS / DOC_EXPLAIN / SUMMARY routing answer."""
+    routing_answer = {
+        "routing_domain": "DOCS",
+        "intent": "DOC_EXPLAIN",
+        "subintent": "SUMMARY",
+        "confidence": 0.9,
+        "reason_short": "ok",
+    }
+    return json.dumps(routing_answer, ensure_ascii=False)
+
+
+def _fixture_rows() -> list[dict]:
+    """Return five catalog rows: four in the ``billing`` domain and one in ``widgets``."""
+
+    def catalog_row(layer: str, path: str, title: str, metadata: dict) -> dict:
+        # Every fixture row has empty content; only catalog metadata varies.
+        return {"layer": layer, "path": path, "title": title, "content": "", "metadata": metadata}
+
+    documents = "D1_DOCUMENT_CATALOG"
+    entities = "D3_ENTITY_CATALOG"
+    return [
+        catalog_row(
+            documents,
+            "docs/billing/overview.md",
+            "Billing",
+            {"domain": "billing", "summary_text": "Billing domain overview"},
+        ),
+        catalog_row(
+            documents,
+            "docs/billing/invoices.md",
+            "Invoices",
+            {"domain": "billing", "subdomain": "invoice", "tags": ["invoice", "invoices"]},
+        ),
+        catalog_row(
+            entities,
+            "docs/domains/order.md",
+            "Order",
+            {"entity_name": "Order", "domain": "billing"},
+        ),
+        catalog_row(
+            documents,
+            "docs/api/invoices_post.md",
+            "POST /api/v1/invoices",
+            {
+                "doc_type": "api_method",
+                "domain": "billing",
+                "endpoint": "/api/v1/invoices",
+            },
+        ),
+        catalog_row(
+            documents,
+            "docs/widgets/readme.md",
+            "Widgets",
+            {"domain": "widgets", "summary_text": "Unrelated domain for negative tests"},
+        ),
+    ]
+
+
+def _router() -> V2IntentRouter:
+    """Build a router backed by the canned LLM answer and the fixture catalog rows."""
+    canned_llm = FakeLlm(_llm_ok())
+
+    def rows_for_session(_sid):
+        # The same fixture rows are served for every session id.
+        return _fixture_rows()
+
+    return V2IntentRouter(llm=canned_llm, scope_rows_provider=rows_for_session)
+
+
+def test_scope_global_project_wide_enumeration() -> None:
+    """A project-wide question about available API methods resolves to GLOBAL scope."""
+    routed = _router().route("какие api методы есть в проекте", rag_session_id="sess-1")
+
+    assert routed.scope_type == V2ScopeType.GLOBAL
+
+
+def test_scope_domain_billing() -> None:
+    """Naming the ``billing`` domain yields DOMAIN scope with billing as anchor and candidate."""
+    routed = _router().route("какие api есть в billing", rag_session_id="sess-1")
+
+    assert routed.scope_type == V2ScopeType.DOMAIN
+    assert routed.anchors.process_domain == "billing"
+    assert any(candidate.value == "billing" for candidate in routed.anchors.candidate_domains)
+
+
+def test_scope_subdomain_billing_invoices() -> None:
+    """Naming billing plus invoices yields SUBDOMAIN scope anchored at billing / invoice."""
+    routed = _router().route("какие api есть в billing invoices", rag_session_id="sess-1")
+
+    assert routed.scope_type == V2ScopeType.SUBDOMAIN
+    assert routed.anchors.process_domain == "billing"
+    assert routed.anchors.process_subdomain == "invoice"
+
+
+def test_scope_entity_order_doc() -> None:
+    """Asking for the ``Order`` doc yields ENTITY scope with ``order`` among the entity names."""
+    routed = _router().route("дай доку по Order", rag_session_id="sess-1")
+
+    assert routed.scope_type == V2ScopeType.ENTITY
+    assert any(name.lower() == "order" for name in routed.anchors.entity_names)
+
+
+def test_scope_entity_endpoint_path() -> None:
+    """A query quoting an endpoint path yields ENTITY scope and records that path as an anchor."""
+    routed = _router().route("где описан POST /api/v1/invoices", rag_session_id="sess-1")
+
+    assert routed.scope_type == V2ScopeType.ENTITY
+    assert "/api/v1/invoices" in routed.anchors.endpoint_paths
+
+
+def test_scope_vague_no_false_domain() -> None:
+    """A gibberish query stays UNKNOWN and does not pick up a domain anchor."""
+    routed = _router().route("что там с фывырапфыв", rag_session_id="sess-1")
+
+    assert routed.scope_type == V2ScopeType.UNKNOWN
+    assert routed.anchors.process_domain is None
+
+
+def test_scope_russian_payments_phrase_matches_tag() -> None:
+    """A Russian query matching a Russian tag grounds to billing or to a matching entity candidate."""
+    catalog_rows = _fixture_rows()
+    catalog_rows.append(
+        {
+            "layer": "D1_DOCUMENT_CATALOG",
+            "path": "docs/billing/payments_ru.md",
+            "title": "Платежи",
+            "content": "",
+            "metadata": {"domain": "billing", "tags": ["платежи"]},
+        }
+    )
+    router = V2IntentRouter(llm=FakeLlm(_llm_ok()), scope_rows_provider=lambda _sid: catalog_rows)
+
+    routed = router.route("какие методы есть в платежи", rag_session_id="sess-1")
+
+    assert routed.scope_type in {V2ScopeType.DOMAIN, V2ScopeType.ENTITY, V2ScopeType.SUBDOMAIN}
+    assert routed.anchors.process_domain == "billing" or any(
+        "платеж" in candidate.value for candidate in routed.anchors.candidate_entities
+    )
+
+
+def test_router_without_session_skips_db_and_keeps_target_terms() -> None:
+    """With no rows provider and no session id, scope is UNKNOWN but target terms are still extracted."""
+    router = V2IntentRouter(llm=FakeLlm(_llm_ok()))
+
+    routed = router.route("Покажи где описан RuntimeHealth и /health")
+
+    assert routed.scope_type == V2ScopeType.UNKNOWN
+    assert "runtimehealth" in routed.target_terms
@@ -0,0 +1,83 @@
+from __future__ import annotations
+
+import hashlib
+
+from app.core.api.application.filesystem_snapshot_resolver import FilesystemSnapshotResolver
+
+
+def test_augment_adds_untracked_files_from_filesystem(tmp_path) -> None:
+    root = tmp_path / "repo"
+    root.mkdir()
+    docs = root / "docs" / "api"
+    docs.mkdir(parents=True)
+    control_path = docs / "control-actions-endpoint.md"
+    control_content = "---\ndoc_type: api_method\n---\nGET|POST /actions/{action}\n"
+    control_path.write_text(control_content, encoding="utf-8")
+
+    incoming = [
+        {
+            "path": "docs/api/health-endpoint.md",
+            "content": "health",
+            "content_hash": hashlib.sha256(b"health").hexdigest(),
+        }
+    ]
+
+    out = FilesystemSnapshotResolver().augment(project_id=str(root), files=incoming)
+    paths = {item["path"] for item in out}
+    assert "docs/api/health-endpoint.md" in paths
+    assert "docs/api/control-actions-endpoint.md" in paths
+
+
+def test_augment_prefers_request_payload_for_existing_path(tmp_path) -> None:
+    root = tmp_path / "repo"
+    root.mkdir()
+    docs = root / "docs" / "api"
+    docs.mkdir(parents=True)
+    file_path = docs / "health-endpoint.md"
+    file_path.write_text("from-disk", encoding="utf-8")
+
+    incoming_content = "from-request"
+    incoming = [
+        {
+            "path": "docs/api/health-endpoint.md",
+            "content": incoming_content,
+            "content_hash": hashlib.sha256(incoming_content.encode("utf-8")).hexdigest(),
+        }
+    ]
+
+    out = FilesystemSnapshotResolver().augment(project_id=str(root), files=incoming)
+    by_path = {item["path"]: item for item in out}
+    assert by_path["docs/api/health-endpoint.md"]["content"] == incoming_content
+
+
+def test_augment_ignores_files_outside_root_docs(tmp_path) -> None:
+    root = tmp_path / "repo"
+    root.mkdir()
+    (root / "docs").mkdir()
+    (root / "docs" / "README.md").write_text("docs", encoding="utf-8")
+    (root / "src").mkdir()
+    (root / "src" / "app.py").write_text("print('x')", encoding="utf-8")
+
+    out = FilesystemSnapshotResolver().augment(project_id=str(root), files=[])
+    paths = {item["path"] for item in out}
+    assert "docs/README.md" in paths
+    assert "src/app.py" not in paths
+
+
+def test_augment_keeps_docs_when_request_uses_absolute_paths(tmp_path) -> None:
+    root = tmp_path / "repo"
+    root.mkdir()
+    (root / "docs" / "api").mkdir(parents=True)
+    (root / "docs" / "api" / "health-endpoint.md").write_text("disk", encoding="utf-8")
+
+    absolute_docs = str(root / "docs" / "api" / "control-actions-endpoint.md")
+    absolute_src = str(root / "src" / "app.py")
+    incoming = [
+        {"path": absolute_docs, "content": "req-doc", "content_hash": hashlib.sha256(b"req-doc").hexdigest()},
+        {"path": absolute_src, "content": "req-src", "content_hash": hashlib.sha256(b"req-src").hexdigest()},
+    ]
+
+    out = FilesystemSnapshotResolver().augment(project_id=str(root), files=incoming)
+    paths = {item["path"] for item in out}
+    assert "docs/api/control-actions-endpoint.md" in paths
+    assert "src/app.py" not in paths
@@ -304,3 +304,34 @@ Read health
    assert RagLayer.DOCS_INTEGRATION_INDEX in layers
    assert "docs integration parse warning" in caplog.text
    assert all(doc.source.path == "docs/api/health-endpoint.md" for doc in docs)
+
+
+def test_docs_pipeline_tolerates_broken_frontmatter_and_keeps_api_type() -> None:
+    pipeline = DocsIndexingPipeline()
+    content = """---
+id: api.control_actions_endpoint
+type: api_method
+doc_type: api_method
+title: HTTP API /actions/{action}
+endpoint: GET|POST /actions/{action}
+links:
+  called_by:
+    - ext.operator
+  tags:
+- api
+---
+# HTTP API /actions/{action}
+
+## Summary
+
+Control actions endpoint.
+"""
+    docs = pipeline.index_file(
+        repo_id="acme/proj",
+        commit_sha="abc123",
+        path="docs/api/control-actions-endpoint.md",
+        content=content,
+    )
+    catalog = next(doc for doc in docs if doc.layer == RagLayer.DOCS_DOCUMENT_CATALOG)
+    assert catalog.metadata["type"] == "api_method"
+    assert catalog.metadata["title"] == "HTTP API /actions/{action}"