фиксирую состояние

This commit is contained in:
2026-04-07 21:41:27 +03:00
parent bc29d51a29
commit 8fb76bb331
56 changed files with 7011 additions and 316 deletions
@@ -4,17 +4,17 @@ from app.core.agent.processes.v2.models import V2AnchorType, V2RouteAnchors, V2R
def anchor_signal_types(route: V2RouteResult) -> set[str]:
hints = [str(item).strip().lower() for item in route.anchors.target_doc_hints if str(item or "").strip()]
texts = _signal_texts(route)
signals: set[str] = set()
if route.subintent == V2Subintent.FIND_FILES:
signals.add(V2AnchorType.FIND_FILES)
if route.anchors.endpoint_paths or _has_hint(hints, "/api/"):
if route.anchors.endpoint_paths or _has_any(texts, ("/api/", "api", "endpoint")):
signals.add(V2AnchorType.API_ENDPOINT)
if _has_hint(hints, "/architecture/"):
if _has_any(texts, ("/architecture/", "architecture", "arch")):
signals.add(V2AnchorType.ARCHITECTURE)
if _has_hint(hints, "/logic/"):
if _has_any(texts, ("/logic/", "logic", "workflow", "flow", "process")):
signals.add(V2AnchorType.LOGIC_FLOW)
if _has_hint(hints, "/domains/"):
if route.anchors.entity_names or _has_any(texts, ("/domains/", "domain", "entity", "component")):
signals.add(V2AnchorType.DOMAIN_ENTITY)
return signals
@@ -44,5 +44,14 @@ def anchors_have_signal(anchors: V2RouteAnchors, signal: str, *, subintent: str
return signal in anchor_signal_types(route)
def _has_hint(hints: list[str], marker: str) -> bool:
return any(marker in hint for hint in hints)
def _signal_texts(route: V2RouteResult) -> list[str]:
items = [
*route.anchors.target_doc_hints,
*route.anchors.file_names,
*route.anchors.matched_aliases,
]
return [str(item).strip().lower() for item in items if str(item or "").strip()]
def _has_any(items: list[str], markers: tuple[str, ...]) -> bool:
return any(marker in item for item in items for marker in markers)
@@ -11,6 +11,8 @@ from app.core.rag.contracts.enums import RagLayer
class DocsEvidenceAssembler:
_API_PATH_PREFIXES = ("docs/api/", "docs/endpoints/", "docs/methods/", "api/", "endpoints/", "methods/")
_GENERIC_DOC_MARKERS = ("readme", "overview", "index", "navigation", "related docs", "catalog")
def assemble_summaries(self, rows: list[dict], route: V2RouteResult) -> list[RetrievedSummary]:
items = self._rank_rows(rows, route, mode="summary")
ranked = [
@@ -71,10 +73,12 @@ class DocsEvidenceAssembler:
"score": score,
"score_breakdown": breakdown,
"match_reason": self._match_reason(breakdown),
"is_generic_doc": self._is_generic_doc(path, self._title(row, path), self._summary(row), row),
}
)
ranked.sort(key=lambda item: (-item["score"], item["path"]))
return self._ensure_target_docs_in_top_k(ranked, route, k=4 if mode == "find_files" else 3)
ranked = self._ensure_target_docs_in_top_k(ranked, route, k=4 if mode == "find_files" else 3)
return self._promote_specific_primary(ranked, route)
def _score_breakdown(self, row: dict, route: V2RouteResult, *, mode: str) -> dict[str, int]:
path_raw = self._path(row)
@@ -93,6 +97,7 @@ class DocsEvidenceAssembler:
"alias_match": 0,
"anchor_boost": 0,
"target_doc_boost": 0,
"specificity_boost": 0,
"generic_penalty": 0,
}
if route.intent == "GENERAL_QA":
@@ -100,6 +105,7 @@ class DocsEvidenceAssembler:
hint_norm_lower = {normalize_doc_path(h).lower() for h in route.anchors.target_doc_hints if str(h or "").strip()}
if normalize_doc_path(path_raw).lower() in hint_norm_lower:
breakdown["target_doc_boost"] += 1000
hint_texts = [str(hint or "").strip().lower() for hint in route.anchors.target_doc_hints if str(hint or "").strip()]
if any(alias.lower() in " ".join([path, title, summary, entity]) for alias in route.anchors.matched_aliases):
breakdown["alias_match"] += 500
for token in query_tokens:
@@ -111,10 +117,25 @@ class DocsEvidenceAssembler:
breakdown["semantic"] += 20
if self._compact(token) in compact_haystack:
breakdown["alias_match"] += 250
for hint in hint_texts:
compact_hint = self._compact(hint)
if compact_hint and compact_hint in compact_haystack:
breakdown["target_doc_boost"] += 180
elif hint and hint.strip("/") in " ".join([path, title, summary, entity]):
breakdown["semantic"] += 70
endpoint_text = self._summary(row).lower()
for endpoint in route.anchors.endpoint_paths:
normalized_endpoint = endpoint.strip().lower()
endpoint_slug = normalized_endpoint.strip("/")
if normalized_endpoint and normalized_endpoint in endpoint_text:
breakdown["target_doc_boost"] += 260
if endpoint_slug and endpoint_slug in filename:
breakdown["filename_match"] += 200
if any(endpoint.strip("/").lower() in filename for endpoint in route.anchors.endpoint_paths):
breakdown["filename_match"] += 200
signals = anchor_signal_types(route)
breakdown["anchor_boost"] += self._anchor_boost(path, signals)
breakdown["specificity_boost"] += self._specificity_boost(row, path, title, summary, route)
breakdown["generic_penalty"] += self._generic_penalty(path, signals)
if mode == "find_files":
breakdown["path_match"] *= 3
@@ -125,8 +146,8 @@ class DocsEvidenceAssembler:
def _anchor_boost(self, path: str, signals: set[str]) -> int:
boost = 0
if V2AnchorType.API_ENDPOINT in signals and path.startswith("docs/api/"):
boost += 300
if V2AnchorType.API_ENDPOINT in signals and path.startswith(self._API_PATH_PREFIXES):
boost += 360
if V2AnchorType.LOGIC_FLOW in signals and path.startswith("docs/logic/"):
boost += 300
if V2AnchorType.DOMAIN_ENTITY in signals and path.startswith("docs/domains/"):
@@ -139,8 +160,11 @@ class DocsEvidenceAssembler:
def _generic_penalty(self, path: str, signals: set[str]) -> int:
penalty = 0
lowered = path.lower()
if path == "docs/README.md" and V2AnchorType.ARCHITECTURE not in signals:
penalty -= 200
penalty -= 260
if any(marker in lowered for marker in ("/readme", "readme.md", "/index", "/overview", "/catalog", "/navigation")):
penalty -= 220
if "/architecture/" in path and V2AnchorType.ARCHITECTURE not in signals and signals.intersection(
{V2AnchorType.API_ENDPOINT, V2AnchorType.DOMAIN_ENTITY}
):
@@ -173,6 +197,17 @@ class DocsEvidenceAssembler:
top.sort(key=lambda item: (-item["score"], item["path"]))
return top + remaining
def _promote_specific_primary(self, ranked: list[dict], route: V2RouteResult) -> list[dict]:
if len(ranked) < 2:
return ranked
first = ranked[0]
if not first.get("is_generic_doc"):
return ranked
promoted = next((item for item in ranked[1:] if not item.get("is_generic_doc") and self._is_specific_candidate(item, route)), None)
if promoted is None:
return ranked
return [promoted] + [item for item in ranked if item["path"] != promoted["path"]]
def _match_reason(self, breakdown: dict[str, int]) -> str:
if breakdown["target_doc_boost"] > 0:
return "exact_path"
@@ -189,6 +224,53 @@ class DocsEvidenceAssembler:
section = str(metadata.get("section_path") or "").lower()
return "summary" in section or "свод" in section or "overview" in section
def _specificity_boost(self, row: dict, path: str, title: str, summary: str, route: V2RouteResult) -> int:
    """Score how specific a row looks for the routed query.

    Adds fixed increments for: a non-generic document (+90), a path under one
    of ``_API_PATH_PREFIXES`` (+160), an "endpoint"/"method" mention in the
    file name or title (+120), a ``DOCS_DOC_CHUNKS`` row that is not a
    navigation chunk (+80), and, per query token, a hit in the file name
    (+90), title (+70) or summary (+40).
    """
    file_name = path.rsplit("/", 1)[-1]
    title_lc = title.lower()
    summary_lc = summary.lower()

    total = 0 if self._is_generic_doc(path, title, summary, row) else 90
    if path.startswith(self._API_PATH_PREFIXES):
        total += 160
    if "endpoint" in file_name or "endpoint" in title_lc or "method" in title_lc:
        total += 120
    if row.get("layer") == RagLayer.DOCS_DOC_CHUNKS and not self._looks_like_navigation_chunk(row):
        total += 80

    # Each field contributes independently, so one token can score up to three times.
    weighted_fields = ((file_name, 90), (title_lc, 70), (summary_lc, 40))
    for token in self._query_tokens(route):
        if not token:
            continue
        for text, weight in weighted_fields:
            if token in text:
                total += weight
    return total
def _is_specific_candidate(self, item: dict, route: V2RouteResult) -> bool:
breakdown = dict(item.get("score_breakdown") or {})
if breakdown.get("target_doc_boost", 0) > 0:
return True
if breakdown.get("specificity_boost", 0) >= 160:
return True
return V2AnchorType.API_ENDPOINT in anchor_signal_types(route) and item["path"].startswith(self._API_PATH_PREFIXES)
def _is_generic_doc(self, path: str, title: str, summary: str, row: dict) -> bool:
haystack = " ".join([path.lower(), title.lower(), summary.lower()])
if any(marker in haystack for marker in self._GENERIC_DOC_MARKERS):
return True
return self._looks_like_navigation_chunk(row)
def _looks_like_navigation_chunk(self, row: dict) -> bool:
text = self._summary(row).lower()
if not text:
return False
lines = [line.strip() for line in text.splitlines() if line.strip()]
bullet_lines = sum(1 for line in lines if line.startswith(("- ", "* ", "1.", "2.", "3.")))
link_lines = sum(1 for line in lines if "](" in line or line.startswith("docs/"))
if "related docs" in text or "navigation" in text:
return True
return bullet_lines >= 3 or link_lines >= 3
def _query_tokens(self, route: V2RouteResult) -> list[str]:
values = list(route.target_terms) + list(route.anchors.matched_aliases)
tokens: list[str] = []
@@ -8,6 +8,7 @@ class QueryFeatures:
normalized_query: str
target_terms: list[str]
endpoint_paths: list[str]
file_names: list[str]
matched_aliases: list[str]
target_doc_hints: list[str]
file_markers: list[str]
@@ -34,10 +34,42 @@ class _MarkerScanner:
"где описано",
"документ с описанием",
)
_ARCHITECTURE_MARKERS = ("архитектура", "как устроено приложение", "как устроен сервис", "основные части системы", "из чего состоит")
_LOGIC_MARKERS = ("цикл", "loop", "worker", "как работает отправка уведомлений", "логика отправки", "background job", "runtime loop")
_ARCHITECTURE_MARKERS = (
"архитектура",
"архитектур",
"architecture",
"arch overview",
"как устроено приложение",
"как устроен сервис",
"основные части системы",
"из чего состоит",
)
_LOGIC_MARKERS = (
"цикл",
"loop",
"flow",
"workflow",
"process",
"worker",
"как работает отправка уведомлений",
"логика отправки",
"background job",
"runtime loop",
)
_DOMAIN_MARKERS = ("runtime health", "health model", "статусы здоровья", "сущность", "entity", "здоровье runtime")
_ENDPOINT_MARKERS = ("endpoint", "метод api", "ручка", "эндпоинт")
_ENDPOINT_MARKERS = (
"endpoint",
"api",
"route",
"method",
"метод api",
"метод",
"метода",
"ручка",
"эндпоинт",
"маршрут",
"роут",
)
def scan(self, lowered_query: str) -> dict[str, list[str]]:
return {
@@ -54,12 +86,13 @@ class _MarkerScanner:
class _EntityNameExtractor:
_ENTITY_RE = re.compile(r"\b[A-Z][A-Za-z0-9_]+\b")
_IGNORE = {"arch"}
def extract(self, query: str) -> list[str]:
items: list[str] = []
for match in self._ENTITY_RE.finditer(query):
candidate = match.group(0).strip()
if candidate and candidate not in items:
if candidate and candidate.lower() not in self._IGNORE and candidate not in items:
items.append(candidate)
return items
@@ -92,33 +125,61 @@ class _FileNameExtractor:
items.append(value)
class _ProcessAnchorExtractor:
_DOMAIN_KEYWORDS = {
"billing": "billing",
"notifications": "notifications",
}
_SUBDOMAIN_KEYWORDS = {
"invoice": ("billing", "invoice"),
"invoices": ("billing", "invoice"),
"delivery_loop": ("notifications", "delivery_loop"),
"delivery": ("notifications", "delivery_loop"),
}
def extract(self, lowered_query: str) -> tuple[str | None, str | None]:
domain = next((value for token, value in self._DOMAIN_KEYWORDS.items() if token in lowered_query), None)
subdomain: str | None = None
for token, mapping in self._SUBDOMAIN_KEYWORDS.items():
if token in lowered_query:
domain = domain or mapping[0]
subdomain = mapping[1]
break
return domain, subdomain
class V2AnchorExtractor:
def __init__(
self,
marker_scanner: _MarkerScanner | None = None,
entity_extractor: _EntityNameExtractor | None = None,
file_name_extractor: _FileNameExtractor | None = None,
process_anchor_extractor: _ProcessAnchorExtractor | None = None,
) -> None:
self._marker_scanner = marker_scanner or _MarkerScanner()
self._entity_extractor = entity_extractor or _EntityNameExtractor()
self._file_name_extractor = file_name_extractor or _FileNameExtractor()
self._process_anchor_extractor = process_anchor_extractor or _ProcessAnchorExtractor()
def extract(self, normalized_query: str, terms: TargetTermsAnalysis) -> AnchorAnalysis:
markers = self._marker_scanner.scan(normalized_query.lower())
lowered_query = normalized_query.lower()
markers = self._marker_scanner.scan(lowered_query)
process_domain, process_subdomain = self._process_anchor_extractor.extract(lowered_query)
anchors = V2RouteAnchors(
entity_names=self._entity_extractor.extract(normalized_query),
file_names=self._file_name_extractor.extract(normalized_query),
endpoint_paths=list(terms.endpoint_paths),
target_doc_hints=self._target_doc_hints(
endpoint_paths=terms.endpoint_paths,
api_like_terms=terms.api_like_terms,
alias_docs=terms.alias_docs,
architecture_markers=markers["architecture_markers"],
logic_markers=markers["logic_markers"],
domain_markers=markers["domain_markers"],
),
matched_aliases=list(terms.matched_aliases),
process_domain=None,
process_subdomain=None,
process_domain=process_domain,
process_subdomain=process_subdomain,
)
return AnchorAnalysis(
anchors=anchors,
@@ -133,6 +194,7 @@ class V2AnchorExtractor:
self,
*,
endpoint_paths: list[str],
api_like_terms: list[str],
alias_docs: list[str],
architecture_markers: list[str],
logic_markers: list[str],
@@ -145,13 +207,41 @@ class V2AnchorExtractor:
"/actions/{action}": "docs/api/control-actions-endpoint.md",
}
for endpoint in endpoint_paths:
for hint in self._endpoint_hint_variants(endpoint):
self._append_unique(hints, hint)
hint = endpoint_map.get(endpoint)
if hint and hint not in hints:
hints.append(hint)
if architecture_markers and "docs/architecture/telegram-notify-app-overview.md" not in hints:
hints.append("docs/architecture/telegram-notify-app-overview.md")
if logic_markers and "docs/logic/telegram-notification-loop.md" not in hints:
hints.append("docs/logic/telegram-notification-loop.md")
if domain_markers and "docs/domains/runtime-health-entity.md" not in hints:
hints.append("docs/domains/runtime-health-entity.md")
self._append_unique(hints, hint)
for term in api_like_terms:
for hint in self._api_like_hint_variants(term):
self._append_unique(hints, hint)
if architecture_markers:
self._append_unique(hints, "docs/architecture/telegram-notify-app-overview.md")
if logic_markers:
self._append_unique(hints, "docs/logic/telegram-notification-loop.md")
if domain_markers:
self._append_unique(hints, "docs/domains/runtime-health-entity.md")
return hints
def _endpoint_hint_variants(self, endpoint: str) -> list[str]:
normalized = str(endpoint or "").strip().lower()
if not normalized:
return []
slug = normalized.strip("/").replace("/", "-").replace("{", "").replace("}", "")
leaf = next((part for part in reversed(slug.split("-")) if part and part != "id"), "")
hints: list[str] = [normalized]
for value in (slug, leaf):
if not value:
continue
hints.extend([value, f"{value}-endpoint", f"{value} endpoint"])
return list(dict.fromkeys(hints))
def _api_like_hint_variants(self, term: str) -> list[str]:
normalized = str(term or "").strip().lower().lstrip("/")
if not normalized:
return []
return [normalized, f"/{normalized}", f"{normalized}-endpoint", f"{normalized} endpoint"]
def _append_unique(self, items: list[str], value: str | None) -> None:
normalized = str(value or "").strip()
if normalized and normalized not in items:
items.append(normalized)
@@ -8,6 +8,7 @@ from dataclasses import dataclass
class TargetTermsAnalysis:
target_terms: list[str]
endpoint_paths: list[str]
api_like_terms: list[str]
matched_aliases: list[str]
alias_docs: list[str]
@@ -26,7 +27,7 @@ class _AliasMatcher:
_AliasRule(("control actions", "управление runtime"), "/actions/{action}", "docs/api/control-actions-endpoint.md"),
_AliasRule(("runtime health", "здоровье runtime", "статусы здоровья"), "runtime_health", "docs/domains/runtime-health-entity.md"),
_AliasRule(("цикл отправки уведомлений", "notification loop", "worker loop"), "telegram-notify-loop", "docs/logic/telegram-notification-loop.md"),
_AliasRule(("архитектура приложения", "overview"), "architecture_overview", "docs/architecture/telegram-notify-app-overview.md"),
_AliasRule(("архитектура приложения",), "architecture_overview", "docs/architecture/telegram-notify-app-overview.md"),
_AliasRule(("архитектура",), "architecture_overview", "docs/architecture/telegram-notify-app-overview.md"),
_AliasRule(("каталог ошибок", "errors catalog"), "errors_catalog", "docs/errors/catalog.yaml"),
_AliasRule(("файл-индекс документации", "docs index", "индекс документации"), "docs_index", "docs/README.md"),
@@ -51,6 +52,7 @@ class _AliasMatcher:
class _EndpointPathExtractor:
_PATH_RE = re.compile(r"`([^`]+)`|(/[A-Za-z0-9_./{}-]+)")
_VALID_ENDPOINT_RE = re.compile(r"^/[a-z0-9._/-]+(?:/\{[a-z0-9_]+\})?$")
_DOC_EXTENSIONS = (".md", ".yaml", ".yml", ".json")
def extract(self, query: str) -> list[str]:
values: list[str] = []
@@ -68,28 +70,161 @@ class _EndpointPathExtractor:
return trimmed.lower()
def _is_endpoint(self, token: str) -> bool:
return bool(token and self._VALID_ENDPOINT_RE.fullmatch(token))
if not token or not self._VALID_ENDPOINT_RE.fullmatch(token):
return False
return not token.endswith(self._DOC_EXTENSIONS)
def _append_unique(self, items: list[str], value: str) -> None:
if value and value not in items:
items.append(value)
@dataclass(slots=True)
class _ApiLikeAnchorAnalysis:
endpoint_paths: list[str]
candidate_terms: list[str]
class _ApiLikeAnchorExtractor:
_TOKEN_RE = re.compile(r"[A-Za-zА-Яа-я0-9_./{}-]+")
_ASCII_ENDPOINT_RE = re.compile(r"^[a-z0-9]+(?:[-_][a-z0-9]+)*$")
_API_MARKERS = {
"api",
"endpoint",
"route",
"method",
"метод",
"метода",
"методу",
"ручка",
"ручки",
"эндпоинт",
"эндпоинта",
"маршрут",
"роут",
}
_EXPLAIN_MARKERS = {
"как",
"что",
"делает",
"работает",
"объясни",
"объяснить",
"расскажи",
"опиши",
"смысл",
}
_NOISE_WORDS = _API_MARKERS | _EXPLAIN_MARKERS | {
"про",
"какой",
"какая",
"какие",
"какого",
"какую",
"кратко",
"нужен",
"нужно",
"у",
}
_SHORT_QUERY_TOKEN_LIMIT = 7
def extract(self, query: str, explicit_endpoint_paths: list[str]) -> _ApiLikeAnchorAnalysis:
if explicit_endpoint_paths:
return _ApiLikeAnchorAnalysis(endpoint_paths=list(explicit_endpoint_paths), candidate_terms=[])
token_entries = self._token_entries(query)
if not token_entries:
return _ApiLikeAnchorAnalysis(endpoint_paths=[], candidate_terms=[])
candidate_terms = [token for token, _start in token_entries if self._is_api_candidate(token)]
if not candidate_terms:
return _ApiLikeAnchorAnalysis(endpoint_paths=[], candidate_terms=[])
if self._has_api_marker(token_entries):
primary = self._primary_candidate(token_entries)
endpoint_paths = [self._ensure_endpoint(primary)] if primary else []
return _ApiLikeAnchorAnalysis(
endpoint_paths=[path for path in endpoint_paths if path],
candidate_terms=[primary] if primary else [],
)
if self._is_short_explain_query(token_entries) and len(candidate_terms) == 1:
return _ApiLikeAnchorAnalysis(endpoint_paths=[], candidate_terms=list(candidate_terms))
return _ApiLikeAnchorAnalysis(endpoint_paths=[], candidate_terms=[])
def _token_entries(self, query: str) -> list[tuple[str, int]]:
entries: list[tuple[str, int]] = []
for match in self._TOKEN_RE.finditer(query):
token = str(match.group(0) or "").strip().strip("`'\"()[]!?.,:;").lower()
if token:
entries.append((token, match.start()))
return entries
def _has_api_marker(self, token_entries: list[tuple[str, int]]) -> bool:
return any(token in self._API_MARKERS for token, _start in token_entries)
def _is_short_explain_query(self, token_entries: list[tuple[str, int]]) -> bool:
if len(token_entries) > self._SHORT_QUERY_TOKEN_LIMIT:
return False
return any(token in self._EXPLAIN_MARKERS for token, _start in token_entries)
def _primary_candidate(self, token_entries: list[tuple[str, int]]) -> str | None:
marker_positions = [start for token, start in token_entries if token in self._API_MARKERS]
candidates = [(token, start) for token, start in token_entries if self._is_api_candidate(token)]
if not candidates:
return None
if not marker_positions:
return candidates[-1][0]
primary = min(
candidates,
key=lambda item: min(abs(item[1] - marker_pos) for marker_pos in marker_positions),
)
return primary[0]
def _is_api_candidate(self, token: str) -> bool:
if (
not token
or token in self._NOISE_WORDS
or token.startswith("docs/")
or token.endswith((".md", ".yaml", ".yml", ".json"))
):
return False
if token.startswith("/"):
return True
return self._ASCII_ENDPOINT_RE.fullmatch(token) is not None and len(token) >= 3
def _ensure_endpoint(self, token: str) -> str:
return token if token.startswith("/") else f"/{token}"
class _TermCollector:
_TOKEN_RE = re.compile(r"[A-Za-zА-Яа-я0-9_./{}-]+")
_IDENTIFIER_RE = re.compile(
r"^(?:[a-z0-9]+(?:[_-][a-z0-9]+)+|[a-z]+[A-Z][A-Za-z0-9]+|(?:[A-Z][a-z0-9]+){2,})$"
)
_QUESTION_WORDS = {"что", "как", "где", "какой", "какие", "каком", "когда", "чего"}
_INTENT_WORDS = {"объясни", "покажи", "найди", "расскажи", "дай", "опиши", "нужен"}
_FILLER_WORDS = {"про", "там", "тут", "плз"}
_INTENT_WORDS = {"объясни", "покажи", "найди", "расскажи", "дай", "опиши", "нужен", "show"}
_FILLER_WORDS = {"про", "там", "тут", "плз", "pls", "for"}
_MARKER_WORDS = {
"файл",
"файле",
"file",
"method",
"метод",
"метода",
"методу",
"route",
"ручка",
"ручки",
"эндпоинт",
"эндпоинта",
"overview",
"architecture",
"arch",
"flow",
"process",
"workflow",
"док",
"дока",
"доках",
"документ",
"doc",
"описан",
"док-саммари",
"summary",
@@ -115,6 +250,7 @@ class _TermCollector:
"service",
"summary",
"endpoint",
"docs",
}
_MAX_TERMS = 7
@@ -191,19 +327,23 @@ class V2TargetTermsExtractor:
self,
alias_matcher: _AliasMatcher | None = None,
endpoint_extractor: _EndpointPathExtractor | None = None,
api_like_extractor: _ApiLikeAnchorExtractor | None = None,
term_collector: _TermCollector | None = None,
) -> None:
self._alias_matcher = alias_matcher or _AliasMatcher()
self._endpoint_extractor = endpoint_extractor or _EndpointPathExtractor()
self._api_like_extractor = api_like_extractor or _ApiLikeAnchorExtractor()
self._term_collector = term_collector or _TermCollector()
def extract(self, normalized_query: str) -> TargetTermsAnalysis:
lowered = normalized_query.lower()
endpoint_paths = self._endpoint_extractor.extract(normalized_query)
api_like = self._api_like_extractor.extract(normalized_query, endpoint_paths)
alias_terms, alias_docs, alias_hits = self._alias_matcher.match(lowered)
return TargetTermsAnalysis(
target_terms=self._term_collector.collect(normalized_query, alias_terms, endpoint_paths),
endpoint_paths=endpoint_paths,
target_terms=self._term_collector.collect(normalized_query, alias_terms, api_like.endpoint_paths),
endpoint_paths=api_like.endpoint_paths,
api_like_terms=api_like.candidate_terms,
matched_aliases=alias_hits,
alias_docs=alias_docs,
)
@@ -44,6 +44,7 @@ class V2IntentRouter:
normalized_query=normalized_query,
target_terms=list(target_terms_analysis.target_terms),
endpoint_paths=list(target_terms_analysis.endpoint_paths),
file_names=list(anchor_analysis.anchors.file_names),
matched_aliases=list(target_terms_analysis.matched_aliases),
target_doc_hints=list(anchor_analysis.anchors.target_doc_hints),
file_markers=list(anchor_analysis.file_markers),
@@ -58,6 +59,7 @@ class V2IntentRouter:
anchors=anchor_analysis.anchors,
)
llm_result = self._validator.validate(llm_candidate)
llm_result = self._apply_deterministic_corrections(llm_result, features)
if llm_result is not None:
confidence = self._confidence_adjuster.adjust(float(llm_result["confidence"]), features)
return V2RouteResult(
@@ -99,3 +101,18 @@ class V2IntentRouter:
)
except Exception:
return None
def _apply_deterministic_corrections(self, candidate: dict | None, features: QueryFeatures) -> dict | None:
if candidate is None:
return None
if candidate.get("routing_domain") == "DOCS" and self._should_force_find_files(features):
corrected = dict(candidate)
corrected["subintent"] = "FIND_FILES"
return corrected
return candidate
def _should_force_find_files(self, features: QueryFeatures) -> bool:
if features.file_markers or features.file_names:
return True
query = features.normalized_query.lower()
return "show doc" in query or "show file" in query or "doc for" in query
@@ -6,7 +6,7 @@ from app.core.agent.processes.v2.models import V2Subintent
class DocsSubintentResolver:
def resolve(self, features: QueryFeatures) -> str | None:
if features.file_markers:
if features.file_markers or self._has_file_like_anchor(features):
return V2Subintent.FIND_FILES
if any(
(
@@ -20,3 +20,9 @@ class DocsSubintentResolver:
):
return V2Subintent.SUMMARY
return None
def _has_file_like_anchor(self, features: QueryFeatures) -> bool:
return any(
hint.endswith((".md", ".yaml", ".yml", ".json"))
for hint in features.target_doc_hints
) or any(token.endswith((".md", ".yaml", ".yml", ".json")) for token in features.file_names)
+7 -60
View File
@@ -14,7 +14,6 @@ from app.core.agent.processes.v2.retrieval.target_doc_seeding import (
merge_row_lists,
normalize_doc_path,
normalized_path_set,
path_variants_for_rag_query,
row_path,
seed_candidates_from_target_hints,
)
@@ -121,11 +120,9 @@ class V2Process(AgentProcess):
"retrieval_profile_selected",
{"profile": plan.profile, "layers": plan.layers, "filters": plan.filters},
)
seeded_rows = await self._seed_candidates_from_target_hints(rag_session_id, plan.layers, route)
semantic_rows = await self._rag_adapter.fetch_rows(rag_session_id, route.normalized_query, plan)
metadata_rows = self._metadata_lookup_candidates([*seeded_rows, *semantic_rows], route)
rows = self._merge_candidate_rows(seeded_rows, metadata_rows, semantic_rows)
rows = await self._ensure_target_hints_in_pool(rag_session_id, rows, route)
retrieved_rows = await self._rag_adapter.fetch_rows(rag_session_id, route.normalized_query, plan)
metadata_rows = self._metadata_lookup_candidates(retrieved_rows, route)
rows = self._merge_candidate_rows(retrieved_rows, metadata_rows)
rows = seed_candidates_from_target_hints(rows, route.anchors.target_doc_hints, RagRowIndex(rows))
self._print_missing_target_hints(route, rows)
context.trace.module("process.v2.rag_retrieval").log(
@@ -150,9 +147,9 @@ class V2Process(AgentProcess):
"target_doc_hints": route.anchors.target_doc_hints,
"candidate_docs_before_ranking": [self._trace_row(row) for row in rows[:8]],
"sources": {
"seeded": [self._trace_row(row) for row in seeded_rows[:5]],
"seeded": [self._trace_row(row) for row in retrieved_rows[:5] if row_path(row) in {normalize_doc_path(h) for h in route.anchors.target_doc_hints}],
"metadata_lookup": [self._trace_row(row) for row in metadata_rows[:5]],
"semantic": [self._trace_row(row) for row in semantic_rows[:5]],
"semantic": [self._trace_row(row) for row in retrieved_rows[:5]],
},
},
)
@@ -262,61 +259,11 @@ class V2Process(AgentProcess):
if not str(hint or "").strip():
continue
normalized = normalize_doc_path(hint)
if not normalized.startswith("docs/") or "." not in normalized.rsplit("/", 1)[-1]:
continue
if normalized not in candidate_paths:
print("ERROR: target doc missing from candidates:", normalized)
async def _ensure_target_hints_in_pool(self, rag_session_id: str, rows: list[dict], route) -> list[dict]:
hints_raw = [str(item).strip() for item in route.anchors.target_doc_hints if str(item or "").strip()]
if not hints_raw:
return rows
pool = normalized_path_set(rows)
missing_hints = [h for h in hints_raw if normalize_doc_path(h) not in pool]
if not missing_hints:
return rows
variant_paths: list[str] = []
for h in missing_hints:
variant_paths.extend(path_variants_for_rag_query(h))
variant_paths = list(dict.fromkeys(variant_paths))
extra_exact = await self._rag_adapter.fetch_exact_paths(rag_session_id, paths=variant_paths, layers=None)
pool2 = normalized_path_set(extra_exact)
still_missing = [h for h in missing_hints if normalize_doc_path(h) not in pool2]
fallback_rows: list[dict] = []
if still_missing:
needles = [normalize_doc_path(h).split("/")[-1] for h in still_missing]
needles = list(dict.fromkeys(n for n in needles if n))
if needles:
fallback_rows = await self._rag_adapter.fetch_chunks_by_path_substrings(
rag_session_id,
path_needles=needles,
layers=None,
)
return merge_row_lists(rows, extra_exact, fallback_rows)
async def _seed_candidates_from_target_hints(self, rag_session_id: str, layers: list[str], route) -> list[dict]:
del layers # seed по пути должен видеть все слои (иначе D0-only чанки теряются при file_lookup).
hints_raw = [str(item).strip() for item in route.anchors.target_doc_hints if str(item or "").strip()]
if not hints_raw:
return []
variant_paths: list[str] = []
for h in hints_raw:
variant_paths.extend(path_variants_for_rag_query(h))
variant_paths = list(dict.fromkeys(variant_paths))
exact_rows = await self._rag_adapter.fetch_exact_paths(rag_session_id, paths=variant_paths, layers=None)
paths_found = normalized_path_set(exact_rows)
missing = [h for h in hints_raw if normalize_doc_path(h) not in paths_found]
if not missing:
return exact_rows
needles = [normalize_doc_path(h).split("/")[-1] for h in missing]
needles = list(dict.fromkeys(n for n in needles if n))
if not needles:
return exact_rows
fallback_rows = await self._rag_adapter.fetch_chunks_by_path_substrings(
rag_session_id,
path_needles=needles,
layers=None,
)
return merge_row_lists(exact_rows, fallback_rows)
def _metadata_lookup_candidates(self, rows: list[dict], route) -> list[dict]:
return DocsMetadataLookupIndex(rows).lookup(route)
@@ -1,4 +1,4 @@
"""Intent-aware retrieval policy resolver для процесса v2."""
"""Intent-aware retrieval policy resolver for process v2."""
from __future__ import annotations
@@ -8,91 +8,113 @@ from app.core.rag.contracts.enums import RagLayer
from app.core.rag.retrieval.session_retriever import RetrievalPlan
class V2RetrievalPolicyResolver:
_SUMMARY_LAYERS = [
RagLayer.DOCS_DOCUMENT_CATALOG,
RagLayer.DOCS_ENTITY_CATALOG,
RagLayer.DOCS_DOC_CHUNKS,
]
_GENERAL_LAYERS = [
RagLayer.DOCS_DOCUMENT_CATALOG,
RagLayer.DOCS_DOC_CHUNKS,
class _AnchorTermCollector:
def prefer_like_patterns(self, route: V2RouteResult) -> list[str]:
terms = self._hint_basenames(route)
terms.extend(route.anchors.endpoint_paths)
terms.extend(route.target_terms)
terms.extend(route.anchors.file_names)
terms.extend(route.anchors.entity_names)
terms.extend(route.anchors.matched_aliases)
terms.extend(self._process_terms(route))
return [f"%{term.lower()}%" for term in _unique_terms(terms)]
def find_files_patterns(self, route: V2RouteResult) -> list[str]:
if route.anchors.target_doc_hints:
return [f"%{name.lower()}%" for name in self._hint_basenames(route)]
return self.prefer_like_patterns(route)
def api_method_patterns(self, route: V2RouteResult) -> list[str]:
terms = self._hint_basenames(route)
terms.extend(route.anchors.target_doc_hints)
terms.extend(route.anchors.endpoint_paths)
terms.extend(route.target_terms)
patterns: list[str] = []
for term in _unique_terms(terms):
lowered = term.lower()
stripped = lowered.strip("/")
if stripped:
patterns.append(f"%{stripped}%")
if lowered:
patterns.append(f"%{lowered}%")
return _unique_terms(patterns)
def _hint_basenames(self, route: V2RouteResult) -> list[str]:
return [hint.rsplit("/", 1)[-1] for hint in route.anchors.target_doc_hints if str(hint).strip()]
def _process_terms(self, route: V2RouteResult) -> list[str]:
terms: list[str] = []
if route.anchors.process_domain:
terms.append(route.anchors.process_domain)
if route.anchors.process_subdomain:
terms.append(route.anchors.process_subdomain)
return terms
class _RouteFilterBuilder:
_API_DOC_PREFIXES = [
"docs/api/",
"docs/endpoints/",
"docs/methods/",
"api/",
"endpoints/",
"methods/",
]
def resolve(self, route: V2RouteResult) -> RetrievalPlan:
if route.intent == V2Intent.GENERAL_QA:
return RetrievalPlan(
profile="general_qa_grounded_summary",
layers=list(self._GENERAL_LAYERS),
limit=8,
filters=self._general_filters(route),
)
if route.subintent == V2Subintent.FIND_FILES:
return RetrievalPlan(
profile="file_lookup",
layers=[RagLayer.DOCS_DOCUMENT_CATALOG, RagLayer.DOCS_ENTITY_CATALOG],
limit=12,
filters=self._find_files_filters(route),
)
return RetrievalPlan(
profile=self._summary_profile(route),
layers=list(self._SUMMARY_LAYERS),
limit=8,
filters=self._summary_filters(route),
)
def __init__(self) -> None:
self._terms = _AnchorTermCollector()
def _summary_profile(self, route: V2RouteResult) -> str:
    """Map the route's single meaningful anchor signal to a summary profile.

    ``FIND_FILES`` is ignored. Unless exactly one other signal remains, the
    generic profile is returned.
    """
    meaningful = anchor_signal_types(route) - {V2AnchorType.FIND_FILES}
    if len(meaningful) != 1:
        return "docs_summary_generic"
    (only_signal,) = meaningful
    profile_by_signal = {
        V2AnchorType.API_ENDPOINT: "docs_summary_api_endpoint",
        V2AnchorType.ARCHITECTURE: "docs_summary_architecture",
        V2AnchorType.LOGIC_FLOW: "docs_summary_logic_flow",
        V2AnchorType.DOMAIN_ENTITY: "docs_summary_domain_entity",
    }
    return profile_by_signal.get(only_signal, "docs_summary_generic")
def _general_filters(self, route: V2RouteResult) -> dict[str, object]:
def general_filters(self, route: V2RouteResult) -> dict[str, object]:
return {
"prefer_path_prefixes": ["docs/architecture/", "docs/"],
"prefer_like_patterns": ["%README.md%", "%overview%"],
"prefer_like_patterns": ["%readme.md%", "%overview%"],
"target_doc_hints": list(route.anchors.target_doc_hints),
}
def _summary_filters(self, route: V2RouteResult) -> dict[str, object]:
filters: dict[str, object] = {
"prefer_path_prefixes": self._summary_prefixes(route),
"prefer_like_patterns": self._prefer_like_patterns(route),
"target_doc_hints": list(route.anchors.target_doc_hints),
}
def summary_filters(self, route: V2RouteResult) -> dict[str, object]:
if _is_api_method_explain(route):
return self.api_method_filters(route)
filters = self._base_filters(route)
filters["prefer_path_prefixes"] = self._summary_prefixes(route)
filters["prefer_like_patterns"] = self._terms.prefer_like_patterns(route)
if V2AnchorType.API_ENDPOINT in anchor_signal_types(route):
filters["path_prefixes"] = ["docs/api/", "docs/architecture/", "docs/"]
filters["path_prefixes"] = ["docs/api/", "docs/"]
return filters
def _find_files_filters(self, route: V2RouteResult) -> dict[str, object]:
def api_method_filters(self, route: V2RouteResult) -> dict[str, object]:
    """Return base filters restricted to, and preferring, the API doc prefixes.

    ``path_prefixes`` and ``prefer_path_prefixes`` each receive their own copy
    of ``_API_DOC_PREFIXES``; ``prefer_like_patterns`` comes from the term
    collector's ``api_method_patterns``.
    """
    filters = self._base_filters(route)
    api_prefixes = self._API_DOC_PREFIXES
    filters.update(
        path_prefixes=list(api_prefixes),
        prefer_path_prefixes=list(api_prefixes),
    )
    filters["prefer_like_patterns"] = self._terms.api_method_patterns(route)
    return filters
def find_files_filters(self, route: V2RouteResult) -> dict[str, object]:
    """Return base filters extended for a file-lookup route.

    ``path_prefixes`` is only set when ``_find_files_prefixes`` yields
    something; the preferred prefixes and ``LIKE`` patterns are always set.
    """
    result = self._base_filters(route)
    scoped = self._find_files_prefixes(route)
    extras: dict[str, object] = {"path_prefixes": scoped} if scoped else {}
    extras["prefer_path_prefixes"] = self._find_files_prefer_prefixes(route, scoped)
    extras["prefer_like_patterns"] = self._terms.find_files_patterns(route)
    result.update(extras)
    return result
def _base_filters(self, route: V2RouteResult) -> dict[str, object]:
filters: dict[str, object] = {
"prefer_path_prefixes": self._find_files_prefixes(route),
"prefer_like_patterns": self._prefer_like_patterns(route),
"target_doc_hints": list(route.anchors.target_doc_hints),
}
if route.anchors.target_doc_hints:
filters["prefer_like_patterns"] = [f"%{path.split('/')[-1]}%" for path in route.anchors.target_doc_hints]
if route.anchors.process_domain:
filters["metadata.domain"] = route.anchors.process_domain
if route.anchors.process_subdomain:
filters["metadata.subdomain"] = route.anchors.process_subdomain
return filters
def _prefer_like_patterns(self, route: V2RouteResult) -> list[str]:
patterns: list[str] = []
for path in route.anchors.target_doc_hints:
patterns.append(f"%{path.split('/')[-1]}%")
for endpoint in route.anchors.endpoint_paths:
patterns.append(f"%{endpoint}%")
return patterns
def _find_files_prefixes(self, route: V2RouteResult) -> list[str]:
if route.anchors.target_doc_hints:
prefixes = ["/".join(path.split("/")[:-1]) + "/" for path in route.anchors.target_doc_hints]
return [prefix for prefix in prefixes if prefix]
hint_prefixes = _prefixes_from_paths(route.anchors.target_doc_hints)
if hint_prefixes:
return hint_prefixes
file_prefixes = [name for name in route.anchors.file_names if str(name).strip().startswith("docs/")]
derived = _prefixes_from_paths(file_prefixes)
if derived:
return derived
signals = anchor_signal_types(route)
if V2AnchorType.API_ENDPOINT in signals:
return ["docs/api/", "docs/"]
@@ -104,6 +126,12 @@ class V2RetrievalPolicyResolver:
return ["docs/domains/", "docs/"]
return ["docs/"]
def _find_files_prefer_prefixes(self, route: V2RouteResult, prefixes: list[str]) -> list[str]:
    """Return the preferred path prefixes for a file lookup.

    Starts from a copy of ``prefixes``; when the route has a process domain or
    subdomain, the domains and logic doc folders are appended. Falls back to
    ``docs/`` when nothing was collected, and de-duplicates the result.
    """
    anchors = route.anchors
    has_process_scope = bool(anchors.process_domain or anchors.process_subdomain)
    if has_process_scope:
        preferred = [*prefixes, "docs/domains/", "docs/logic/"]
    else:
        preferred = [*prefixes]
    return _unique_terms(preferred or ["docs/"])
def _summary_prefixes(self, route: V2RouteResult) -> list[str]:
signals = anchor_signal_types(route)
prefixes: list[str] = []
@@ -114,5 +142,129 @@ class V2RetrievalPolicyResolver:
if V2AnchorType.LOGIC_FLOW in signals:
prefixes.extend(["docs/logic/", "docs/architecture/", "docs/"])
if V2AnchorType.DOMAIN_ENTITY in signals:
prefixes.extend(["docs/domains/", "docs/api/", "docs/architecture/"])
return list(dict.fromkeys(prefixes or ["docs/"]))
prefixes.extend(["docs/domains/", "docs/", "docs/api/"])
return _unique_terms(prefixes or ["docs/"])
class V2RetrievalPolicyResolver:
    """Turn a routed v2 request into a :class:`RetrievalPlan`.

    The plan's profile, RAG layers, row limit and filters depend on the
    route's intent/subintent; filters are delegated to ``_RouteFilterBuilder``.
    """

    # Layers queried for general QA.
    _GENERAL_LAYERS = [RagLayer.DOCS_DOCUMENT_CATALOG, RagLayer.DOCS_DOC_CHUNKS]
    # Layers queried for file lookups.
    _FIND_FILES_LAYERS = [RagLayer.DOCS_DOCUMENT_CATALOG, RagLayer.DOCS_ENTITY_CATALOG]
    # Layers queried for each summary profile, keyed by profile name.
    _SUMMARY_LAYERS = {
        "docs_api_method_explain": [
            RagLayer.DOCS_DOCUMENT_CATALOG,
            RagLayer.DOCS_FACT_INDEX,
            RagLayer.DOCS_DOC_CHUNKS,
        ],
        "docs_summary_api_endpoint": [
            RagLayer.DOCS_DOCUMENT_CATALOG,
            RagLayer.DOCS_FACT_INDEX,
            RagLayer.DOCS_DOC_CHUNKS,
        ],
        "docs_summary_logic_flow": [
            RagLayer.DOCS_WORKFLOW_INDEX,
            RagLayer.DOCS_DOCUMENT_CATALOG,
            RagLayer.DOCS_DOC_CHUNKS,
        ],
        "docs_summary_domain_entity": [
            RagLayer.DOCS_ENTITY_CATALOG,
            RagLayer.DOCS_DOCUMENT_CATALOG,
            RagLayer.DOCS_DOC_CHUNKS,
        ],
        "docs_summary_architecture": [
            RagLayer.DOCS_DOCUMENT_CATALOG,
            RagLayer.DOCS_RELATION_GRAPH,
            RagLayer.DOCS_DOC_CHUNKS,
        ],
        "docs_summary_generic": [
            RagLayer.DOCS_DOCUMENT_CATALOG,
            RagLayer.DOCS_DOC_CHUNKS,
        ],
    }

    def __init__(self) -> None:
        """Create the resolver with its own filter builder."""
        self._filters = _RouteFilterBuilder()

    def resolve(self, route: V2RouteResult) -> RetrievalPlan:
        """Return the retrieval plan for ``route``.

        General QA and file lookups use fixed profiles; every other route
        gets a summary profile, with a limit of 10 for
        ``docs_api_method_explain`` and 8 otherwise.
        """
        if route.intent == V2Intent.GENERAL_QA:
            profile, limit = "general_qa_grounded_summary", 8
            layers = list(self._GENERAL_LAYERS)
            filters = self._filters.general_filters(route)
        elif route.subintent == V2Subintent.FIND_FILES:
            profile, limit = "file_lookup", 12
            layers = list(self._FIND_FILES_LAYERS)
            filters = self._filters.find_files_filters(route)
        else:
            profile = self._summary_profile(route)
            layers = list(self._SUMMARY_LAYERS[profile])
            limit = 10 if profile == "docs_api_method_explain" else 8
            filters = self._filters.summary_filters(route)
        return RetrievalPlan(profile=profile, layers=layers, limit=limit, filters=filters)

    def _summary_profile(self, route: V2RouteResult) -> str:
        """Pick the summary profile name for ``route``.

        API-method explanations win outright. Otherwise the profile follows
        the single anchor signal left after ignoring ``FIND_FILES``; zero or
        several signals give the generic profile.
        """
        if _is_api_method_explain(route):
            return "docs_api_method_explain"
        meaningful = anchor_signal_types(route) - {V2AnchorType.FIND_FILES}
        if len(meaningful) != 1:
            return "docs_summary_generic"
        (only_signal,) = meaningful
        profile_by_signal = {
            V2AnchorType.API_ENDPOINT: "docs_summary_api_endpoint",
            V2AnchorType.ARCHITECTURE: "docs_summary_architecture",
            V2AnchorType.LOGIC_FLOW: "docs_summary_logic_flow",
            V2AnchorType.DOMAIN_ENTITY: "docs_summary_domain_entity",
        }
        return profile_by_signal.get(only_signal, "docs_summary_generic")
def _prefixes_from_paths(paths: list[str]) -> list[str]:
    """Return the unique parent-directory prefixes (ending in ``/``) of ``paths``.

    Each path is stripped of whitespace and of leading/trailing slashes;
    values left without any ``/`` have no parent directory and are ignored.
    """
    normalized = (str(path).strip().strip("/") for path in paths)
    parents = [value.rsplit("/", 1)[0] + "/" for value in normalized if "/" in value]
    return _unique_terms(parents)
def _unique_terms(items: list[str]) -> list[str]:
seen: set[str] = set()
unique: list[str] = []
for raw in items:
value = str(raw or "").strip()
if not value or value in seen:
continue
seen.add(value)
unique.append(value)
return unique
def _is_api_method_explain(route: V2RouteResult) -> bool:
    """Tell whether a ``SUMMARY`` route is asking about an API method.

    True when the route has endpoint paths, API-like doc hints, or an
    ``API_ENDPOINT`` anchor signal. Any other subintent is never a match.
    """
    if route.subintent != V2Subintent.SUMMARY:
        return False
    anchors = route.anchors
    if anchors.endpoint_paths or _has_api_like_hints(anchors.target_doc_hints):
        return True
    return V2AnchorType.API_ENDPOINT in anchor_signal_types(route)
def _has_api_like_hints(hints: list[str]) -> bool:
for hint in hints:
value = str(hint or "").strip().lower()
if not value:
continue
if value.startswith("/"):
return True
if value.startswith(("docs/api/", "docs/endpoints/", "docs/methods/")):
return True
if "endpoint" in value or "method" in value:
return True
return False
@@ -1,18 +1,23 @@
"""Адаптер v2 к :class:`RagSessionRetriever` для подстановки в тестах."""
"""Адаптер v2 к :class:`RagSessionRetriever` с plan-driven execution strategy."""
from __future__ import annotations
from app.core.agent.processes.v2.retrieval.target_doc_seeding import (
merge_row_lists,
normalize_doc_path,
path_variants_for_rag_query,
)
from app.core.rag.retrieval.session_retriever import RagSessionRetriever, RetrievalPlan
class V2RagRetrievalAdapter:
"""Обёртка над :class:`RagSessionRetriever` для подмены в тестах."""
class _PlanDrivenRetrieval:
def __init__(self, retriever: RagSessionRetriever) -> None:
    """Keep a reference to the session retriever that every fetch delegates to."""
    self._retriever = retriever
async def fetch_rows(self, rag_session_id: str, query_text: str, plan: RetrievalPlan) -> list[dict]:
    """Fetch rows for ``plan``: hint-seeded rows merged with semantic results.

    Rows for the plan's ``target_doc_hints`` are fetched first, then the
    regular retrieval runs, and ``merge_row_lists`` combines the two with the
    seeded rows passed first. The former unconditional early ``return`` of
    the plain retrieval result is removed; it made the seeding and the merge
    unreachable.
    """
    seeded_rows = await self._seed_from_target_hints(rag_session_id, plan)
    semantic_rows = await self._retriever.retrieve(rag_session_id, query_text, plan)
    return merge_row_lists(seeded_rows, semantic_rows)
async def fetch_exact_paths(self, rag_session_id: str, *, paths: list[str], layers: list[str] | None = None) -> list[dict]:
    """Delegate to the wrapped retriever's ``retrieve_exact_files`` for the given paths and layers."""
    return await self._retriever.retrieve_exact_files(rag_session_id, paths=paths, layers=layers)
@@ -31,3 +36,73 @@ class V2RagRetrievalAdapter:
layers=layers,
limit=limit,
)
async def _seed_from_target_hints(self, rag_session_id: str, plan: RetrievalPlan) -> list[dict]:
hints = self._target_doc_hints(plan)
if not hints:
return []
exact_rows = await self._fetch_exact_rows(rag_session_id, hints)
missing = self._missing_hints(hints, exact_rows)
if not missing:
return exact_rows
fallback_rows = await self._fetch_substring_rows(rag_session_id, missing)
return merge_row_lists(exact_rows, fallback_rows)
async def _fetch_exact_rows(self, rag_session_id: str, hints: list[str]) -> list[dict]:
variant_paths: list[str] = []
for hint in hints:
variant_paths.extend(path_variants_for_rag_query(hint))
unique_paths = list(dict.fromkeys(path for path in variant_paths if path))
if not unique_paths:
return []
return await self._retriever.retrieve_exact_files(rag_session_id, paths=unique_paths, layers=None)
async def _fetch_substring_rows(self, rag_session_id: str, hints: list[str]) -> list[dict]:
needles = [normalize_doc_path(hint).split("/")[-1] for hint in hints]
unique_needles = list(dict.fromkeys(needle for needle in needles if needle))
if not unique_needles:
return []
return await self._retriever.retrieve_chunks_by_path_substrings(
rag_session_id,
path_needles=unique_needles,
layers=None,
limit=200,
)
def _target_doc_hints(self, plan: RetrievalPlan) -> list[str]:
raw = plan.filters.get("target_doc_hints")
if not isinstance(raw, list):
return []
return [str(item).strip() for item in raw if str(item or "").strip()]
def _missing_hints(self, hints: list[str], rows: list[dict]) -> list[str]:
pool = {normalize_doc_path(str(row.get("path") or "")) for row in rows}
return [hint for hint in hints if normalize_doc_path(hint) not in pool]
class V2RagRetrievalAdapter:
    """Wrapper around :class:`RagSessionRetriever` for plan-driven retrieval; also a seam for test doubles."""

    def __init__(self, retriever: RagSessionRetriever) -> None:
        """Wrap ``retriever`` in the plan-driven retrieval strategy."""
        self._retriever = _PlanDrivenRetrieval(retriever)

    async def fetch_rows(self, rag_session_id: str, query_text: str, plan: RetrievalPlan) -> list[dict]:
        """Return the rows the plan-driven strategy produces for ``plan``."""
        strategy = self._retriever
        return await strategy.fetch_rows(rag_session_id, query_text, plan)

    async def fetch_exact_paths(self, rag_session_id: str, *, paths: list[str], layers: list[str] | None = None) -> list[dict]:
        """Return rows for the exact ``paths``, optionally limited to ``layers``."""
        strategy = self._retriever
        return await strategy.fetch_exact_paths(rag_session_id, paths=paths, layers=layers)

    async def fetch_chunks_by_path_substrings(
        self,
        rag_session_id: str,
        *,
        path_needles: list[str],
        layers: list[str] | None = None,
        limit: int = 200,
    ) -> list[dict]:
        """Return chunks whose path contains any of ``path_needles``."""
        strategy = self._retriever
        return await strategy.fetch_chunks_by_path_substrings(
            rag_session_id,
            path_needles=path_needles,
            layers=layers,
            limit=limit,
        )
@@ -1,20 +1,24 @@
from __future__ import annotations
import logging
import yaml
from app.core.rag.indexing.docs.chunkers.markdown_chunker import SectionChunk
from app.core.rag.indexing.docs.models import IntegrationRecord
LOGGER = logging.getLogger(__name__)
class DocsIntegrationExtractor:
_SECTION_TITLES = {"integrations", "интеграции"}
def extract(self, sections: list[SectionChunk]) -> list[IntegrationRecord]:
def extract(self, sections: list[SectionChunk], *, path: str = "") -> list[IntegrationRecord]:
records: list[IntegrationRecord] = []
for section in sections:
if not self._is_integration_section(section.section_path):
continue
payload = self._payload(section.content)
payload = self._payload(section.content, path=path, section_path=section.section_path)
target = str(payload.get("target") or "").strip()
if not target:
continue
@@ -40,7 +44,7 @@ class DocsIntegrationExtractor:
parts = [item.strip().lower() for item in section_path.split(" > ") if item.strip()]
return any(part in self._SECTION_TITLES for part in parts[:-1]) or (parts and parts[-1] in self._SECTION_TITLES)
def _payload(self, text: str) -> dict:
def _payload(self, text: str, *, path: str, section_path: str) -> dict:
payload: dict = {}
details_lines: list[str] = []
collecting_details = False
@@ -61,15 +65,27 @@ class DocsIntegrationExtractor:
collecting_details = True
details_lines = []
if value:
payload[key] = self._yaml_value(value)
payload[key] = self._yaml_value(
value,
path=path,
section_path=section_path,
field_name=key,
fallback="",
)
continue
collecting_details = False
payload[key] = self._yaml_value(value)
payload[key] = self._yaml_value(
value,
path=path,
section_path=section_path,
field_name=key,
fallback=value,
)
if details_lines:
payload["details"] = self._details_payload(details_lines)
payload["details"] = self._details_payload(details_lines, path=path, section_path=section_path)
return payload
def _details_payload(self, lines: list[str]) -> dict:
def _details_payload(self, lines: list[str], *, path: str, section_path: str) -> dict:
normalized: list[str] = []
for raw_line in lines:
line = raw_line[2:] if raw_line.startswith(" ") else raw_line
@@ -78,7 +94,13 @@ class DocsIntegrationExtractor:
if indent == 0 and stripped.startswith("- "):
stripped = stripped[2:]
normalized.append((" " * indent) + stripped)
payload = yaml.safe_load("\n".join(normalized)) or {}
payload = self._yaml_value(
"\n".join(normalized),
path=path,
section_path=section_path,
field_name="details",
fallback={},
) or {}
return payload if isinstance(payload, dict) else {}
def _split_key_value(self, text: str) -> tuple[str, str]:
@@ -87,7 +109,17 @@ class DocsIntegrationExtractor:
key, value = text.split(":", 1)
return key.strip(), value.strip()
def _yaml_value(self, value: str):
def _yaml_value(self, value: str, *, path: str, section_path: str, field_name: str, fallback):
if not value:
return ""
return yaml.safe_load(value)
try:
return yaml.safe_load(value)
except yaml.YAMLError as exc:
LOGGER.warning(
"docs integration parse warning: path=%s section=%s field=%s reason=%s",
path or "<unknown>",
section_path,
field_name,
exc.__class__.__name__,
)
return fallback
+42 -5
View File
@@ -1,5 +1,8 @@
from __future__ import annotations
import logging
from collections.abc import Callable
from app.core.rag.contracts import RagDocument, RagSource
from app.core.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker
from app.core.rag.indexing.docs.classifier import DocsClassifier
@@ -15,6 +18,8 @@ from app.core.rag.indexing.docs.relation_extractor import DocsRelationExtractor
from app.core.rag.indexing.docs.support_layer_builder import DocsSupportLayerBuilder
from app.core.rag.indexing.docs.workflow_extractor import DocsWorkflowExtractor
LOGGER = logging.getLogger(__name__)
class DocsIndexingPipeline:
def __init__(self) -> None:
@@ -59,7 +64,11 @@ class DocsIndexingPipeline:
for section in sections:
docs.append(self._builder.build_doc_chunk(source, section, parsed.frontmatter, doc_kind))
document_id = frontmatter_view.document_id or source.path
for fact in self._facts.extract(parsed.frontmatter, sections):
for fact in self._safe_extract(
extractor_name="fact_extractor",
path=path,
run=lambda: self._facts.extract(parsed.frontmatter, sections),
):
docs.append(
self._support_builder.build_fact(
source,
@@ -72,13 +81,29 @@ class DocsIndexingPipeline:
subdomain=frontmatter_view.subdomain,
)
)
for entity in self._entities.extract(parsed.frontmatter):
for entity in self._safe_extract(
extractor_name="entity_extractor",
path=path,
run=lambda: self._entities.extract(parsed.frontmatter),
):
docs.append(self._builder.build_entity_record(source, parsed.frontmatter, entity))
for workflow in self._workflows.extract(parsed.detail_sections):
for workflow in self._safe_extract(
extractor_name="workflow_extractor",
path=path,
run=lambda: self._workflows.extract(parsed.detail_sections),
):
docs.append(self._support_builder.build_workflow_record(source, parsed.frontmatter, workflow))
for edge in self._relations.extract(parsed.frontmatter, source_id=document_id):
for edge in self._safe_extract(
extractor_name="relation_extractor",
path=path,
run=lambda: self._relations.extract(parsed.frontmatter, source_id=document_id),
):
docs.append(self._support_builder.build_relation_record(source, parsed.frontmatter, edge))
for integration in self._integrations.extract(sections):
for integration in self._safe_extract(
extractor_name="integration_extractor",
path=path,
run=lambda: self._integrations.extract(sections, path=path),
):
docs.append(self._support_builder.build_integration_record(source, parsed.frontmatter, integration))
return docs
@@ -86,3 +111,15 @@ class DocsIndexingPipeline:
tail = path.rsplit("/", 1)[-1]
stem = tail.rsplit(".", 1)[0]
return stem.replace("-", " ").replace("_", " ").strip().title()
def _safe_extract(self, *, extractor_name: str, path: str, run: Callable[[], list]) -> list:
try:
return run()
except Exception as exc:
LOGGER.warning(
"docs pipeline extractor warning: path=%s extractor=%s reason=%s",
path,
extractor_name,
exc.__class__.__name__,
)
return []
@@ -25,6 +25,8 @@ class RagQueryRepository:
exclude_like_patterns: list[str] | None = None,
prefer_path_prefixes: list[str] | None = None,
prefer_like_patterns: list[str] | None = None,
metadata_domain: str | None = None,
metadata_subdomain: str | None = None,
prefer_non_tests: bool = False,
) -> list[dict]:
sql, params = self._builder.build_retrieve(
@@ -38,6 +40,8 @@ class RagQueryRepository:
exclude_like_patterns=exclude_like_patterns,
prefer_path_prefixes=prefer_path_prefixes,
prefer_like_patterns=prefer_like_patterns,
metadata_domain=metadata_domain,
metadata_subdomain=metadata_subdomain,
prefer_non_tests=prefer_non_tests,
)
with get_engine().connect() as conn:
@@ -234,6 +238,54 @@ class RagQueryRepository:
rows = conn.execute(stmt, params).mappings().fetchall()
return [self._row_to_dict(row) for row in rows]
def retrieve_chunks_by_path_substrings(
    self,
    rag_session_id: str,
    *,
    path_needles: list[str],
    layers: list[str] | None = None,
    limit: int = 200,
) -> list[dict]:
    """Return chunks of a session whose lower-cased path contains any needle.

    Needles are trimmed and lower-cased; blank ones are dropped and an empty
    needle list short-circuits to ``[]`` without touching the database. Rows
    can optionally be limited to ``layers`` and are ordered by path, span
    start and chunk index. The ranking columns are returned as constant
    zeros. ``limit`` is clamped to at least 1.
    """
    needles = [str(item).strip().lower() for item in path_needles if str(item).strip()]
    if not needles:
        return []
    params: dict = {"sid": rag_session_id, "lim": max(1, int(limit))}
    like_clauses: list[str] = []
    for idx, needle in enumerate(needles):
        # One bound parameter per needle; values never enter the SQL text.
        params[f"needle_{idx}"] = f"%{needle}%"
        like_clauses.append(f"lower(path) LIKE :needle_{idx}")
    conditions = ["rag_session_id = :sid", "(" + " OR ".join(like_clauses) + ")"]
    wanted_layers = [str(item).strip() for item in layers or [] if str(item).strip()]
    if wanted_layers:
        params["layers"] = wanted_layers
        conditions.append("layer IN :layers")
    stmt = text(
        f"""
        SELECT path, content, layer, title, metadata_json, span_start, span_end,
        0 AS lexical_rank,
        0 AS prefer_bonus,
        0 AS test_penalty,
        0 AS structural_rank,
        0 AS layer_rank,
        0 AS distance
        FROM rag_chunks
        WHERE {' AND '.join(conditions)}
        ORDER BY path ASC, COALESCE(span_start, 0) ASC, COALESCE(chunk_index, 0) ASC
        LIMIT :lim
        """
    )
    if "layers" in params:
        # The IN list is rendered from the bound list at execution time.
        stmt = stmt.bindparams(bindparam("layers", expanding=True))
    with get_engine().connect() as conn:
        rows = conn.execute(stmt, params).mappings().fetchall()
    return [self._row_to_dict(row) for row in rows]
def _row_to_dict(self, row) -> dict:
data = dict(row)
raw_metadata = data.pop("metadata_json")
@@ -69,6 +69,8 @@ class RagRepository:
exclude_like_patterns: list[str] | None = None,
prefer_path_prefixes: list[str] | None = None,
prefer_like_patterns: list[str] | None = None,
metadata_domain: str | None = None,
metadata_subdomain: str | None = None,
prefer_non_tests: bool = False,
) -> list[dict]:
return self._query.retrieve(
@@ -82,6 +84,8 @@ class RagRepository:
exclude_like_patterns=exclude_like_patterns,
prefer_path_prefixes=prefer_path_prefixes,
prefer_like_patterns=prefer_like_patterns,
metadata_domain=metadata_domain,
metadata_subdomain=metadata_subdomain,
prefer_non_tests=prefer_non_tests,
)
@@ -141,3 +145,18 @@ class RagRepository:
layers=layers,
limit=limit,
)
def retrieve_chunks_by_path_substrings(
    self,
    rag_session_id: str,
    *,
    path_needles: list[str],
    layers: list[str] | None = None,
    limit: int = 200,
) -> list[dict]:
    """Forward the path-substring chunk lookup to the query repository unchanged."""
    lookup = self._query.retrieve_chunks_by_path_substrings
    return lookup(
        rag_session_id,
        path_needles=path_needles,
        layers=layers,
        limit=limit,
    )
@@ -19,6 +19,8 @@ class RetrievalStatementBuilder:
exclude_like_patterns: list[str] | None = None,
prefer_path_prefixes: list[str] | None = None,
prefer_like_patterns: list[str] | None = None,
metadata_domain: str | None = None,
metadata_subdomain: str | None = None,
prefer_non_tests: bool = False,
) -> tuple[str, dict]:
emb = "[" + ",".join(str(x) for x in query_embedding) + "]"
@@ -29,6 +31,8 @@ class RetrievalStatementBuilder:
self._append_prefix_group(filters, params, "path", path_prefixes)
self._append_prefix_group(filters, params, "exclude_prefix", exclude_path_prefixes, negate=True)
self._append_like_group(filters, params, "exclude_like", exclude_like_patterns, negate=True)
self._append_metadata_equals(filters, params, "metadata_domain", "domain", metadata_domain)
self._append_metadata_equals(filters, params, "metadata_subdomain", "subdomain", metadata_subdomain)
if layers:
filters.append("layer = ANY(:layers)")
params["layers"] = layers
@@ -202,6 +206,20 @@ class RetrievalStatementBuilder:
joined = " OR ".join(parts)
filters.append(f"NOT ({joined})" if negate else f"({joined})")
def _append_metadata_equals(
self,
filters: list[str],
params: dict,
param_key: str,
metadata_key: str,
value: str | None,
) -> None:
normalized = str(value or "").strip().lower()
if not normalized:
return
params[param_key] = normalized
filters.append(f"lower(COALESCE({self._metadata_text(metadata_key)}, '')) = :{param_key}")
def _test_penalty_sql(
self,
enabled: bool,
@@ -94,4 +94,8 @@ class RagSessionRetriever:
for key in keys:
if key in filters:
out[key] = filters[key]
if "metadata.domain" in filters:
out["metadata_domain"] = filters["metadata.domain"]
if "metadata.subdomain" in filters:
out["metadata_subdomain"] = filters["metadata.subdomain"]
return out