фиксирую состояние
This commit is contained in:
@@ -4,17 +4,17 @@ from app.core.agent.processes.v2.models import V2AnchorType, V2RouteAnchors, V2R
|
||||
|
||||
|
||||
def anchor_signal_types(route: V2RouteResult) -> set[str]:
    """Collect the anchor signal types that apply to *route*.

    Signals are derived from the route's subintent, its explicit endpoint
    paths and entity names, and substring markers found in the lower-cased
    anchor texts returned by ``_signal_texts`` (doc hints, file names and
    matched aliases).

    Only the marker-based (``_has_any``) conditions are kept: the earlier
    hint-only checks that were left beside them are removed, together with
    the ``hints`` list that nothing else used.
    """
    anchors = route.anchors
    texts = _signal_texts(route)
    signals: set[str] = set()
    if route.subintent == V2Subintent.FIND_FILES:
        signals.add(V2AnchorType.FIND_FILES)
    if anchors.endpoint_paths or _has_any(texts, ("/api/", "api", "endpoint")):
        signals.add(V2AnchorType.API_ENDPOINT)
    if _has_any(texts, ("/architecture/", "architecture", "arch")):
        signals.add(V2AnchorType.ARCHITECTURE)
    if _has_any(texts, ("/logic/", "logic", "workflow", "flow", "process")):
        signals.add(V2AnchorType.LOGIC_FLOW)
    if anchors.entity_names or _has_any(texts, ("/domains/", "domain", "entity", "component")):
        signals.add(V2AnchorType.DOMAIN_ENTITY)
    return signals
|
||||
|
||||
@@ -44,5 +44,14 @@ def anchors_have_signal(anchors: V2RouteAnchors, signal: str, *, subintent: str
|
||||
return signal in anchor_signal_types(route)
|
||||
|
||||
|
||||
def _has_hint(hints: list[str], marker: str) -> bool:
|
||||
return any(marker in hint for hint in hints)
|
||||
def _signal_texts(route: V2RouteResult) -> list[str]:
|
||||
items = [
|
||||
*route.anchors.target_doc_hints,
|
||||
*route.anchors.file_names,
|
||||
*route.anchors.matched_aliases,
|
||||
]
|
||||
return [str(item).strip().lower() for item in items if str(item or "").strip()]
|
||||
|
||||
|
||||
def _has_any(items: list[str], markers: tuple[str, ...]) -> bool:
|
||||
return any(marker in item for item in items for marker in markers)
|
||||
|
||||
@@ -11,6 +11,8 @@ from app.core.rag.contracts.enums import RagLayer
|
||||
|
||||
|
||||
class DocsEvidenceAssembler:
|
||||
_API_PATH_PREFIXES = ("docs/api/", "docs/endpoints/", "docs/methods/", "api/", "endpoints/", "methods/")
|
||||
_GENERIC_DOC_MARKERS = ("readme", "overview", "index", "navigation", "related docs", "catalog")
|
||||
def assemble_summaries(self, rows: list[dict], route: V2RouteResult) -> list[RetrievedSummary]:
|
||||
items = self._rank_rows(rows, route, mode="summary")
|
||||
ranked = [
|
||||
@@ -71,10 +73,12 @@ class DocsEvidenceAssembler:
|
||||
"score": score,
|
||||
"score_breakdown": breakdown,
|
||||
"match_reason": self._match_reason(breakdown),
|
||||
"is_generic_doc": self._is_generic_doc(path, self._title(row, path), self._summary(row), row),
|
||||
}
|
||||
)
|
||||
ranked.sort(key=lambda item: (-item["score"], item["path"]))
|
||||
return self._ensure_target_docs_in_top_k(ranked, route, k=4 if mode == "find_files" else 3)
|
||||
ranked = self._ensure_target_docs_in_top_k(ranked, route, k=4 if mode == "find_files" else 3)
|
||||
return self._promote_specific_primary(ranked, route)
|
||||
|
||||
def _score_breakdown(self, row: dict, route: V2RouteResult, *, mode: str) -> dict[str, int]:
|
||||
path_raw = self._path(row)
|
||||
@@ -93,6 +97,7 @@ class DocsEvidenceAssembler:
|
||||
"alias_match": 0,
|
||||
"anchor_boost": 0,
|
||||
"target_doc_boost": 0,
|
||||
"specificity_boost": 0,
|
||||
"generic_penalty": 0,
|
||||
}
|
||||
if route.intent == "GENERAL_QA":
|
||||
@@ -100,6 +105,7 @@ class DocsEvidenceAssembler:
|
||||
hint_norm_lower = {normalize_doc_path(h).lower() for h in route.anchors.target_doc_hints if str(h or "").strip()}
|
||||
if normalize_doc_path(path_raw).lower() in hint_norm_lower:
|
||||
breakdown["target_doc_boost"] += 1000
|
||||
hint_texts = [str(hint or "").strip().lower() for hint in route.anchors.target_doc_hints if str(hint or "").strip()]
|
||||
if any(alias.lower() in " ".join([path, title, summary, entity]) for alias in route.anchors.matched_aliases):
|
||||
breakdown["alias_match"] += 500
|
||||
for token in query_tokens:
|
||||
@@ -111,10 +117,25 @@ class DocsEvidenceAssembler:
|
||||
breakdown["semantic"] += 20
|
||||
if self._compact(token) in compact_haystack:
|
||||
breakdown["alias_match"] += 250
|
||||
for hint in hint_texts:
|
||||
compact_hint = self._compact(hint)
|
||||
if compact_hint and compact_hint in compact_haystack:
|
||||
breakdown["target_doc_boost"] += 180
|
||||
elif hint and hint.strip("/") in " ".join([path, title, summary, entity]):
|
||||
breakdown["semantic"] += 70
|
||||
endpoint_text = self._summary(row).lower()
|
||||
for endpoint in route.anchors.endpoint_paths:
|
||||
normalized_endpoint = endpoint.strip().lower()
|
||||
endpoint_slug = normalized_endpoint.strip("/")
|
||||
if normalized_endpoint and normalized_endpoint in endpoint_text:
|
||||
breakdown["target_doc_boost"] += 260
|
||||
if endpoint_slug and endpoint_slug in filename:
|
||||
breakdown["filename_match"] += 200
|
||||
if any(endpoint.strip("/").lower() in filename for endpoint in route.anchors.endpoint_paths):
|
||||
breakdown["filename_match"] += 200
|
||||
signals = anchor_signal_types(route)
|
||||
breakdown["anchor_boost"] += self._anchor_boost(path, signals)
|
||||
breakdown["specificity_boost"] += self._specificity_boost(row, path, title, summary, route)
|
||||
breakdown["generic_penalty"] += self._generic_penalty(path, signals)
|
||||
if mode == "find_files":
|
||||
breakdown["path_match"] *= 3
|
||||
@@ -125,8 +146,8 @@ class DocsEvidenceAssembler:
|
||||
|
||||
def _anchor_boost(self, path: str, signals: set[str]) -> int:
|
||||
boost = 0
|
||||
if V2AnchorType.API_ENDPOINT in signals and path.startswith("docs/api/"):
|
||||
boost += 300
|
||||
if V2AnchorType.API_ENDPOINT in signals and path.startswith(self._API_PATH_PREFIXES):
|
||||
boost += 360
|
||||
if V2AnchorType.LOGIC_FLOW in signals and path.startswith("docs/logic/"):
|
||||
boost += 300
|
||||
if V2AnchorType.DOMAIN_ENTITY in signals and path.startswith("docs/domains/"):
|
||||
@@ -139,8 +160,11 @@ class DocsEvidenceAssembler:
|
||||
|
||||
def _generic_penalty(self, path: str, signals: set[str]) -> int:
|
||||
penalty = 0
|
||||
lowered = path.lower()
|
||||
if path == "docs/README.md" and V2AnchorType.ARCHITECTURE not in signals:
|
||||
penalty -= 200
|
||||
penalty -= 260
|
||||
if any(marker in lowered for marker in ("/readme", "readme.md", "/index", "/overview", "/catalog", "/navigation")):
|
||||
penalty -= 220
|
||||
if "/architecture/" in path and V2AnchorType.ARCHITECTURE not in signals and signals.intersection(
|
||||
{V2AnchorType.API_ENDPOINT, V2AnchorType.DOMAIN_ENTITY}
|
||||
):
|
||||
@@ -173,6 +197,17 @@ class DocsEvidenceAssembler:
|
||||
top.sort(key=lambda item: (-item["score"], item["path"]))
|
||||
return top + remaining
|
||||
|
||||
def _promote_specific_primary(self, ranked: list[dict], route: V2RouteResult) -> list[dict]:
|
||||
if len(ranked) < 2:
|
||||
return ranked
|
||||
first = ranked[0]
|
||||
if not first.get("is_generic_doc"):
|
||||
return ranked
|
||||
promoted = next((item for item in ranked[1:] if not item.get("is_generic_doc") and self._is_specific_candidate(item, route)), None)
|
||||
if promoted is None:
|
||||
return ranked
|
||||
return [promoted] + [item for item in ranked if item["path"] != promoted["path"]]
|
||||
|
||||
def _match_reason(self, breakdown: dict[str, int]) -> str:
|
||||
if breakdown["target_doc_boost"] > 0:
|
||||
return "exact_path"
|
||||
@@ -189,6 +224,53 @@ class DocsEvidenceAssembler:
|
||||
section = str(metadata.get("section_path") or "").lower()
|
||||
return "summary" in section or "свод" in section or "overview" in section
|
||||
|
||||
def _specificity_boost(self, row: dict, path: str, title: str, summary: str, route: V2RouteResult) -> int:
    """Return the additive score that rewards specific (non-generic) documents.

    Contributions, all cumulative:
    * 90 when ``_is_generic_doc`` rejects the document;
    * 160 when *path* starts with one of ``_API_PATH_PREFIXES``;
    * 120 when "endpoint" is in the file name or title, or "method" is in the title;
    * 80 for a ``RagLayer.DOCS_DOC_CHUNKS`` row that is not a navigation chunk;
    * per non-empty query token: 90 / 70 / 40 when it occurs in the file name,
      lower-cased title, or lower-cased summary respectively.
    """
    file_part = path.split("/")[-1]
    title_lc = title.lower()
    summary_lc = summary.lower()

    total = 0
    if not self._is_generic_doc(path, title, summary, row):
        total += 90
    if path.startswith(self._API_PATH_PREFIXES):
        total += 160
    if "endpoint" in file_part or "endpoint" in title_lc or "method" in title_lc:
        total += 120
    if row.get("layer") == RagLayer.DOCS_DOC_CHUNKS and not self._looks_like_navigation_chunk(row):
        total += 80

    weighted_fields = ((file_part, 90), (title_lc, 70), (summary_lc, 40))
    for token in self._query_tokens(route):
        if not token:
            continue
        for text, weight in weighted_fields:
            if token in text:
                total += weight
    return total
|
||||
|
||||
def _is_specific_candidate(self, item: dict, route: V2RouteResult) -> bool:
|
||||
breakdown = dict(item.get("score_breakdown") or {})
|
||||
if breakdown.get("target_doc_boost", 0) > 0:
|
||||
return True
|
||||
if breakdown.get("specificity_boost", 0) >= 160:
|
||||
return True
|
||||
return V2AnchorType.API_ENDPOINT in anchor_signal_types(route) and item["path"].startswith(self._API_PATH_PREFIXES)
|
||||
|
||||
def _is_generic_doc(self, path: str, title: str, summary: str, row: dict) -> bool:
|
||||
haystack = " ".join([path.lower(), title.lower(), summary.lower()])
|
||||
if any(marker in haystack for marker in self._GENERIC_DOC_MARKERS):
|
||||
return True
|
||||
return self._looks_like_navigation_chunk(row)
|
||||
|
||||
def _looks_like_navigation_chunk(self, row: dict) -> bool:
|
||||
text = self._summary(row).lower()
|
||||
if not text:
|
||||
return False
|
||||
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
||||
bullet_lines = sum(1 for line in lines if line.startswith(("- ", "* ", "1.", "2.", "3.")))
|
||||
link_lines = sum(1 for line in lines if "](" in line or line.startswith("docs/"))
|
||||
if "related docs" in text or "navigation" in text:
|
||||
return True
|
||||
return bullet_lines >= 3 or link_lines >= 3
|
||||
|
||||
def _query_tokens(self, route: V2RouteResult) -> list[str]:
|
||||
values = list(route.target_terms) + list(route.anchors.matched_aliases)
|
||||
tokens: list[str] = []
|
||||
|
||||
@@ -8,6 +8,7 @@ class QueryFeatures:
|
||||
normalized_query: str
|
||||
target_terms: list[str]
|
||||
endpoint_paths: list[str]
|
||||
file_names: list[str]
|
||||
matched_aliases: list[str]
|
||||
target_doc_hints: list[str]
|
||||
file_markers: list[str]
|
||||
|
||||
@@ -34,10 +34,42 @@ class _MarkerScanner:
|
||||
"где описано",
|
||||
"документ с описанием",
|
||||
)
|
||||
_ARCHITECTURE_MARKERS = ("архитектура", "как устроено приложение", "как устроен сервис", "основные части системы", "из чего состоит")
|
||||
_LOGIC_MARKERS = ("цикл", "loop", "worker", "как работает отправка уведомлений", "логика отправки", "background job", "runtime loop")
|
||||
_ARCHITECTURE_MARKERS = (
|
||||
"архитектура",
|
||||
"архитектур",
|
||||
"architecture",
|
||||
"arch overview",
|
||||
"как устроено приложение",
|
||||
"как устроен сервис",
|
||||
"основные части системы",
|
||||
"из чего состоит",
|
||||
)
|
||||
_LOGIC_MARKERS = (
|
||||
"цикл",
|
||||
"loop",
|
||||
"flow",
|
||||
"workflow",
|
||||
"process",
|
||||
"worker",
|
||||
"как работает отправка уведомлений",
|
||||
"логика отправки",
|
||||
"background job",
|
||||
"runtime loop",
|
||||
)
|
||||
_DOMAIN_MARKERS = ("runtime health", "health model", "статусы здоровья", "сущность", "entity", "здоровье runtime")
|
||||
_ENDPOINT_MARKERS = ("endpoint", "метод api", "ручка", "эндпоинт")
|
||||
_ENDPOINT_MARKERS = (
|
||||
"endpoint",
|
||||
"api",
|
||||
"route",
|
||||
"method",
|
||||
"метод api",
|
||||
"метод",
|
||||
"метода",
|
||||
"ручка",
|
||||
"эндпоинт",
|
||||
"маршрут",
|
||||
"роут",
|
||||
)
|
||||
|
||||
def scan(self, lowered_query: str) -> dict[str, list[str]]:
|
||||
return {
|
||||
@@ -54,12 +86,13 @@ class _MarkerScanner:
|
||||
|
||||
class _EntityNameExtractor:
|
||||
_ENTITY_RE = re.compile(r"\b[A-Z][A-Za-z0-9_]+\b")
|
||||
_IGNORE = {"arch"}
|
||||
|
||||
def extract(self, query: str) -> list[str]:
|
||||
items: list[str] = []
|
||||
for match in self._ENTITY_RE.finditer(query):
|
||||
candidate = match.group(0).strip()
|
||||
if candidate and candidate not in items:
|
||||
if candidate and candidate.lower() not in self._IGNORE and candidate not in items:
|
||||
items.append(candidate)
|
||||
return items
|
||||
|
||||
@@ -92,33 +125,61 @@ class _FileNameExtractor:
|
||||
items.append(value)
|
||||
|
||||
|
||||
class _ProcessAnchorExtractor:
|
||||
_DOMAIN_KEYWORDS = {
|
||||
"billing": "billing",
|
||||
"notifications": "notifications",
|
||||
}
|
||||
_SUBDOMAIN_KEYWORDS = {
|
||||
"invoice": ("billing", "invoice"),
|
||||
"invoices": ("billing", "invoice"),
|
||||
"delivery_loop": ("notifications", "delivery_loop"),
|
||||
"delivery": ("notifications", "delivery_loop"),
|
||||
}
|
||||
|
||||
def extract(self, lowered_query: str) -> tuple[str | None, str | None]:
|
||||
domain = next((value for token, value in self._DOMAIN_KEYWORDS.items() if token in lowered_query), None)
|
||||
subdomain: str | None = None
|
||||
for token, mapping in self._SUBDOMAIN_KEYWORDS.items():
|
||||
if token in lowered_query:
|
||||
domain = domain or mapping[0]
|
||||
subdomain = mapping[1]
|
||||
break
|
||||
return domain, subdomain
|
||||
|
||||
|
||||
class V2AnchorExtractor:
|
||||
def __init__(
    self,
    marker_scanner: _MarkerScanner | None = None,
    entity_extractor: _EntityNameExtractor | None = None,
    file_name_extractor: _FileNameExtractor | None = None,
    process_anchor_extractor: _ProcessAnchorExtractor | None = None,
) -> None:
    """Store the collaborating extractors, creating a default for each one not supplied."""
    # Truthiness (not an ``is None`` test) decides whether a default is built,
    # so any falsy argument is replaced as well.
    if not marker_scanner:
        marker_scanner = _MarkerScanner()
    self._marker_scanner = marker_scanner

    if not entity_extractor:
        entity_extractor = _EntityNameExtractor()
    self._entity_extractor = entity_extractor

    if not file_name_extractor:
        file_name_extractor = _FileNameExtractor()
    self._file_name_extractor = file_name_extractor

    if not process_anchor_extractor:
        process_anchor_extractor = _ProcessAnchorExtractor()
    self._process_anchor_extractor = process_anchor_extractor
|
||||
|
||||
def extract(self, normalized_query: str, terms: TargetTermsAnalysis) -> AnchorAnalysis:
|
||||
markers = self._marker_scanner.scan(normalized_query.lower())
|
||||
lowered_query = normalized_query.lower()
|
||||
markers = self._marker_scanner.scan(lowered_query)
|
||||
process_domain, process_subdomain = self._process_anchor_extractor.extract(lowered_query)
|
||||
anchors = V2RouteAnchors(
|
||||
entity_names=self._entity_extractor.extract(normalized_query),
|
||||
file_names=self._file_name_extractor.extract(normalized_query),
|
||||
endpoint_paths=list(terms.endpoint_paths),
|
||||
target_doc_hints=self._target_doc_hints(
|
||||
endpoint_paths=terms.endpoint_paths,
|
||||
api_like_terms=terms.api_like_terms,
|
||||
alias_docs=terms.alias_docs,
|
||||
architecture_markers=markers["architecture_markers"],
|
||||
logic_markers=markers["logic_markers"],
|
||||
domain_markers=markers["domain_markers"],
|
||||
),
|
||||
matched_aliases=list(terms.matched_aliases),
|
||||
process_domain=None,
|
||||
process_subdomain=None,
|
||||
process_domain=process_domain,
|
||||
process_subdomain=process_subdomain,
|
||||
)
|
||||
return AnchorAnalysis(
|
||||
anchors=anchors,
|
||||
@@ -133,6 +194,7 @@ class V2AnchorExtractor:
|
||||
self,
|
||||
*,
|
||||
endpoint_paths: list[str],
|
||||
api_like_terms: list[str],
|
||||
alias_docs: list[str],
|
||||
architecture_markers: list[str],
|
||||
logic_markers: list[str],
|
||||
@@ -145,13 +207,41 @@ class V2AnchorExtractor:
|
||||
"/actions/{action}": "docs/api/control-actions-endpoint.md",
|
||||
}
|
||||
for endpoint in endpoint_paths:
|
||||
for hint in self._endpoint_hint_variants(endpoint):
|
||||
self._append_unique(hints, hint)
|
||||
hint = endpoint_map.get(endpoint)
|
||||
if hint and hint not in hints:
|
||||
hints.append(hint)
|
||||
if architecture_markers and "docs/architecture/telegram-notify-app-overview.md" not in hints:
|
||||
hints.append("docs/architecture/telegram-notify-app-overview.md")
|
||||
if logic_markers and "docs/logic/telegram-notification-loop.md" not in hints:
|
||||
hints.append("docs/logic/telegram-notification-loop.md")
|
||||
if domain_markers and "docs/domains/runtime-health-entity.md" not in hints:
|
||||
hints.append("docs/domains/runtime-health-entity.md")
|
||||
self._append_unique(hints, hint)
|
||||
for term in api_like_terms:
|
||||
for hint in self._api_like_hint_variants(term):
|
||||
self._append_unique(hints, hint)
|
||||
if architecture_markers:
|
||||
self._append_unique(hints, "docs/architecture/telegram-notify-app-overview.md")
|
||||
if logic_markers:
|
||||
self._append_unique(hints, "docs/logic/telegram-notification-loop.md")
|
||||
if domain_markers:
|
||||
self._append_unique(hints, "docs/domains/runtime-health-entity.md")
|
||||
return hints
|
||||
|
||||
def _endpoint_hint_variants(self, endpoint: str) -> list[str]:
|
||||
normalized = str(endpoint or "").strip().lower()
|
||||
if not normalized:
|
||||
return []
|
||||
slug = normalized.strip("/").replace("/", "-").replace("{", "").replace("}", "")
|
||||
leaf = next((part for part in reversed(slug.split("-")) if part and part != "id"), "")
|
||||
hints: list[str] = [normalized]
|
||||
for value in (slug, leaf):
|
||||
if not value:
|
||||
continue
|
||||
hints.extend([value, f"{value}-endpoint", f"{value} endpoint"])
|
||||
return list(dict.fromkeys(hints))
|
||||
|
||||
def _api_like_hint_variants(self, term: str) -> list[str]:
|
||||
normalized = str(term or "").strip().lower().lstrip("/")
|
||||
if not normalized:
|
||||
return []
|
||||
return [normalized, f"/{normalized}", f"{normalized}-endpoint", f"{normalized} endpoint"]
|
||||
|
||||
def _append_unique(self, items: list[str], value: str | None) -> None:
|
||||
normalized = str(value or "").strip()
|
||||
if normalized and normalized not in items:
|
||||
items.append(normalized)
|
||||
|
||||
@@ -8,6 +8,7 @@ from dataclasses import dataclass
|
||||
class TargetTermsAnalysis:
|
||||
target_terms: list[str]
|
||||
endpoint_paths: list[str]
|
||||
api_like_terms: list[str]
|
||||
matched_aliases: list[str]
|
||||
alias_docs: list[str]
|
||||
|
||||
@@ -26,7 +27,7 @@ class _AliasMatcher:
|
||||
_AliasRule(("control actions", "управление runtime"), "/actions/{action}", "docs/api/control-actions-endpoint.md"),
|
||||
_AliasRule(("runtime health", "здоровье runtime", "статусы здоровья"), "runtime_health", "docs/domains/runtime-health-entity.md"),
|
||||
_AliasRule(("цикл отправки уведомлений", "notification loop", "worker loop"), "telegram-notify-loop", "docs/logic/telegram-notification-loop.md"),
|
||||
_AliasRule(("архитектура приложения", "overview"), "architecture_overview", "docs/architecture/telegram-notify-app-overview.md"),
|
||||
_AliasRule(("архитектура приложения",), "architecture_overview", "docs/architecture/telegram-notify-app-overview.md"),
|
||||
_AliasRule(("архитектура",), "architecture_overview", "docs/architecture/telegram-notify-app-overview.md"),
|
||||
_AliasRule(("каталог ошибок", "errors catalog"), "errors_catalog", "docs/errors/catalog.yaml"),
|
||||
_AliasRule(("файл-индекс документации", "docs index", "индекс документации"), "docs_index", "docs/README.md"),
|
||||
@@ -51,6 +52,7 @@ class _AliasMatcher:
|
||||
class _EndpointPathExtractor:
|
||||
_PATH_RE = re.compile(r"`([^`]+)`|(/[A-Za-z0-9_./{}-]+)")
|
||||
_VALID_ENDPOINT_RE = re.compile(r"^/[a-z0-9._/-]+(?:/\{[a-z0-9_]+\})?$")
|
||||
_DOC_EXTENSIONS = (".md", ".yaml", ".yml", ".json")
|
||||
|
||||
def extract(self, query: str) -> list[str]:
|
||||
values: list[str] = []
|
||||
@@ -68,28 +70,161 @@ class _EndpointPathExtractor:
|
||||
return trimmed.lower()
|
||||
|
||||
def _is_endpoint(self, token: str) -> bool:
|
||||
return bool(token and self._VALID_ENDPOINT_RE.fullmatch(token))
|
||||
if not token or not self._VALID_ENDPOINT_RE.fullmatch(token):
|
||||
return False
|
||||
return not token.endswith(self._DOC_EXTENSIONS)
|
||||
|
||||
def _append_unique(self, items: list[str], value: str) -> None:
|
||||
if value and value not in items:
|
||||
items.append(value)
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class _ApiLikeAnchorAnalysis:
|
||||
endpoint_paths: list[str]
|
||||
candidate_terms: list[str]
|
||||
|
||||
|
||||
class _ApiLikeAnchorExtractor:
|
||||
_TOKEN_RE = re.compile(r"[A-Za-zА-Яа-я0-9_./{}-]+")
|
||||
_ASCII_ENDPOINT_RE = re.compile(r"^[a-z0-9]+(?:[-_][a-z0-9]+)*$")
|
||||
_API_MARKERS = {
|
||||
"api",
|
||||
"endpoint",
|
||||
"route",
|
||||
"method",
|
||||
"метод",
|
||||
"метода",
|
||||
"методу",
|
||||
"ручка",
|
||||
"ручки",
|
||||
"эндпоинт",
|
||||
"эндпоинта",
|
||||
"маршрут",
|
||||
"роут",
|
||||
}
|
||||
_EXPLAIN_MARKERS = {
|
||||
"как",
|
||||
"что",
|
||||
"делает",
|
||||
"работает",
|
||||
"объясни",
|
||||
"объяснить",
|
||||
"расскажи",
|
||||
"опиши",
|
||||
"смысл",
|
||||
}
|
||||
_NOISE_WORDS = _API_MARKERS | _EXPLAIN_MARKERS | {
|
||||
"про",
|
||||
"какой",
|
||||
"какая",
|
||||
"какие",
|
||||
"какого",
|
||||
"какую",
|
||||
"кратко",
|
||||
"нужен",
|
||||
"нужно",
|
||||
"у",
|
||||
}
|
||||
_SHORT_QUERY_TOKEN_LIMIT = 7
|
||||
|
||||
def extract(self, query: str, explicit_endpoint_paths: list[str]) -> _ApiLikeAnchorAnalysis:
|
||||
if explicit_endpoint_paths:
|
||||
return _ApiLikeAnchorAnalysis(endpoint_paths=list(explicit_endpoint_paths), candidate_terms=[])
|
||||
token_entries = self._token_entries(query)
|
||||
if not token_entries:
|
||||
return _ApiLikeAnchorAnalysis(endpoint_paths=[], candidate_terms=[])
|
||||
candidate_terms = [token for token, _start in token_entries if self._is_api_candidate(token)]
|
||||
if not candidate_terms:
|
||||
return _ApiLikeAnchorAnalysis(endpoint_paths=[], candidate_terms=[])
|
||||
if self._has_api_marker(token_entries):
|
||||
primary = self._primary_candidate(token_entries)
|
||||
endpoint_paths = [self._ensure_endpoint(primary)] if primary else []
|
||||
return _ApiLikeAnchorAnalysis(
|
||||
endpoint_paths=[path for path in endpoint_paths if path],
|
||||
candidate_terms=[primary] if primary else [],
|
||||
)
|
||||
if self._is_short_explain_query(token_entries) and len(candidate_terms) == 1:
|
||||
return _ApiLikeAnchorAnalysis(endpoint_paths=[], candidate_terms=list(candidate_terms))
|
||||
return _ApiLikeAnchorAnalysis(endpoint_paths=[], candidate_terms=[])
|
||||
|
||||
def _token_entries(self, query: str) -> list[tuple[str, int]]:
|
||||
entries: list[tuple[str, int]] = []
|
||||
for match in self._TOKEN_RE.finditer(query):
|
||||
token = str(match.group(0) or "").strip().strip("`'\"()[]!?.,:;").lower()
|
||||
if token:
|
||||
entries.append((token, match.start()))
|
||||
return entries
|
||||
|
||||
def _has_api_marker(self, token_entries: list[tuple[str, int]]) -> bool:
|
||||
return any(token in self._API_MARKERS for token, _start in token_entries)
|
||||
|
||||
def _is_short_explain_query(self, token_entries: list[tuple[str, int]]) -> bool:
|
||||
if len(token_entries) > self._SHORT_QUERY_TOKEN_LIMIT:
|
||||
return False
|
||||
return any(token in self._EXPLAIN_MARKERS for token, _start in token_entries)
|
||||
|
||||
def _primary_candidate(self, token_entries: list[tuple[str, int]]) -> str | None:
|
||||
marker_positions = [start for token, start in token_entries if token in self._API_MARKERS]
|
||||
candidates = [(token, start) for token, start in token_entries if self._is_api_candidate(token)]
|
||||
if not candidates:
|
||||
return None
|
||||
if not marker_positions:
|
||||
return candidates[-1][0]
|
||||
primary = min(
|
||||
candidates,
|
||||
key=lambda item: min(abs(item[1] - marker_pos) for marker_pos in marker_positions),
|
||||
)
|
||||
return primary[0]
|
||||
|
||||
def _is_api_candidate(self, token: str) -> bool:
|
||||
if (
|
||||
not token
|
||||
or token in self._NOISE_WORDS
|
||||
or token.startswith("docs/")
|
||||
or token.endswith((".md", ".yaml", ".yml", ".json"))
|
||||
):
|
||||
return False
|
||||
if token.startswith("/"):
|
||||
return True
|
||||
return self._ASCII_ENDPOINT_RE.fullmatch(token) is not None and len(token) >= 3
|
||||
|
||||
def _ensure_endpoint(self, token: str) -> str:
|
||||
return token if token.startswith("/") else f"/{token}"
|
||||
|
||||
|
||||
class _TermCollector:
|
||||
_TOKEN_RE = re.compile(r"[A-Za-zА-Яа-я0-9_./{}-]+")
|
||||
_IDENTIFIER_RE = re.compile(
|
||||
r"^(?:[a-z0-9]+(?:[_-][a-z0-9]+)+|[a-z]+[A-Z][A-Za-z0-9]+|(?:[A-Z][a-z0-9]+){2,})$"
|
||||
)
|
||||
_QUESTION_WORDS = {"что", "как", "где", "какой", "какие", "каком", "когда", "чего"}
|
||||
_INTENT_WORDS = {"объясни", "покажи", "найди", "расскажи", "дай", "опиши", "нужен"}
|
||||
_FILLER_WORDS = {"про", "там", "тут", "плз"}
|
||||
_INTENT_WORDS = {"объясни", "покажи", "найди", "расскажи", "дай", "опиши", "нужен", "show"}
|
||||
_FILLER_WORDS = {"про", "там", "тут", "плз", "pls", "for"}
|
||||
_MARKER_WORDS = {
|
||||
"файл",
|
||||
"файле",
|
||||
"file",
|
||||
"method",
|
||||
"метод",
|
||||
"метода",
|
||||
"методу",
|
||||
"route",
|
||||
"ручка",
|
||||
"ручки",
|
||||
"эндпоинт",
|
||||
"эндпоинта",
|
||||
"overview",
|
||||
"architecture",
|
||||
"arch",
|
||||
"flow",
|
||||
"process",
|
||||
"workflow",
|
||||
"док",
|
||||
"дока",
|
||||
"доках",
|
||||
"документ",
|
||||
"doc",
|
||||
"описан",
|
||||
"док-саммари",
|
||||
"summary",
|
||||
@@ -115,6 +250,7 @@ class _TermCollector:
|
||||
"service",
|
||||
"summary",
|
||||
"endpoint",
|
||||
"docs",
|
||||
}
|
||||
_MAX_TERMS = 7
|
||||
|
||||
@@ -191,19 +327,23 @@ class V2TargetTermsExtractor:
|
||||
self,
|
||||
alias_matcher: _AliasMatcher | None = None,
|
||||
endpoint_extractor: _EndpointPathExtractor | None = None,
|
||||
api_like_extractor: _ApiLikeAnchorExtractor | None = None,
|
||||
term_collector: _TermCollector | None = None,
|
||||
) -> None:
|
||||
self._alias_matcher = alias_matcher or _AliasMatcher()
|
||||
self._endpoint_extractor = endpoint_extractor or _EndpointPathExtractor()
|
||||
self._api_like_extractor = api_like_extractor or _ApiLikeAnchorExtractor()
|
||||
self._term_collector = term_collector or _TermCollector()
|
||||
|
||||
def extract(self, normalized_query: str) -> TargetTermsAnalysis:
    """Run every term extractor over *normalized_query* and bundle the results.

    Explicit endpoint paths are extracted first and handed to the API-like
    extractor. The endpoint paths reported in the result, and those given to
    the term collector, are the ones the API-like extractor returns. Aliases
    are matched against the lower-cased query.

    ``target_terms`` and ``endpoint_paths`` are passed exactly once: the
    earlier values built from the raw endpoint list duplicated those keyword
    arguments, which Python rejects as a syntax error.
    """
    lowered = normalized_query.lower()
    explicit_paths = self._endpoint_extractor.extract(normalized_query)
    api_like = self._api_like_extractor.extract(normalized_query, explicit_paths)
    alias_terms, alias_docs, alias_hits = self._alias_matcher.match(lowered)
    return TargetTermsAnalysis(
        target_terms=self._term_collector.collect(normalized_query, alias_terms, api_like.endpoint_paths),
        endpoint_paths=api_like.endpoint_paths,
        api_like_terms=api_like.candidate_terms,
        matched_aliases=alias_hits,
        alias_docs=alias_docs,
    )
|
||||
|
||||
@@ -44,6 +44,7 @@ class V2IntentRouter:
|
||||
normalized_query=normalized_query,
|
||||
target_terms=list(target_terms_analysis.target_terms),
|
||||
endpoint_paths=list(target_terms_analysis.endpoint_paths),
|
||||
file_names=list(anchor_analysis.anchors.file_names),
|
||||
matched_aliases=list(target_terms_analysis.matched_aliases),
|
||||
target_doc_hints=list(anchor_analysis.anchors.target_doc_hints),
|
||||
file_markers=list(anchor_analysis.file_markers),
|
||||
@@ -58,6 +59,7 @@ class V2IntentRouter:
|
||||
anchors=anchor_analysis.anchors,
|
||||
)
|
||||
llm_result = self._validator.validate(llm_candidate)
|
||||
llm_result = self._apply_deterministic_corrections(llm_result, features)
|
||||
if llm_result is not None:
|
||||
confidence = self._confidence_adjuster.adjust(float(llm_result["confidence"]), features)
|
||||
return V2RouteResult(
|
||||
@@ -99,3 +101,18 @@ class V2IntentRouter:
|
||||
)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _apply_deterministic_corrections(self, candidate: dict | None, features: QueryFeatures) -> dict | None:
|
||||
if candidate is None:
|
||||
return None
|
||||
if candidate.get("routing_domain") == "DOCS" and self._should_force_find_files(features):
|
||||
corrected = dict(candidate)
|
||||
corrected["subintent"] = "FIND_FILES"
|
||||
return corrected
|
||||
return candidate
|
||||
|
||||
def _should_force_find_files(self, features: QueryFeatures) -> bool:
|
||||
if features.file_markers or features.file_names:
|
||||
return True
|
||||
query = features.normalized_query.lower()
|
||||
return "show doc" in query or "show file" in query or "doc for" in query
|
||||
|
||||
@@ -6,7 +6,7 @@ from app.core.agent.processes.v2.models import V2Subintent
|
||||
|
||||
class DocsSubintentResolver:
|
||||
def resolve(self, features: QueryFeatures) -> str | None:
|
||||
if features.file_markers:
|
||||
if features.file_markers or self._has_file_like_anchor(features):
|
||||
return V2Subintent.FIND_FILES
|
||||
if any(
|
||||
(
|
||||
@@ -20,3 +20,9 @@ class DocsSubintentResolver:
|
||||
):
|
||||
return V2Subintent.SUMMARY
|
||||
return None
|
||||
|
||||
def _has_file_like_anchor(self, features: QueryFeatures) -> bool:
|
||||
return any(
|
||||
hint.endswith((".md", ".yaml", ".yml", ".json"))
|
||||
for hint in features.target_doc_hints
|
||||
) or any(token.endswith((".md", ".yaml", ".yml", ".json")) for token in features.file_names)
|
||||
|
||||
@@ -14,7 +14,6 @@ from app.core.agent.processes.v2.retrieval.target_doc_seeding import (
|
||||
merge_row_lists,
|
||||
normalize_doc_path,
|
||||
normalized_path_set,
|
||||
path_variants_for_rag_query,
|
||||
row_path,
|
||||
seed_candidates_from_target_hints,
|
||||
)
|
||||
@@ -121,11 +120,9 @@ class V2Process(AgentProcess):
|
||||
"retrieval_profile_selected",
|
||||
{"profile": plan.profile, "layers": plan.layers, "filters": plan.filters},
|
||||
)
|
||||
seeded_rows = await self._seed_candidates_from_target_hints(rag_session_id, plan.layers, route)
|
||||
semantic_rows = await self._rag_adapter.fetch_rows(rag_session_id, route.normalized_query, plan)
|
||||
metadata_rows = self._metadata_lookup_candidates([*seeded_rows, *semantic_rows], route)
|
||||
rows = self._merge_candidate_rows(seeded_rows, metadata_rows, semantic_rows)
|
||||
rows = await self._ensure_target_hints_in_pool(rag_session_id, rows, route)
|
||||
retrieved_rows = await self._rag_adapter.fetch_rows(rag_session_id, route.normalized_query, plan)
|
||||
metadata_rows = self._metadata_lookup_candidates(retrieved_rows, route)
|
||||
rows = self._merge_candidate_rows(retrieved_rows, metadata_rows)
|
||||
rows = seed_candidates_from_target_hints(rows, route.anchors.target_doc_hints, RagRowIndex(rows))
|
||||
self._print_missing_target_hints(route, rows)
|
||||
context.trace.module("process.v2.rag_retrieval").log(
|
||||
@@ -150,9 +147,9 @@ class V2Process(AgentProcess):
|
||||
"target_doc_hints": route.anchors.target_doc_hints,
|
||||
"candidate_docs_before_ranking": [self._trace_row(row) for row in rows[:8]],
|
||||
"sources": {
|
||||
"seeded": [self._trace_row(row) for row in seeded_rows[:5]],
|
||||
"seeded": [self._trace_row(row) for row in retrieved_rows[:5] if row_path(row) in {normalize_doc_path(h) for h in route.anchors.target_doc_hints}],
|
||||
"metadata_lookup": [self._trace_row(row) for row in metadata_rows[:5]],
|
||||
"semantic": [self._trace_row(row) for row in semantic_rows[:5]],
|
||||
"semantic": [self._trace_row(row) for row in retrieved_rows[:5]],
|
||||
},
|
||||
},
|
||||
)
|
||||
@@ -262,61 +259,11 @@ class V2Process(AgentProcess):
|
||||
if not str(hint or "").strip():
|
||||
continue
|
||||
normalized = normalize_doc_path(hint)
|
||||
if not normalized.startswith("docs/") or "." not in normalized.rsplit("/", 1)[-1]:
|
||||
continue
|
||||
if normalized not in candidate_paths:
|
||||
print("ERROR: target doc missing from candidates:", normalized)
|
||||
|
||||
async def _ensure_target_hints_in_pool(self, rag_session_id: str, rows: list[dict], route) -> list[dict]:
    """Top up ``rows`` with documents named in the route's target doc hints.

    Hints whose normalized path is already present in ``rows`` are ignored.
    The remaining hints are first looked up by exact path variants; hints that
    are still unresolved are then looked up by their file name as a path
    substring. Returns ``rows`` unchanged when there is nothing to add,
    otherwise the merge of ``rows`` with both lookups.
    """
    wanted = [str(raw).strip() for raw in route.anchors.target_doc_hints if str(raw or "").strip()]
    if not wanted:
        return rows

    present = normalized_path_set(rows)
    absent = [hint for hint in wanted if normalize_doc_path(hint) not in present]
    if not absent:
        return rows

    # Exact lookup over every path spelling, order-preserving and de-duplicated.
    exact_paths = list(
        dict.fromkeys(variant for hint in absent for variant in path_variants_for_rag_query(hint))
    )
    exact_rows = await self._rag_adapter.fetch_exact_paths(rag_session_id, paths=exact_paths, layers=None)

    # Substring fallback keyed by the file name of whatever is still unresolved.
    recovered = normalized_path_set(exact_rows)
    unresolved = [hint for hint in absent if normalize_doc_path(hint) not in recovered]
    file_names = list(
        dict.fromkeys(
            name for name in (normalize_doc_path(hint).split("/")[-1] for hint in unresolved) if name
        )
    )
    substring_rows: list[dict] = []
    if file_names:
        substring_rows = await self._rag_adapter.fetch_chunks_by_path_substrings(
            rag_session_id,
            path_needles=file_names,
            layers=None,
        )
    return merge_row_lists(rows, exact_rows, substring_rows)
|
||||
|
||||
async def _seed_candidates_from_target_hints(self, rag_session_id: str, layers: list[str], route) -> list[dict]:
    """Fetch candidate rows for the route's target doc hints.

    Performs an exact-path lookup over every path variant of each hint and,
    for hints that the exact lookup did not return, a file-name substring
    lookup. Returns an empty list when the route carries no usable hints.
    """
    # Path-based seeding must see all layers (otherwise D0-only chunks are lost during file_lookup).
    del layers

    wanted = [str(raw).strip() for raw in route.anchors.target_doc_hints if str(raw or "").strip()]
    if not wanted:
        return []

    exact_paths = list(
        dict.fromkeys(variant for hint in wanted for variant in path_variants_for_rag_query(hint))
    )
    exact_rows = await self._rag_adapter.fetch_exact_paths(rag_session_id, paths=exact_paths, layers=None)

    resolved = normalized_path_set(exact_rows)
    unresolved = [hint for hint in wanted if normalize_doc_path(hint) not in resolved]
    file_names = list(
        dict.fromkeys(
            name for name in (normalize_doc_path(hint).split("/")[-1] for hint in unresolved) if name
        )
    )
    if not file_names:
        # Either everything was found exactly or no hint has a usable file name.
        return exact_rows

    substring_rows = await self._rag_adapter.fetch_chunks_by_path_substrings(
        rag_session_id,
        path_needles=file_names,
        layers=None,
    )
    return merge_row_lists(exact_rows, substring_rows)
|
||||
|
||||
def _metadata_lookup_candidates(self, rows: list[dict], route) -> list[dict]:
    """Build a ``DocsMetadataLookupIndex`` over ``rows`` and return its lookup result for ``route``."""
    index = DocsMetadataLookupIndex(rows)
    return index.lookup(route)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""Intent-aware retrieval policy resolver для процесса v2."""
|
||||
"""Intent-aware retrieval policy resolver for process v2."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -8,91 +8,113 @@ from app.core.rag.contracts.enums import RagLayer
|
||||
from app.core.rag.retrieval.session_retriever import RetrievalPlan
|
||||
|
||||
|
||||
class V2RetrievalPolicyResolver:
|
||||
_SUMMARY_LAYERS = [
|
||||
RagLayer.DOCS_DOCUMENT_CATALOG,
|
||||
RagLayer.DOCS_ENTITY_CATALOG,
|
||||
RagLayer.DOCS_DOC_CHUNKS,
|
||||
]
|
||||
_GENERAL_LAYERS = [
|
||||
RagLayer.DOCS_DOCUMENT_CATALOG,
|
||||
RagLayer.DOCS_DOC_CHUNKS,
|
||||
class _AnchorTermCollector:
|
||||
def prefer_like_patterns(self, route: V2RouteResult) -> list[str]:
    """Return lower-cased ``%term%`` patterns for every distinct anchor term of ``route``.

    Terms are gathered, in order, from hint basenames, endpoint paths, target
    terms, file names, entity names, matched aliases and process
    domain/subdomain, then de-duplicated with ``_unique_terms``.
    """
    anchors = route.anchors
    gathered = [
        *self._hint_basenames(route),
        *anchors.endpoint_paths,
        *route.target_terms,
        *anchors.file_names,
        *anchors.entity_names,
        *anchors.matched_aliases,
        *self._process_terms(route),
    ]
    return ["%" + term.lower() + "%" for term in _unique_terms(gathered)]
|
||||
|
||||
def find_files_patterns(self, route: V2RouteResult) -> list[str]:
    """Return the preferred patterns for a file-lookup route.

    When the route has target doc hints, only their basenames are used;
    otherwise this falls back to ``prefer_like_patterns``.
    """
    if not route.anchors.target_doc_hints:
        return self.prefer_like_patterns(route)
    return ["%" + basename.lower() + "%" for basename in self._hint_basenames(route)]
|
||||
|
||||
def api_method_patterns(self, route: V2RouteResult) -> list[str]:
    """Return ``%term%`` patterns for API-method lookups.

    Terms come from hint basenames, the full target doc hints, endpoint paths
    and target terms. Each distinct term contributes its lower-cased form
    with surrounding slashes stripped (when non-empty) followed by its plain
    lower-cased form; the result is de-duplicated preserving order.
    """
    anchors = route.anchors
    gathered = [
        *self._hint_basenames(route),
        *anchors.target_doc_hints,
        *anchors.endpoint_paths,
        *route.target_terms,
    ]
    collected: list[str] = []
    for term in _unique_terms(gathered):
        full = term.lower()
        # Slash-stripped form first, then the untouched form; empty forms are dropped.
        collected.extend(f"%{form}%" for form in (full.strip("/"), full) if form)
    return _unique_terms(collected)
|
||||
|
||||
def _hint_basenames(self, route: V2RouteResult) -> list[str]:
    """Return the last path segment of every usable target doc hint.

    Each hint is coerced to ``str`` and trimmed before splitting, so ``None``
    or non-string entries no longer raise and surrounding whitespace does not
    leak into the basename. Hints whose basename is empty (blank hints or
    directory-style hints ending in ``/``) are skipped, because callers wrap
    basenames as ``%name%`` and an empty name would produce the
    match-everything pattern ``%%``.
    """
    basenames: list[str] = []
    for hint in route.anchors.target_doc_hints:
        name = str(hint or "").strip().rsplit("/", 1)[-1]
        if name:
            basenames.append(name)
    return basenames
|
||||
|
||||
def _process_terms(self, route: V2RouteResult) -> list[str]:
    """Return the route's process domain and subdomain, in that order, skipping falsy values."""
    anchors = route.anchors
    return [value for value in (anchors.process_domain, anchors.process_subdomain) if value]
|
||||
|
||||
|
||||
class _RouteFilterBuilder:
|
||||
_API_DOC_PREFIXES = [
|
||||
"docs/api/",
|
||||
"docs/endpoints/",
|
||||
"docs/methods/",
|
||||
"api/",
|
||||
"endpoints/",
|
||||
"methods/",
|
||||
]
|
||||
|
||||
def resolve(self, route: V2RouteResult) -> RetrievalPlan:
|
||||
if route.intent == V2Intent.GENERAL_QA:
|
||||
return RetrievalPlan(
|
||||
profile="general_qa_grounded_summary",
|
||||
layers=list(self._GENERAL_LAYERS),
|
||||
limit=8,
|
||||
filters=self._general_filters(route),
|
||||
)
|
||||
if route.subintent == V2Subintent.FIND_FILES:
|
||||
return RetrievalPlan(
|
||||
profile="file_lookup",
|
||||
layers=[RagLayer.DOCS_DOCUMENT_CATALOG, RagLayer.DOCS_ENTITY_CATALOG],
|
||||
limit=12,
|
||||
filters=self._find_files_filters(route),
|
||||
)
|
||||
return RetrievalPlan(
|
||||
profile=self._summary_profile(route),
|
||||
layers=list(self._SUMMARY_LAYERS),
|
||||
limit=8,
|
||||
filters=self._summary_filters(route),
|
||||
)
|
||||
def __init__(self) -> None:
|
||||
self._terms = _AnchorTermCollector()
|
||||
|
||||
def _summary_profile(self, route: V2RouteResult) -> str:
|
||||
signals = anchor_signal_types(route)
|
||||
if len(signals - {V2AnchorType.FIND_FILES}) != 1:
|
||||
return "docs_summary_generic"
|
||||
mapping = {
|
||||
V2AnchorType.API_ENDPOINT: "docs_summary_api_endpoint",
|
||||
V2AnchorType.ARCHITECTURE: "docs_summary_architecture",
|
||||
V2AnchorType.LOGIC_FLOW: "docs_summary_logic_flow",
|
||||
V2AnchorType.DOMAIN_ENTITY: "docs_summary_domain_entity",
|
||||
}
|
||||
signal = next(iter(signals - {V2AnchorType.FIND_FILES}), None)
|
||||
return mapping.get(signal, "docs_summary_generic")
|
||||
|
||||
def _general_filters(self, route: V2RouteResult) -> dict[str, object]:
|
||||
def general_filters(self, route: V2RouteResult) -> dict[str, object]:
|
||||
return {
|
||||
"prefer_path_prefixes": ["docs/architecture/", "docs/"],
|
||||
"prefer_like_patterns": ["%README.md%", "%overview%"],
|
||||
"prefer_like_patterns": ["%readme.md%", "%overview%"],
|
||||
"target_doc_hints": list(route.anchors.target_doc_hints),
|
||||
}
|
||||
|
||||
def _summary_filters(self, route: V2RouteResult) -> dict[str, object]:
|
||||
filters: dict[str, object] = {
|
||||
"prefer_path_prefixes": self._summary_prefixes(route),
|
||||
"prefer_like_patterns": self._prefer_like_patterns(route),
|
||||
"target_doc_hints": list(route.anchors.target_doc_hints),
|
||||
}
|
||||
def summary_filters(self, route: V2RouteResult) -> dict[str, object]:
|
||||
if _is_api_method_explain(route):
|
||||
return self.api_method_filters(route)
|
||||
filters = self._base_filters(route)
|
||||
filters["prefer_path_prefixes"] = self._summary_prefixes(route)
|
||||
filters["prefer_like_patterns"] = self._terms.prefer_like_patterns(route)
|
||||
if V2AnchorType.API_ENDPOINT in anchor_signal_types(route):
|
||||
filters["path_prefixes"] = ["docs/api/", "docs/architecture/", "docs/"]
|
||||
filters["path_prefixes"] = ["docs/api/", "docs/"]
|
||||
return filters
|
||||
|
||||
def _find_files_filters(self, route: V2RouteResult) -> dict[str, object]:
|
||||
def api_method_filters(self, route: V2RouteResult) -> dict[str, object]:
|
||||
filters = self._base_filters(route)
|
||||
filters["path_prefixes"] = list(self._API_DOC_PREFIXES)
|
||||
filters["prefer_path_prefixes"] = list(self._API_DOC_PREFIXES)
|
||||
filters["prefer_like_patterns"] = self._terms.api_method_patterns(route)
|
||||
return filters
|
||||
|
||||
def find_files_filters(self, route: V2RouteResult) -> dict[str, object]:
|
||||
filters = self._base_filters(route)
|
||||
prefixes = self._find_files_prefixes(route)
|
||||
if prefixes:
|
||||
filters["path_prefixes"] = prefixes
|
||||
filters["prefer_path_prefixes"] = self._find_files_prefer_prefixes(route, prefixes)
|
||||
filters["prefer_like_patterns"] = self._terms.find_files_patterns(route)
|
||||
return filters
|
||||
|
||||
def _base_filters(self, route: V2RouteResult) -> dict[str, object]:
|
||||
filters: dict[str, object] = {
|
||||
"prefer_path_prefixes": self._find_files_prefixes(route),
|
||||
"prefer_like_patterns": self._prefer_like_patterns(route),
|
||||
"target_doc_hints": list(route.anchors.target_doc_hints),
|
||||
}
|
||||
if route.anchors.target_doc_hints:
|
||||
filters["prefer_like_patterns"] = [f"%{path.split('/')[-1]}%" for path in route.anchors.target_doc_hints]
|
||||
if route.anchors.process_domain:
|
||||
filters["metadata.domain"] = route.anchors.process_domain
|
||||
if route.anchors.process_subdomain:
|
||||
filters["metadata.subdomain"] = route.anchors.process_subdomain
|
||||
return filters
|
||||
|
||||
def _prefer_like_patterns(self, route: V2RouteResult) -> list[str]:
|
||||
patterns: list[str] = []
|
||||
for path in route.anchors.target_doc_hints:
|
||||
patterns.append(f"%{path.split('/')[-1]}%")
|
||||
for endpoint in route.anchors.endpoint_paths:
|
||||
patterns.append(f"%{endpoint}%")
|
||||
return patterns
|
||||
|
||||
def _find_files_prefixes(self, route: V2RouteResult) -> list[str]:
|
||||
if route.anchors.target_doc_hints:
|
||||
prefixes = ["/".join(path.split("/")[:-1]) + "/" for path in route.anchors.target_doc_hints]
|
||||
return [prefix for prefix in prefixes if prefix]
|
||||
hint_prefixes = _prefixes_from_paths(route.anchors.target_doc_hints)
|
||||
if hint_prefixes:
|
||||
return hint_prefixes
|
||||
file_prefixes = [name for name in route.anchors.file_names if str(name).strip().startswith("docs/")]
|
||||
derived = _prefixes_from_paths(file_prefixes)
|
||||
if derived:
|
||||
return derived
|
||||
signals = anchor_signal_types(route)
|
||||
if V2AnchorType.API_ENDPOINT in signals:
|
||||
return ["docs/api/", "docs/"]
|
||||
@@ -104,6 +126,12 @@ class V2RetrievalPolicyResolver:
|
||||
return ["docs/domains/", "docs/"]
|
||||
return ["docs/"]
|
||||
|
||||
def _find_files_prefer_prefixes(self, route: V2RouteResult, prefixes: list[str]) -> list[str]:
|
||||
preferred = list(prefixes)
|
||||
if route.anchors.process_domain or route.anchors.process_subdomain:
|
||||
preferred.extend(["docs/domains/", "docs/logic/"])
|
||||
return _unique_terms(preferred or ["docs/"])
|
||||
|
||||
def _summary_prefixes(self, route: V2RouteResult) -> list[str]:
|
||||
signals = anchor_signal_types(route)
|
||||
prefixes: list[str] = []
|
||||
@@ -114,5 +142,129 @@ class V2RetrievalPolicyResolver:
|
||||
if V2AnchorType.LOGIC_FLOW in signals:
|
||||
prefixes.extend(["docs/logic/", "docs/architecture/", "docs/"])
|
||||
if V2AnchorType.DOMAIN_ENTITY in signals:
|
||||
prefixes.extend(["docs/domains/", "docs/api/", "docs/architecture/"])
|
||||
return list(dict.fromkeys(prefixes or ["docs/"]))
|
||||
prefixes.extend(["docs/domains/", "docs/", "docs/api/"])
|
||||
return _unique_terms(prefixes or ["docs/"])
|
||||
|
||||
|
||||
class V2RetrievalPolicyResolver:
    """Translate a routed v2 request into a ``RetrievalPlan`` (profile, layers, limit, filters)."""

    # Layers queried for general Q&A.
    _GENERAL_LAYERS = [RagLayer.DOCS_DOCUMENT_CATALOG, RagLayer.DOCS_DOC_CHUNKS]
    # Layers queried for file lookup.
    _FIND_FILES_LAYERS = [RagLayer.DOCS_DOCUMENT_CATALOG, RagLayer.DOCS_ENTITY_CATALOG]
    # Layers queried per summary profile name.
    _SUMMARY_LAYERS = {
        "docs_api_method_explain": [
            RagLayer.DOCS_DOCUMENT_CATALOG,
            RagLayer.DOCS_FACT_INDEX,
            RagLayer.DOCS_DOC_CHUNKS,
        ],
        "docs_summary_api_endpoint": [
            RagLayer.DOCS_DOCUMENT_CATALOG,
            RagLayer.DOCS_FACT_INDEX,
            RagLayer.DOCS_DOC_CHUNKS,
        ],
        "docs_summary_logic_flow": [
            RagLayer.DOCS_WORKFLOW_INDEX,
            RagLayer.DOCS_DOCUMENT_CATALOG,
            RagLayer.DOCS_DOC_CHUNKS,
        ],
        "docs_summary_domain_entity": [
            RagLayer.DOCS_ENTITY_CATALOG,
            RagLayer.DOCS_DOCUMENT_CATALOG,
            RagLayer.DOCS_DOC_CHUNKS,
        ],
        "docs_summary_architecture": [
            RagLayer.DOCS_DOCUMENT_CATALOG,
            RagLayer.DOCS_RELATION_GRAPH,
            RagLayer.DOCS_DOC_CHUNKS,
        ],
        "docs_summary_generic": [
            RagLayer.DOCS_DOCUMENT_CATALOG,
            RagLayer.DOCS_DOC_CHUNKS,
        ],
    }

    def __init__(self) -> None:
        self._filters = _RouteFilterBuilder()

    def resolve(self, route: V2RouteResult) -> RetrievalPlan:
        """Pick profile, layers, limit and filters for ``route``.

        General Q&A is checked first, then the file-lookup subintent; every
        other route gets a summary profile chosen by ``_summary_profile``.
        """
        if route.intent == V2Intent.GENERAL_QA:
            profile = "general_qa_grounded_summary"
            layers = self._GENERAL_LAYERS
            limit = 8
            filters = self._filters.general_filters(route)
        elif route.subintent == V2Subintent.FIND_FILES:
            profile = "file_lookup"
            layers = self._FIND_FILES_LAYERS
            limit = 12
            filters = self._filters.find_files_filters(route)
        else:
            profile = self._summary_profile(route)
            layers = self._SUMMARY_LAYERS[profile]
            limit = 10 if profile == "docs_api_method_explain" else 8
            filters = self._filters.summary_filters(route)
        # Copy the layer list so the plan never aliases the class-level constants.
        return RetrievalPlan(profile=profile, layers=list(layers), limit=limit, filters=filters)

    def _summary_profile(self, route: V2RouteResult) -> str:
        """Return the summary profile name for ``route``.

        API-method explanations win outright. Otherwise a specific profile is
        used only when exactly one anchor signal besides ``FIND_FILES`` is
        present; any other count falls back to the generic profile.
        """
        if _is_api_method_explain(route):
            return "docs_api_method_explain"
        signals = anchor_signal_types(route) - {V2AnchorType.FIND_FILES}
        if len(signals) != 1:
            return "docs_summary_generic"
        (only_signal,) = signals
        profile_by_signal = {
            V2AnchorType.API_ENDPOINT: "docs_summary_api_endpoint",
            V2AnchorType.ARCHITECTURE: "docs_summary_architecture",
            V2AnchorType.LOGIC_FLOW: "docs_summary_logic_flow",
            V2AnchorType.DOMAIN_ENTITY: "docs_summary_domain_entity",
        }
        return profile_by_signal.get(only_signal, "docs_summary_generic")
|
||||
|
||||
|
||||
def _prefixes_from_paths(paths: list[str]) -> list[str]:
    """Return the distinct directory prefixes (with trailing ``/``) of ``paths``.

    Each path is stringified and stripped of whitespace and surrounding
    slashes; entries without a remaining ``/`` have no directory part and are
    skipped.
    """
    trimmed = (str(path).strip().strip("/") for path in paths)
    return _unique_terms([item.rsplit("/", 1)[0] + "/" for item in trimmed if "/" in item])
|
||||
|
||||
|
||||
def _unique_terms(items: list[str]) -> list[str]:
    """Return the stripped, non-empty string forms of ``items`` without duplicates.

    Falsy entries are treated as empty and dropped; first-occurrence order is
    kept and comparison is case-sensitive.
    """
    cleaned = (str(raw or "").strip() for raw in items)
    # dict keys keep insertion order, which gives an order-preserving de-duplication.
    return list(dict.fromkeys(value for value in cleaned if value))
|
||||
|
||||
|
||||
def _is_api_method_explain(route: V2RouteResult) -> bool:
    """Tell whether ``route`` is a summary request about an API method.

    Only ``SUMMARY`` subintents qualify. Within those, the route matches when
    it has endpoint paths, API-like target doc hints, or an ``API_ENDPOINT``
    anchor signal (checked in that order).
    """
    if route.subintent != V2Subintent.SUMMARY:
        return False
    anchors = route.anchors
    if anchors.endpoint_paths or _has_api_like_hints(anchors.target_doc_hints):
        return True
    return V2AnchorType.API_ENDPOINT in anchor_signal_types(route)
|
||||
|
||||
|
||||
def _has_api_like_hints(hints: list[str]) -> bool:
    """Return ``True`` when at least one hint looks API-related.

    A hint (trimmed and lower-cased) qualifies when it starts with ``/``,
    starts with one of the API documentation folders, or contains the word
    ``endpoint`` or ``method``. Blank hints are ignored.
    """
    api_prefixes = ("/", "docs/api/", "docs/endpoints/", "docs/methods/")
    api_keywords = ("endpoint", "method")
    normalized = (str(hint or "").strip().lower() for hint in hints)
    return any(
        text.startswith(api_prefixes) or any(word in text for word in api_keywords)
        for text in normalized
        if text
    )
|
||||
|
||||
@@ -1,18 +1,23 @@
|
||||
"""Адаптер v2 к :class:`RagSessionRetriever` для подстановки в тестах."""
|
||||
"""Адаптер v2 к :class:`RagSessionRetriever` с plan-driven execution strategy."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.core.agent.processes.v2.retrieval.target_doc_seeding import (
|
||||
merge_row_lists,
|
||||
normalize_doc_path,
|
||||
path_variants_for_rag_query,
|
||||
)
|
||||
from app.core.rag.retrieval.session_retriever import RagSessionRetriever, RetrievalPlan
|
||||
|
||||
|
||||
class V2RagRetrievalAdapter:
|
||||
"""Обёртка над :class:`RagSessionRetriever` для подмены в тестах."""
|
||||
|
||||
class _PlanDrivenRetrieval:
|
||||
def __init__(self, retriever: RagSessionRetriever) -> None:
|
||||
self._retriever = retriever
|
||||
|
||||
async def fetch_rows(self, rag_session_id: str, query_text: str, plan: RetrievalPlan) -> list[dict]:
|
||||
return await self._retriever.retrieve(rag_session_id, query_text, plan)
|
||||
seeded_rows = await self._seed_from_target_hints(rag_session_id, plan)
|
||||
semantic_rows = await self._retriever.retrieve(rag_session_id, query_text, plan)
|
||||
return merge_row_lists(seeded_rows, semantic_rows)
|
||||
|
||||
async def fetch_exact_paths(self, rag_session_id: str, *, paths: list[str], layers: list[str] | None = None) -> list[dict]:
|
||||
return await self._retriever.retrieve_exact_files(rag_session_id, paths=paths, layers=layers)
|
||||
@@ -31,3 +36,73 @@ class V2RagRetrievalAdapter:
|
||||
layers=layers,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
async def _seed_from_target_hints(self, rag_session_id: str, plan: RetrievalPlan) -> list[dict]:
    """Fetch rows for the plan's target doc hints.

    Tries an exact-path lookup first and adds a substring lookup for any
    hints the exact lookup did not cover. Returns an empty list when the plan
    carries no hints.
    """
    wanted = self._target_doc_hints(plan)
    if not wanted:
        return []
    found = await self._fetch_exact_rows(rag_session_id, wanted)
    unresolved = self._missing_hints(wanted, found)
    if unresolved:
        extra = await self._fetch_substring_rows(rag_session_id, unresolved)
        return merge_row_lists(found, extra)
    return found
|
||||
|
||||
async def _fetch_exact_rows(self, rag_session_id: str, hints: list[str]) -> list[dict]:
    """Retrieve rows whose path equals any path variant of ``hints``.

    Variants are de-duplicated in first-seen order and empty variants are
    dropped; no retriever call is made when nothing remains.
    """
    candidates = [variant for hint in hints for variant in path_variants_for_rag_query(hint)]
    exact_paths = list(dict.fromkeys(filter(None, candidates)))
    if exact_paths:
        return await self._retriever.retrieve_exact_files(rag_session_id, paths=exact_paths, layers=None)
    return []
|
||||
|
||||
async def _fetch_substring_rows(self, rag_session_id: str, hints: list[str]) -> list[dict]:
    """Retrieve chunks whose path contains the file name of any hint.

    The file name is the last segment of the normalized hint path; names are
    de-duplicated in first-seen order and empty names are dropped. No
    retriever call is made when nothing remains.
    """
    file_names = (normalize_doc_path(hint).split("/")[-1] for hint in hints)
    needles = list(dict.fromkeys(filter(None, file_names)))
    if needles:
        return await self._retriever.retrieve_chunks_by_path_substrings(
            rag_session_id,
            path_needles=needles,
            layers=None,
            limit=200,
        )
    return []
|
||||
|
||||
def _target_doc_hints(self, plan: RetrievalPlan) -> list[str]:
    """Return the trimmed, non-blank ``target_doc_hints`` stored in ``plan.filters``.

    Anything other than a ``list`` under that key (including a missing key)
    yields an empty list.
    """
    hints = plan.filters.get("target_doc_hints")
    if isinstance(hints, list):
        return [str(entry).strip() for entry in hints if str(entry or "").strip()]
    return []
|
||||
|
||||
def _missing_hints(self, hints: list[str], rows: list[dict]) -> list[str]:
    """Return the hints whose normalized path is not the ``path`` of any row."""
    present: set = set()
    for row in rows:
        # A missing or falsy path is normalized from the empty string.
        present.add(normalize_doc_path(str(row.get("path") or "")))
    return [hint for hint in hints if normalize_doc_path(hint) not in present]
|
||||
|
||||
|
||||
class V2RagRetrievalAdapter:
    """Wrapper around :class:`RagSessionRetriever` for plan-driven retrieval and substitution in tests."""

    def __init__(self, retriever: RagSessionRetriever) -> None:
        # Every call is delegated to the plan-driven strategy wrapping the real retriever.
        self._retriever = _PlanDrivenRetrieval(retriever)

    async def fetch_rows(self, rag_session_id: str, query_text: str, plan: RetrievalPlan) -> list[dict]:
        """Delegate to the plan-driven strategy's ``fetch_rows`` and return its rows."""
        rows = await self._retriever.fetch_rows(rag_session_id, query_text, plan)
        return rows

    async def fetch_exact_paths(self, rag_session_id: str, *, paths: list[str], layers: list[str] | None = None) -> list[dict]:
        """Delegate an exact-path lookup, passing ``paths`` and ``layers`` through unchanged."""
        rows = await self._retriever.fetch_exact_paths(rag_session_id, paths=paths, layers=layers)
        return rows

    async def fetch_chunks_by_path_substrings(
        self,
        rag_session_id: str,
        *,
        path_needles: list[str],
        layers: list[str] | None = None,
        limit: int = 200,
    ) -> list[dict]:
        """Delegate a path-substring lookup, passing all arguments through unchanged."""
        rows = await self._retriever.fetch_chunks_by_path_substrings(
            rag_session_id,
            path_needles=path_needles,
            layers=layers,
            limit=limit,
        )
        return rows
|
||||
|
||||
@@ -1,20 +1,24 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
import yaml
|
||||
|
||||
from app.core.rag.indexing.docs.chunkers.markdown_chunker import SectionChunk
|
||||
from app.core.rag.indexing.docs.models import IntegrationRecord
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocsIntegrationExtractor:
|
||||
_SECTION_TITLES = {"integrations", "интеграции"}
|
||||
|
||||
def extract(self, sections: list[SectionChunk]) -> list[IntegrationRecord]:
|
||||
def extract(self, sections: list[SectionChunk], *, path: str = "") -> list[IntegrationRecord]:
|
||||
records: list[IntegrationRecord] = []
|
||||
for section in sections:
|
||||
if not self._is_integration_section(section.section_path):
|
||||
continue
|
||||
payload = self._payload(section.content)
|
||||
payload = self._payload(section.content, path=path, section_path=section.section_path)
|
||||
target = str(payload.get("target") or "").strip()
|
||||
if not target:
|
||||
continue
|
||||
@@ -40,7 +44,7 @@ class DocsIntegrationExtractor:
|
||||
parts = [item.strip().lower() for item in section_path.split(" > ") if item.strip()]
|
||||
return any(part in self._SECTION_TITLES for part in parts[:-1]) or (parts and parts[-1] in self._SECTION_TITLES)
|
||||
|
||||
def _payload(self, text: str) -> dict:
|
||||
def _payload(self, text: str, *, path: str, section_path: str) -> dict:
|
||||
payload: dict = {}
|
||||
details_lines: list[str] = []
|
||||
collecting_details = False
|
||||
@@ -61,15 +65,27 @@ class DocsIntegrationExtractor:
|
||||
collecting_details = True
|
||||
details_lines = []
|
||||
if value:
|
||||
payload[key] = self._yaml_value(value)
|
||||
payload[key] = self._yaml_value(
|
||||
value,
|
||||
path=path,
|
||||
section_path=section_path,
|
||||
field_name=key,
|
||||
fallback="",
|
||||
)
|
||||
continue
|
||||
collecting_details = False
|
||||
payload[key] = self._yaml_value(value)
|
||||
payload[key] = self._yaml_value(
|
||||
value,
|
||||
path=path,
|
||||
section_path=section_path,
|
||||
field_name=key,
|
||||
fallback=value,
|
||||
)
|
||||
if details_lines:
|
||||
payload["details"] = self._details_payload(details_lines)
|
||||
payload["details"] = self._details_payload(details_lines, path=path, section_path=section_path)
|
||||
return payload
|
||||
|
||||
def _details_payload(self, lines: list[str]) -> dict:
|
||||
def _details_payload(self, lines: list[str], *, path: str, section_path: str) -> dict:
|
||||
normalized: list[str] = []
|
||||
for raw_line in lines:
|
||||
line = raw_line[2:] if raw_line.startswith(" ") else raw_line
|
||||
@@ -78,7 +94,13 @@ class DocsIntegrationExtractor:
|
||||
if indent == 0 and stripped.startswith("- "):
|
||||
stripped = stripped[2:]
|
||||
normalized.append((" " * indent) + stripped)
|
||||
payload = yaml.safe_load("\n".join(normalized)) or {}
|
||||
payload = self._yaml_value(
|
||||
"\n".join(normalized),
|
||||
path=path,
|
||||
section_path=section_path,
|
||||
field_name="details",
|
||||
fallback={},
|
||||
) or {}
|
||||
return payload if isinstance(payload, dict) else {}
|
||||
|
||||
def _split_key_value(self, text: str) -> tuple[str, str]:
|
||||
@@ -87,7 +109,17 @@ class DocsIntegrationExtractor:
|
||||
key, value = text.split(":", 1)
|
||||
return key.strip(), value.strip()
|
||||
|
||||
def _yaml_value(self, value: str):
|
||||
def _yaml_value(self, value: str, *, path: str, section_path: str, field_name: str, fallback):
|
||||
if not value:
|
||||
return ""
|
||||
return yaml.safe_load(value)
|
||||
try:
|
||||
return yaml.safe_load(value)
|
||||
except yaml.YAMLError as exc:
|
||||
LOGGER.warning(
|
||||
"docs integration parse warning: path=%s section=%s field=%s reason=%s",
|
||||
path or "<unknown>",
|
||||
section_path,
|
||||
field_name,
|
||||
exc.__class__.__name__,
|
||||
)
|
||||
return fallback
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections.abc import Callable
|
||||
|
||||
from app.core.rag.contracts import RagDocument, RagSource
|
||||
from app.core.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker
|
||||
from app.core.rag.indexing.docs.classifier import DocsClassifier
|
||||
@@ -15,6 +18,8 @@ from app.core.rag.indexing.docs.relation_extractor import DocsRelationExtractor
|
||||
from app.core.rag.indexing.docs.support_layer_builder import DocsSupportLayerBuilder
|
||||
from app.core.rag.indexing.docs.workflow_extractor import DocsWorkflowExtractor
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocsIndexingPipeline:
|
||||
def __init__(self) -> None:
|
||||
@@ -59,7 +64,11 @@ class DocsIndexingPipeline:
|
||||
for section in sections:
|
||||
docs.append(self._builder.build_doc_chunk(source, section, parsed.frontmatter, doc_kind))
|
||||
document_id = frontmatter_view.document_id or source.path
|
||||
for fact in self._facts.extract(parsed.frontmatter, sections):
|
||||
for fact in self._safe_extract(
|
||||
extractor_name="fact_extractor",
|
||||
path=path,
|
||||
run=lambda: self._facts.extract(parsed.frontmatter, sections),
|
||||
):
|
||||
docs.append(
|
||||
self._support_builder.build_fact(
|
||||
source,
|
||||
@@ -72,13 +81,29 @@ class DocsIndexingPipeline:
|
||||
subdomain=frontmatter_view.subdomain,
|
||||
)
|
||||
)
|
||||
for entity in self._entities.extract(parsed.frontmatter):
|
||||
for entity in self._safe_extract(
|
||||
extractor_name="entity_extractor",
|
||||
path=path,
|
||||
run=lambda: self._entities.extract(parsed.frontmatter),
|
||||
):
|
||||
docs.append(self._builder.build_entity_record(source, parsed.frontmatter, entity))
|
||||
for workflow in self._workflows.extract(parsed.detail_sections):
|
||||
for workflow in self._safe_extract(
|
||||
extractor_name="workflow_extractor",
|
||||
path=path,
|
||||
run=lambda: self._workflows.extract(parsed.detail_sections),
|
||||
):
|
||||
docs.append(self._support_builder.build_workflow_record(source, parsed.frontmatter, workflow))
|
||||
for edge in self._relations.extract(parsed.frontmatter, source_id=document_id):
|
||||
for edge in self._safe_extract(
|
||||
extractor_name="relation_extractor",
|
||||
path=path,
|
||||
run=lambda: self._relations.extract(parsed.frontmatter, source_id=document_id),
|
||||
):
|
||||
docs.append(self._support_builder.build_relation_record(source, parsed.frontmatter, edge))
|
||||
for integration in self._integrations.extract(sections):
|
||||
for integration in self._safe_extract(
|
||||
extractor_name="integration_extractor",
|
||||
path=path,
|
||||
run=lambda: self._integrations.extract(sections, path=path),
|
||||
):
|
||||
docs.append(self._support_builder.build_integration_record(source, parsed.frontmatter, integration))
|
||||
return docs
|
||||
|
||||
@@ -86,3 +111,15 @@ class DocsIndexingPipeline:
|
||||
tail = path.rsplit("/", 1)[-1]
|
||||
stem = tail.rsplit(".", 1)[0]
|
||||
return stem.replace("-", " ").replace("_", " ").strip().title()
|
||||
|
||||
def _safe_extract(self, *, extractor_name: str, path: str, run: Callable[[], list]) -> list:
    """Run one extractor callable and return its result, or ``[]`` if it raises.

    Any ``Exception`` raised by ``run`` is logged as a warning carrying the
    document path, the extractor name and the exception class name, and is
    then swallowed.
    """
    try:
        extracted = run()
    except Exception as error:
        LOGGER.warning(
            "docs pipeline extractor warning: path=%s extractor=%s reason=%s",
            path,
            extractor_name,
            error.__class__.__name__,
        )
        extracted = []
    return extracted
|
||||
|
||||
@@ -25,6 +25,8 @@ class RagQueryRepository:
|
||||
exclude_like_patterns: list[str] | None = None,
|
||||
prefer_path_prefixes: list[str] | None = None,
|
||||
prefer_like_patterns: list[str] | None = None,
|
||||
metadata_domain: str | None = None,
|
||||
metadata_subdomain: str | None = None,
|
||||
prefer_non_tests: bool = False,
|
||||
) -> list[dict]:
|
||||
sql, params = self._builder.build_retrieve(
|
||||
@@ -38,6 +40,8 @@ class RagQueryRepository:
|
||||
exclude_like_patterns=exclude_like_patterns,
|
||||
prefer_path_prefixes=prefer_path_prefixes,
|
||||
prefer_like_patterns=prefer_like_patterns,
|
||||
metadata_domain=metadata_domain,
|
||||
metadata_subdomain=metadata_subdomain,
|
||||
prefer_non_tests=prefer_non_tests,
|
||||
)
|
||||
with get_engine().connect() as conn:
|
||||
@@ -234,6 +238,54 @@ class RagQueryRepository:
|
||||
rows = conn.execute(stmt, params).mappings().fetchall()
|
||||
return [self._row_to_dict(row) for row in rows]
|
||||
|
||||
def retrieve_chunks_by_path_substrings(
    self,
    rag_session_id: str,
    *,
    path_needles: list[str],
    layers: list[str] | None = None,
    limit: int = 200,
) -> list[dict]:
    """Return chunks of a session whose path contains any of ``path_needles``.

    Matching is case-insensitive and literal: LIKE wildcards inside a needle
    (``%`` and ``_``, the latter being common in file names) are escaped so
    they only match themselves. Blank and ``None`` needles are ignored and
    duplicates are collapsed.

    Args:
        rag_session_id: Session whose chunks are searched.
        path_needles: Substrings to look for in the chunk path.
        layers: Optional layer names to restrict the search to.
        limit: Maximum number of rows; values below 1 are raised to 1.

    Returns:
        Rows ordered by path, span start and chunk index, with every ranking
        column set to the constant 0. An empty list when no usable needle is
        given (no query is issued in that case).
    """
    normalized_needles = list(
        dict.fromkeys(
            needle for needle in (str(item or "").strip().lower() for item in path_needles) if needle
        )
    )
    if not normalized_needles:
        return []

    params: dict = {
        "sid": rag_session_id,
        "lim": max(1, int(limit)),
    }
    filters = ["rag_session_id = :sid"]
    like_parts: list[str] = []
    for idx, needle in enumerate(normalized_needles):
        key = f"needle_{idx}"
        # Escape the escape character first, then the LIKE wildcards.
        escaped = needle.replace("!", "!!").replace("%", "!%").replace("_", "!_")
        params[key] = f"%{escaped}%"
        like_parts.append(f"lower(path) LIKE :{key} ESCAPE '!'")
    filters.append("(" + " OR ".join(like_parts) + ")")

    if layers:
        normalized_layers = [str(item).strip() for item in layers if str(item).strip()]
        if normalized_layers:
            params["layers"] = normalized_layers
            filters.append("layer IN :layers")

    stmt = text(
        f"""
        SELECT path, content, layer, title, metadata_json, span_start, span_end,
        0 AS lexical_rank,
        0 AS prefer_bonus,
        0 AS test_penalty,
        0 AS structural_rank,
        0 AS layer_rank,
        0 AS distance
        FROM rag_chunks
        WHERE {' AND '.join(filters)}
        ORDER BY path ASC, COALESCE(span_start, 0) ASC, COALESCE(chunk_index, 0) ASC
        LIMIT :lim
        """
    )
    if "layers" in params:
        # An expanding bind parameter turns the list into the IN (...) value list.
        stmt = stmt.bindparams(bindparam("layers", expanding=True))
    with get_engine().connect() as conn:
        rows = conn.execute(stmt, params).mappings().fetchall()
    return [self._row_to_dict(row) for row in rows]
|
||||
|
||||
def _row_to_dict(self, row) -> dict:
|
||||
data = dict(row)
|
||||
raw_metadata = data.pop("metadata_json")
|
||||
|
||||
@@ -69,6 +69,8 @@ class RagRepository:
|
||||
exclude_like_patterns: list[str] | None = None,
|
||||
prefer_path_prefixes: list[str] | None = None,
|
||||
prefer_like_patterns: list[str] | None = None,
|
||||
metadata_domain: str | None = None,
|
||||
metadata_subdomain: str | None = None,
|
||||
prefer_non_tests: bool = False,
|
||||
) -> list[dict]:
|
||||
return self._query.retrieve(
|
||||
@@ -82,6 +84,8 @@ class RagRepository:
|
||||
exclude_like_patterns=exclude_like_patterns,
|
||||
prefer_path_prefixes=prefer_path_prefixes,
|
||||
prefer_like_patterns=prefer_like_patterns,
|
||||
metadata_domain=metadata_domain,
|
||||
metadata_subdomain=metadata_subdomain,
|
||||
prefer_non_tests=prefer_non_tests,
|
||||
)
|
||||
|
||||
@@ -141,3 +145,18 @@ class RagRepository:
|
||||
layers=layers,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
def retrieve_chunks_by_path_substrings(
|
||||
self,
|
||||
rag_session_id: str,
|
||||
*,
|
||||
path_needles: list[str],
|
||||
layers: list[str] | None = None,
|
||||
limit: int = 200,
|
||||
) -> list[dict]:
|
||||
return self._query.retrieve_chunks_by_path_substrings(
|
||||
rag_session_id,
|
||||
path_needles=path_needles,
|
||||
layers=layers,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
@@ -19,6 +19,8 @@ class RetrievalStatementBuilder:
|
||||
exclude_like_patterns: list[str] | None = None,
|
||||
prefer_path_prefixes: list[str] | None = None,
|
||||
prefer_like_patterns: list[str] | None = None,
|
||||
metadata_domain: str | None = None,
|
||||
metadata_subdomain: str | None = None,
|
||||
prefer_non_tests: bool = False,
|
||||
) -> tuple[str, dict]:
|
||||
emb = "[" + ",".join(str(x) for x in query_embedding) + "]"
|
||||
@@ -29,6 +31,8 @@ class RetrievalStatementBuilder:
|
||||
self._append_prefix_group(filters, params, "path", path_prefixes)
|
||||
self._append_prefix_group(filters, params, "exclude_prefix", exclude_path_prefixes, negate=True)
|
||||
self._append_like_group(filters, params, "exclude_like", exclude_like_patterns, negate=True)
|
||||
self._append_metadata_equals(filters, params, "metadata_domain", "domain", metadata_domain)
|
||||
self._append_metadata_equals(filters, params, "metadata_subdomain", "subdomain", metadata_subdomain)
|
||||
if layers:
|
||||
filters.append("layer = ANY(:layers)")
|
||||
params["layers"] = layers
|
||||
@@ -202,6 +206,20 @@ class RetrievalStatementBuilder:
|
||||
joined = " OR ".join(parts)
|
||||
filters.append(f"NOT ({joined})" if negate else f"({joined})")
|
||||
|
||||
def _append_metadata_equals(
    self,
    filters: list[str],
    params: dict,
    param_key: str,
    metadata_key: str,
    value: str | None,
) -> None:
    """Add a case-insensitive equality filter on a metadata field.

    The value is stringified, stripped and lower-cased. When nothing is
    left (``None``, empty or whitespace-only input) neither ``filters``
    nor ``params`` is touched. Otherwise the normalized value is bound
    under ``param_key`` and a SQL condition comparing the lower-cased
    metadata expression with that bind parameter is appended.

    Args:
        filters: List of SQL condition strings, extended in place.
        params: Bind-parameter mapping, updated in place.
        param_key: Name of the bind parameter to store the value under.
        metadata_key: Metadata field name passed to ``self._metadata_text``
            to obtain the SQL expression for that field.
        value: Raw value to match; falsy values disable the filter.
    """
    wanted = str(value or "").strip().lower()
    if wanted:
        # Bind the value first, then build the condition that references it.
        params[param_key] = wanted
        field_sql = self._metadata_text(metadata_key)
        condition = f"lower(COALESCE({field_sql}, '')) = :{param_key}"
        filters.append(condition)
|
||||
|
||||
def _test_penalty_sql(
|
||||
self,
|
||||
enabled: bool,
|
||||
|
||||
@@ -94,4 +94,8 @@ class RagSessionRetriever:
|
||||
for key in keys:
|
||||
if key in filters:
|
||||
out[key] = filters[key]
|
||||
if "metadata.domain" in filters:
|
||||
out["metadata_domain"] = filters["metadata.domain"]
|
||||
if "metadata.subdomain" in filters:
|
||||
out["metadata_subdomain"] = filters["metadata.subdomain"]
|
||||
return out
|
||||
|
||||
Reference in New Issue
Block a user