from __future__ import annotations

import re

from app.modules.rag.intent_router_v2.normalization_terms import KeyTermCanonicalizer

# A token is a run of Latin/Cyrillic letters and hyphens. Whitespace always
# splits tokens, so multi-word phrases never appear as a single token.
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё-]+")


class RuEnTermMapper:
    """Expand Russian and English code-related words into English search terms.

    Two lookup tables drive the expansion:

    * ``_CANONICAL_MAP`` is keyed by the canonical form that the injected
      canonicalizer yields for a token (or by the token itself when the
      canonicalizer returns a falsy value).
    * ``_ENGLISH_SOURCES`` is keyed by the raw lowercased token.
    """

    # Canonical term -> English expansions.
    _CANONICAL_MAP = {
        "класс": ["class"],
        "метод": ["method"],
        "функция": ["function", "def"],
        "модуль": ["module"],
        "пакет": ["package"],
        "файл": ["file"],
        "тест": ["test", "unit test"],
        "документация": ["documentation", "docs"],
        "readme": ["readme"],
    }

    # Raw English token -> English expansions.
    _ENGLISH_SOURCES = {
        "class": ["class"],
        "method": ["method"],
        "function": ["function", "def"],
        "module": ["module"],
        "package": ["package"],
        "file": ["file"],
        "test": ["test", "unit test"],
        "tests": ["test", "unit test"],
        "documentation": ["documentation", "docs"],
        "docs": ["documentation", "docs"],
        "readme": ["readme"],
        "def": ["def"],
    }

    def __init__(self, canonicalizer: KeyTermCanonicalizer | None = None) -> None:
        """Store the canonicalizer, building a default one when none is usable.

        Args:
            canonicalizer: Object used to canonicalize tokens and to list
                aliases. Any falsy value (including ``None``) is replaced by
                a freshly constructed ``KeyTermCanonicalizer``.
        """
        self._canonicalizer = canonicalizer if canonicalizer else KeyTermCanonicalizer()

    def expand(self, text: str) -> list[str]:
        """Return the English expansions triggered by the words in *text*.

        Args:
            text: Free-form input; a falsy value is treated as an empty string.

        Returns:
            Expansion terms without duplicates, in order of first appearance.
        """
        lowered = (text or "").lower()
        collected: list[str] = []
        for word in _WORD_RE.findall(lowered):
            # Fall back to the raw token when canonicalization yields nothing.
            canonical = self._canonicalizer.canonicalize(word) or word
            matches = (
                self._CANONICAL_MAP.get(canonical, []),
                self._ENGLISH_SOURCES.get(word, []),
            )
            for terms in matches:
                self._extend(collected, terms)
        # The tokenizer cannot produce a token containing a space, so the
        # two-word phrase is looked for in the whole lowercased text instead.
        if any(phrase in lowered for phrase in ("unit test", "unit tests")):
            self._extend(collected, self._ENGLISH_SOURCES["test"])
        return collected

    def key_terms(self) -> tuple[str, ...]:
        """Return the keys of the canonical map, in definition order."""
        return tuple(self._CANONICAL_MAP)

    def all_literal_terms(self) -> tuple[str, ...]:
        """Return every literal term known to the mapper, sorted.

        The result combines the canonicalizer's aliases, the keys of both
        lookup tables, and the lowercased expansion targets of both tables.
        """
        literals = set(self._canonicalizer.aliases())
        for table in (self._CANONICAL_MAP, self._ENGLISH_SOURCES):
            literals.update(table)
            for targets in table.values():
                literals.update(target.lower() for target in targets)
        return tuple(sorted(literals))

    def _extend(self, result: list[str], values: list[str]) -> None:
        """Append to *result*, in place, each of *values* it does not hold yet."""
        for candidate in values:
            if candidate in result:
                continue
            result.append(candidate)