68 lines
2.4 KiB
Python
68 lines
2.4 KiB
Python
from __future__ import annotations
|
||
|
||
import re
|
||
|
||
from app.modules.rag.intent_router_v2.normalization_terms import KeyTermCanonicalizer
|
||
|
||
# Token pattern: one or more consecutive Latin letters, Cyrillic letters in the
# ranges А-Я / а-я plus Ё/ё, or hyphens. The "-" placed last in the character
# class is a literal hyphen, so a hyphenated word is matched as a single token.
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё-]+")
|
||
|
||
|
||
class RuEnTermMapper:
    """Expand code-related words found in free text into English terms.

    Each word of the input is looked up twice: its canonical form (as
    produced by the injected canonicalizer) in ``_CANONICAL_MAP``, and the
    raw lowercased word in ``_ENGLISH_SOURCES``. Matching expansions are
    collected in first-seen order without duplicates.
    """

    # Canonical key term -> English expansions.
    _CANONICAL_MAP = {
        "класс": ["class"],
        "метод": ["method"],
        "функция": ["function", "def"],
        "модуль": ["module"],
        "пакет": ["package"],
        "файл": ["file"],
        "тест": ["test", "unit test"],
        "документация": ["documentation", "docs"],
        "readme": ["readme"],
    }
    # English word as it literally appears in the text -> English expansions.
    # Looked up by the raw token, independently of the canonicalizer.
    _ENGLISH_SOURCES = {
        "class": ["class"],
        "method": ["method"],
        "function": ["function", "def"],
        "module": ["module"],
        "package": ["package"],
        "file": ["file"],
        "test": ["test", "unit test"],
        "tests": ["test", "unit test"],
        "documentation": ["documentation", "docs"],
        "docs": ["documentation", "docs"],
        "readme": ["readme"],
        "def": ["def"],
    }

    def __init__(self, canonicalizer: KeyTermCanonicalizer | None = None) -> None:
        """Store the canonicalizer, creating a default one when none is given.

        Args:
            canonicalizer: Object providing ``canonicalize(token)`` and
                ``aliases()``. When ``None``, a ``KeyTermCanonicalizer()``
                is constructed.
        """
        # Compare against None explicitly: a caller-supplied canonicalizer
        # that happens to be falsy must not be silently replaced.
        if canonicalizer is None:
            canonicalizer = KeyTermCanonicalizer()
        self._canonicalizer = canonicalizer

    def expand(self, text: str) -> list[str]:
        """Return the English expansions triggered by the words in *text*.

        Args:
            text: Free-form input; ``None`` or an empty string yields ``[]``.

        Returns:
            Expansion terms in first-seen order, without duplicates.
        """
        expansions: list[str] = []
        lowered = (text or "").lower()
        for token in _WORD_RE.findall(lowered):
            # Fall back to the raw token when the canonicalizer returns
            # a falsy result for it.
            canonical = self._canonicalizer.canonicalize(token) or token
            self._extend(expansions, self._CANONICAL_MAP.get(canonical, []))
            self._extend(expansions, self._ENGLISH_SOURCES.get(token, []))
        # Substring check catches the phrase even when no single token is a
        # map key (e.g. "unit testing"). "unit tests" already contains
        # "unit test", so a single check covers both spellings.
        if "unit test" in lowered:
            self._extend(expansions, self._ENGLISH_SOURCES["test"])
        return expansions

    def key_terms(self) -> tuple[str, ...]:
        """Return the keys of ``_CANONICAL_MAP`` in definition order."""
        return tuple(self._CANONICAL_MAP)

    def all_literal_terms(self) -> tuple[str, ...]:
        """Return every known literal term, sorted and de-duplicated.

        The result combines the canonicalizer's aliases with the keys and
        the lowercased expansion values of both mapping tables.
        """
        values = set(self._canonicalizer.aliases())
        for mapping in (self._CANONICAL_MAP, self._ENGLISH_SOURCES):
            values.update(mapping)
            for targets in mapping.values():
                values.update(target.lower() for target in targets)
        return tuple(sorted(values))

    def _extend(self, result: list[str], values: list[str]) -> None:
        """Append to *result*, in place, each of *values* not already in it."""
        for value in values:
            if value not in result:
                result.append(value)
|