Фиксация изменений
This commit is contained in:
67
app/modules/rag/intent_router_v2/term_mapping.py
Normal file
67
app/modules/rag/intent_router_v2/term_mapping.py
Normal file
@@ -0,0 +1,67 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from app.modules.rag.intent_router_v2.normalization_terms import KeyTermCanonicalizer
|
||||
|
||||
# Matches maximal runs of Latin letters, Cyrillic letters (including Ё/ё) and
# hyphens; digits, underscores, whitespace and punctuation act as separators.
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё-]+")
|
||||
|
||||
|
||||
class RuEnTermMapper:
    """Expand Russian and English code-related terms into English search terms.

    Two lookup tables drive the expansion: ``_CANONICAL_MAP`` is keyed by the
    canonical (mostly Russian) form produced by the injected canonicalizer,
    and ``_ENGLISH_SOURCES`` is keyed by the raw lower-cased English token.
    """

    # Canonical term -> English expansions.
    _CANONICAL_MAP = {
        "класс": ["class"],
        "метод": ["method"],
        "функция": ["function", "def"],
        "модуль": ["module"],
        "пакет": ["package"],
        "файл": ["file"],
        "тест": ["test", "unit test"],
        "документация": ["documentation", "docs"],
        "readme": ["readme"],
    }

    # Raw English token -> English expansions (includes plural/short forms).
    _ENGLISH_SOURCES = {
        "class": ["class"],
        "method": ["method"],
        "function": ["function", "def"],
        "module": ["module"],
        "package": ["package"],
        "file": ["file"],
        "test": ["test", "unit test"],
        "tests": ["test", "unit test"],
        "documentation": ["documentation", "docs"],
        "docs": ["documentation", "docs"],
        "readme": ["readme"],
        "def": ["def"],
    }

    def __init__(self, canonicalizer: KeyTermCanonicalizer | None = None) -> None:
        """Store the canonicalizer, creating a default one when none is given.

        Args:
            canonicalizer: Object used to canonicalize tokens; any falsy value
                is replaced by a fresh ``KeyTermCanonicalizer()``.
        """
        self._canonicalizer = canonicalizer if canonicalizer else KeyTermCanonicalizer()

    def expand(self, text: str) -> list[str]:
        """Return the English expansions for terms found in ``text``.

        Args:
            text: Free-form query text; a falsy value is treated as ``""``.

        Returns:
            Expansions in first-seen order, without duplicates.
        """
        normalized = (text or "").lower()
        collected: list[str] = []
        by_canonical = self._CANONICAL_MAP
        by_english = self._ENGLISH_SOURCES

        for word in _WORD_RE.findall(normalized):
            # Fall back to the raw token when the canonicalizer yields nothing.
            key = self._canonicalizer.canonicalize(word) or word
            self._extend(collected, by_canonical.get(key, []))
            self._extend(collected, by_english.get(word, []))

        # Phrase-level check on the whole lower-cased text, done once after
        # the per-token pass.
        if any(phrase in normalized for phrase in ("unit test", "unit tests")):
            self._extend(collected, by_english["test"])

        return collected

    def key_terms(self) -> tuple[str, ...]:
        """Return the canonical-map keys in their definition order."""
        return tuple(self._CANONICAL_MAP)

    def all_literal_terms(self) -> tuple[str, ...]:
        """Return every known literal term as a sorted, de-duplicated tuple.

        The result combines the canonicalizer's aliases, the keys of both
        lookup tables, and the lower-cased expansion targets of both tables.
        """
        literals = set(self._canonicalizer.aliases())
        for table in (self._CANONICAL_MAP, self._ENGLISH_SOURCES):
            literals.update(table)
            for expansions in table.values():
                literals.update(item.lower() for item in expansions)
        return tuple(sorted(literals))

    def _extend(self, result: list[str], values: list[str]) -> None:
        """Append to ``result`` (in place) each value it does not yet contain."""
        for candidate in values:
            if candidate in result:
                continue
            result.append(candidate)
|
||||
Reference in New Issue
Block a user