Files
agent/app/modules/rag/intent_router_v2/term_mapping.py

68 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import re
from app.modules.rag.intent_router_v2.normalization_terms import KeyTermCanonicalizer
# Matches one word-like token: a run of Latin letters, Cyrillic letters and
# hyphens. "Ё"/"ё" are listed explicitly because they lie outside the contiguous
# А-я code-point range; the trailing "-" in the class is a literal hyphen.
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё-]+")
class RuEnTermMapper:
    """Map Russian and English programming terms to English expansion terms.

    Every word found in the input text is looked up in two tables:

    * ``_CANONICAL_MAP`` is keyed by the canonical form of the word, as
      produced by the injected canonicalizer (the raw lower-cased word is used
      when the canonicalizer returns a falsy value);
    * ``_ENGLISH_SOURCES`` is keyed by the raw lower-cased word.
    """

    # Canonical term -> English expansions, in the order they are emitted.
    _CANONICAL_MAP = {
        "класс": ["class"],
        "метод": ["method"],
        "функция": ["function", "def"],
        "модуль": ["module"],
        "пакет": ["package"],
        "файл": ["file"],
        "тест": ["test", "unit test"],
        "документация": ["documentation", "docs"],
        "readme": ["readme"],
    }

    # Raw lower-cased English word -> English expansions.
    _ENGLISH_SOURCES = {
        "class": ["class"],
        "method": ["method"],
        "function": ["function", "def"],
        "module": ["module"],
        "package": ["package"],
        "file": ["file"],
        "test": ["test", "unit test"],
        "tests": ["test", "unit test"],
        "documentation": ["documentation", "docs"],
        "docs": ["documentation", "docs"],
        "readme": ["readme"],
        "def": ["def"],
    }

    def __init__(self, canonicalizer: KeyTermCanonicalizer | None = None) -> None:
        """Store the canonicalizer used to normalize tokens.

        Args:
            canonicalizer: Object providing ``canonicalize(token)`` and
                ``aliases()``. A default ``KeyTermCanonicalizer`` is created
                when ``None`` is passed.
        """
        # Compare against None explicitly: a plain ``or`` would silently
        # replace a caller-supplied canonicalizer that evaluates as falsy.
        self._canonicalizer = (
            canonicalizer if canonicalizer is not None else KeyTermCanonicalizer()
        )

    def expand(self, text: str) -> list[str]:
        """Return the English expansions for every known term found in *text*.

        Args:
            text: Free-form text; ``None`` or an empty string yields ``[]``.

        Returns:
            Expansion terms without duplicates, in first-seen order.
        """
        lowered = (text or "").lower()
        expansions: list[str] = []
        for token in _WORD_RE.findall(lowered):
            # Fall back to the raw token when the canonicalizer has no mapping.
            canonical = self._canonicalizer.canonicalize(token) or token
            self._extend(expansions, self._CANONICAL_MAP.get(canonical, []))
            self._extend(expansions, self._ENGLISH_SOURCES.get(token, []))
        # The phrase spans two tokens, so it is detected on the whole string.
        # This substring test also covers "unit tests".
        if "unit test" in lowered:
            self._extend(expansions, self._ENGLISH_SOURCES["test"])
        return expansions

    def key_terms(self) -> tuple[str, ...]:
        """Return the keys of the canonical map in definition order."""
        return tuple(self._CANONICAL_MAP)

    def all_literal_terms(self) -> tuple[str, ...]:
        """Return every known literal term as a sorted, de-duplicated tuple.

        The result combines the canonicalizer's aliases (taken as-is) with the
        keys and lower-cased expansion values of both lookup tables.
        """
        values = set(self._canonicalizer.aliases())
        for table in (self._CANONICAL_MAP, self._ENGLISH_SOURCES):
            values.update(table)
            for targets in table.values():
                values.update(target.lower() for target in targets)
        return tuple(sorted(values))

    def _extend(self, result: list[str], values: list[str]) -> None:
        """Append to *result*, in place, each of *values* not already in it."""
        for value in values:
            if value not in result:
                result.append(value)