58 lines
2.1 KiB
Python
58 lines
2.1 KiB
Python
from __future__ import annotations
|
||
|
||
import re
|
||
|
||
# --- Tokens that must survive normalization untouched -----------------------

# Inline code span: a backtick, any run of non-backtick characters, a backtick.
CODE_SPAN_RE = re.compile(r"`[^`]*`")

# Optionally rooted/relative path ending in one of the listed file extensions;
# the extension is matched case-insensitively.
FILE_PATH_RE = re.compile(
    r"(?<!\w)(?:/|\./|\.\./)?(?:[\w.-]+/)*[\w.-]+\.(?:py|md|txt|rst|yaml|yml|json|toml|ini)(?!\w)",
    flags=re.IGNORECASE,
)

# Two or more identifiers joined by dots.
DOTTED_IDENT_RE = re.compile(r"(?<!\w)(?:[A-Za-z_][A-Za-z0-9_]*\.)+[A-Za-z_][A-Za-z0-9_]*(?!\w)")

# At least two capitalized humps, each an uppercase letter plus lowercase/digits.
CAMEL_RE = re.compile(r"(?<!\w)[A-Z][a-z0-9]+(?:[A-Z][a-z0-9]+)+(?!\w)")

# Lowercase words joined by at least one underscore.
SNAKE_RE = re.compile(r"(?<!\w)[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?!\w)")

# --- Punctuation and whitespace clean-up ------------------------------------

# Whitespace directly before one of , . : ; ? ! (the mark is captured as group 1).
SPACE_BEFORE_PUNCT_RE = re.compile(r"\s+([,.:;?!])")

# One of , . : ; ? ! directly followed by a quote, an opening bracket, or a
# Latin/Cyrillic letter (the mark is captured as group 1).
# NOTE(review): straight quotes are in the lookahead, so a closing quote right
# after punctuation matches as well - confirm this is intended.
SPACE_AFTER_PUNCT_RE = re.compile(r"([,.:;?!])(?=(?:[\"'(\[A-Za-zА-ЯЁа-яё]))")

# Any run of whitespace.
WS_RE = re.compile(r"\s+")

# Typographic quotes mapped to their plain ASCII counterparts.
QUOTE_TRANSLATION = str.maketrans(
    {
        "«": '"',
        "»": '"',
        "“": '"',
        "”": '"',
        "‘": "'",
        "’": "'",
    }
)
|
||
|
||
|
||
class QueryNormalizer:
    """Turn raw text into a deterministic canonical string, adding nothing to it."""

    def normalize(self, raw: str) -> str:
        """Return the canonical form of ``raw``.

        A falsy ``raw`` is treated as the empty string. Fragments matched by
        the protection patterns are swapped for placeholders first, so the
        clean-up steps (whitespace collapsing, quote translation, punctuation
        spacing) never touch them; they are put back at the very end.
        """
        vault = _ProtectedText()
        masked = self._protect(raw or "", vault)

        cleaned = self._collapse_whitespace(masked).translate(QUOTE_TRANSLATION)
        cleaned = SPACE_BEFORE_PUNCT_RE.sub(r"\1", cleaned)
        cleaned = SPACE_AFTER_PUNCT_RE.sub(r"\1 ", cleaned)

        # Punctuation spacing can leave new whitespace runs, so collapse again
        # before the protected fragments come back.
        return vault.restore(self._collapse_whitespace(cleaned))

    def _protect(self, text: str, protected: "_ProtectedText") -> str:
        """Replace every protected fragment in ``text`` with a placeholder.

        The patterns run one after another in a fixed order, each on the
        output of the previous one; ``protected`` records every fragment.
        """
        ordered_patterns = (
            CODE_SPAN_RE,
            FILE_PATH_RE,
            DOTTED_IDENT_RE,
            CAMEL_RE,
            SNAKE_RE,
        )
        masked = text
        for regex in ordered_patterns:
            masked = regex.sub(protected.replace, masked)
        return masked

    def _collapse_whitespace(self, text: str) -> str:
        """Squeeze each whitespace run to one space and trim both ends."""
        squeezed = WS_RE.sub(" ", text)
        return squeezed.strip()
|
||
|
||
|
||
class _ProtectedText:
    """Swap protected fragments for opaque placeholders and put them back later.

    ``replace`` has the signature of a ``re.sub`` replacement callback: it
    stores the matched text and returns a unique ``@@P<n>@@`` placeholder.
    ``restore`` substitutes the stored text back for each placeholder.
    """

    # Shape of every placeholder handed out by ``replace``.
    _PLACEHOLDER_RE = re.compile(r"@@P\d+@@")

    def __init__(self) -> None:
        # Maps placeholder -> original fragment, in order of creation.
        self._items: dict[str, str] = {}
        # Number used for the next placeholder.
        self._index = 0

    def replace(self, match: re.Match[str]) -> str:
        """Remember the matched text and return the placeholder standing in for it."""
        placeholder = f"@@P{self._index}@@"
        self._items[placeholder] = match.group(0)
        self._index += 1
        return placeholder

    def restore(self, text: str) -> str:
        """Return ``text`` with every known placeholder replaced by its fragment.

        The substitution is a single left-to-right pass, so restored fragments
        are never rescanned: a fragment that itself contains placeholder-like
        text (for example a code span holding ``@@P1@@``) comes back verbatim
        instead of being rewritten by a later placeholder. Placeholder-shaped
        text that was never issued by ``replace`` is left unchanged.
        """
        items = self._items
        # A callable replacement also keeps backslashes in fragments literal.
        return self._PLACEHOLDER_RE.sub(
            lambda found: items.get(found.group(0), found.group(0)),
            text,
        )
|