Фиксация изменений

This commit is contained in:
2026-03-05 11:03:17 +03:00
parent 1ef0b4d68c
commit 417b8b6f72
261 changed files with 8215 additions and 332 deletions

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
import re
# --- Spans that must pass through normalization untouched -------------------

# Inline code span: a pair of backticks with no backtick in between.
CODE_SPAN_RE = re.compile(r"`[^`]*`")

# File path: optional "/", "./" or "../" prefix, any number of directory
# segments, then a file name ending in one of the listed extensions
# (matched case-insensitively).
FILE_PATH_RE = re.compile(
    r"(?<!\w)(?:/|\./|\.\./)?(?:[\w.-]+/)*[\w.-]+\.(?:py|md|txt|rst|yaml|yml|json|toml|ini)(?!\w)",
    re.IGNORECASE,
)

# Dotted identifier: two or more ASCII identifiers joined by dots.
DOTTED_IDENT_RE = re.compile(r"(?<!\w)(?:[A-Za-z_][A-Za-z0-9_]*\.)+[A-Za-z_][A-Za-z0-9_]*(?!\w)")

# CamelCase word: at least two humps, each an uppercase letter followed by
# one or more lowercase letters or digits.
CAMEL_RE = re.compile(r"(?<!\w)[A-Z][a-z0-9]+(?:[A-Z][a-z0-9]+)+(?!\w)")

# snake_case word: lowercase start and at least one underscore-joined part.
SNAKE_RE = re.compile(r"(?<!\w)[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?!\w)")

# --- Punctuation and whitespace clean-up ------------------------------------

# Whitespace directly before , . : ; ? ! (the mark itself is captured).
SPACE_BEFORE_PUNCT_RE = re.compile(r"\s+([,.:;?!])")

# One of , . : ; ? ! immediately followed by a quote, an opening paren or
# bracket, or a Latin/Cyrillic letter (i.e. the separating space is missing).
SPACE_AFTER_PUNCT_RE = re.compile(r"([,.:;?!])(?=(?:[\"'(\[A-Za-zА-ЯЁа-яё]))")

# Any run of whitespace.
WS_RE = re.compile(r"\s+")

# Typographic quotes -> plain ASCII quotes. The keys are written as escapes so
# they cannot be lost to an encoding or copy/paste mishap: every string key
# passed to str.maketrans must be exactly one character long, otherwise it
# raises ValueError while the module is being imported.
QUOTE_TRANSLATION = str.maketrans(
    {
        "\u00ab": '"',  # « left-pointing guillemet
        "\u00bb": '"',  # » right-pointing guillemet
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
    }
)
class QueryNormalizer:
    """Turns raw text into a deterministic canonical string, adding nothing to it.

    Code spans, file paths, dotted identifiers, CamelCase and snake_case words
    are swapped for placeholders before clean-up and put back afterwards, so
    the whitespace, quote and punctuation rules never rewrite them.
    """

    def normalize(self, raw: str) -> str:
        """Return the canonical form of *raw*; a falsy *raw* is treated as ""."""
        vault = _ProtectedText()
        masked = self._protect(raw if raw else "", vault)

        # Whitespace is squeezed before the punctuation rules and again after
        # them, because inserting spaces after punctuation can leave a
        # trailing blank.
        squeezed = self._collapse_whitespace(masked)
        plain_quotes = squeezed.translate(QUOTE_TRANSLATION)
        tightened = SPACE_BEFORE_PUNCT_RE.sub(r"\1", plain_quotes)
        spaced = SPACE_AFTER_PUNCT_RE.sub(r"\1 ", tightened)

        return vault.restore(self._collapse_whitespace(spaced))

    def _protect(self, text: str, protected: "_ProtectedText") -> str:
        """Swap every protected span in *text* for a placeholder kept in *protected*."""
        # Patterns run one after another in this fixed order; each later
        # pattern sees the placeholders left behind by the earlier ones.
        masking_order = (CODE_SPAN_RE, FILE_PATH_RE, DOTTED_IDENT_RE, CAMEL_RE, SNAKE_RE)
        masked = text
        for regex in masking_order:
            masked = regex.sub(protected.replace, masked)
        return masked

    def _collapse_whitespace(self, text: str) -> str:
        """Replace each whitespace run with one space and strip both ends."""
        single_spaced = WS_RE.sub(" ", text)
        return single_spaced.strip()
class _ProtectedText:
    """Stash for text spans that must come out of a transformation unchanged.

    ``replace`` has the signature of an ``re.sub`` callback: it records the
    matched text and returns a unique ``@@P<n>@@`` placeholder for it.
    ``restore`` later puts the recorded text back in place of each placeholder.
    """

    # Shape of the placeholders issued by ``replace``.
    _PLACEHOLDER_RE = re.compile(r"@@P\d+@@")

    def __init__(self) -> None:
        # placeholder -> original matched text
        self._items: dict[str, str] = {}
        # Number used for the next placeholder.
        self._index = 0

    def replace(self, match: re.Match[str]) -> str:
        """Record the text of *match* and return the placeholder standing in for it."""
        placeholder = f"@@P{self._index}@@"
        self._items[placeholder] = match.group(0)
        self._index += 1
        return placeholder

    def restore(self, text: str) -> str:
        """Return *text* with every issued placeholder replaced by its recorded text.

        The substitution is done in one pass over *text*, so restored values
        are never re-scanned: a protected span whose own text happens to look
        like a placeholder comes back verbatim. Placeholder-shaped tokens
        that this instance did not issue are left as they are.
        """
        items = self._items
        if not items:
            return text
        return self._PLACEHOLDER_RE.sub(
            lambda found: items.get(found.group(0), found.group(0)),
            text,
        )