"""Charset helpers, blacklist censoring, link detection."""
|
|
import html
|
|
import random
|
|
import re
|
|
import bot_state as state
|
|
|
|
# Decorative Unicode alphabets used as stand-ins for ASCII letters when
# censoring. Every string lists 26 glyphs in A-Z order, so a letter's offset
# from "a" is also its index into the string.
_CHARSET_1 = "🄰🄱🄲🄳🄴🄵🄶🄷🄸🄹🄺🄻🄼🄽🄾🄿🅀🅁🅂🅃🅄🅅🅆🅇🅈🅉"  # squared capitals
_CHARSET_2 = "🅐🅑🅒🅓🅔🅕🅖🅗🅘🅙🅚🅛🅜🅝🅞🅟🅠🅡🅢🅣🅤🅥🅦🅧🅨🅩"  # negative circled capitals
_CHARSET_3 = "ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ"  # circled capitals
_CHARSET_4 = "🇦🇧🇨🇩🇪🇫🇬🇭🇮🇯🇰🇱🇲🇳🇴🇵🇶🇷🇸🇹🇺🇻🇼🇽🇾🇿"  # regional indicator symbols

# All alphabets that a censoring pass may pick from.
_CHARSETS = [_CHARSET_1, _CHARSET_2, _CHARSET_3, _CHARSET_4]
|
|
|
|
# Fixed look-alike substitutions, keyed by lowercase letter. censor_word
# consults this table first, so these letters never reach the charset lookup.
_REPLACEMENT: dict[str, str] = {
    "a": "@",
    "e": "3",
    "i": "1",
    "o": "0",
}
|
|
|
|
# Case-insensitive patterns; a text counts as containing a link as soon as any
# one of them is found anywhere in it.
LINK_PATTERNS: list[re.Pattern[str]] = [
    # Explicit "http://" or "https://" scheme.
    re.compile(r"https?://", re.IGNORECASE),
    # "www" followed by a dot, optionally separated by whitespace.
    re.compile(r"www\s*\.", re.IGNORECASE),
    # "t", then one of "[", "(" or ".", then "me" and a slash (e.g. "t.me/"),
    # with optional whitespace between the pieces.
    re.compile(r"t\s*[\[\(\.]\s*me\s*/", re.IGNORECASE),
    # Spaced-out variant: "t", at least one whitespace, ".", then "me".
    re.compile(r"t\s+\.\s*me", re.IGNORECASE),
    # Obfuscated dots between two words: "name[.]tld", "name(.)tld", "name(dot)tld".
    re.compile(r"\w+\s*\[\s*\.\s*\]\s*\w+", re.IGNORECASE),
    re.compile(r"\w+\s*\(\s*\.\s*\)\s*\w+", re.IGNORECASE),
    re.compile(r"\w+\s*\(\s*dot\s*\)\s*\w+", re.IGNORECASE),
    # A dot directly followed by a listed domain suffix that ends at a word
    # boundary. Some suffixes occur more than once in the alternation; that is
    # redundant but does not change what matches.
    re.compile(
        r"\.(?:com|net|org|info|biz|name|pro|xyz|online|site|website|space|store|shop|blog|tech|dev|app|cloud|"
        r"digital|solutions|systems|services|agency|group|company|center|world|global|today|live|life|news|media|"
        r"network|social|community|zone|one|link|io|ai|co|ly|me|gg|tv|to|sh|fm|ws|cc|so|vc|it|page|software|tools|"
        r"design|studio|lab|labs|build|engineering|data|systems|academy|care|finance|capital|fund|money|loan|loans|"
        r"credit|insurance|investments|tax|accountants|law|legal|attorney|consulting|partners|ventures|holdings|"
        r"management|marketing|media|press|events|productions|photos|photography|pictures|video|film|music|audio|"
        r"games|game|play|fun|chat|dating|love|fans|family|kids|school|education|college|university|training|"
        r"courses|institute|health|clinic|hospital|doctor|dentist|fitness|gym|yoga|diet|food|restaurant|cafe|"
        r"coffee|bar|beer|wine|recipes|kitchen|cooking|fashion|style|clothing|shoes|jewelry|beauty|hair|makeup|"
        r"salon|travel|trips|tours|vacations|holiday|flights|tickets|hotel|hostel|rentals|cars|car|auto|"
        r"motorcycles|bike|bikes|taxi|delivery|express|logistics|shipping|realty|realestate|homes|house|rent|"
        r"apartments|property|construction|builders|contractors|repair|cleaning|security|energy|solar|green|eco|"
        r"farm|garden|flowers|pets|pet|dog|cat|animals|science|research|space|earth|energy|finance|bank|exchange|"
        r"trade|trading|market|markets|crypto|bitcoin|eth|nft|art|gallery|design|graphics|print|books|library|"
        r"wiki|guide|help|support|tools|download|software|app|cloud|host|hosting|server|email|mail|tech|network|"
        r"systems|solutions|world|global|international|express|plus|pro|max|now|top|best|cool|fun|zone|land|city|"
        r"place|town|country|uk|us|ca|au|de|fr|ru|cn|jp|kr|in|br|za|es|it|nl|se|no|fi|dk|pl|ch|be|at|ie|nz|mx|"
        r"ar|cl|co|pe|pt|gr|tr|ae|sa|il|sg|hk|id|my|th|vn|ph|pk|bd|ng|ke|gh)\b",
        re.IGNORECASE,
    ),
]
|
|
|
|
|
|
def censor_word(word: str) -> str:
    """Return an obfuscated rendering of *word*.

    Characters whose lowercase form is a key of ``_REPLACEMENT`` become the
    mapped substitute. Any other letter ``a``-``z`` (either case) is replaced
    by the glyph at the same alphabet position in one charset chosen at
    random from ``_CHARSETS`` for this call. Every other character is kept
    unchanged. The result is split into groups of three characters joined by
    single spaces.

    Args:
        word: Text to obfuscate.

    Returns:
        The obfuscated text; an empty string for empty input.
    """
    charset = random.choice(_CHARSETS)
    out: list[str] = []
    for char in word:
        lower = char.lower()
        if lower in _REPLACEMENT:
            out.append(_REPLACEMENT[lower])
            continue
        # ord() only accepts one code point, and lower() can expand a
        # character into several, so guard before computing the offset.
        if len(lower) == 1 and lower.isalpha():
            offset = ord(lower) - ord("a")
        else:
            offset = -1
        # Letters outside a-z have no glyph in the charset; keep them as-is.
        out.append(charset[offset] if 0 <= offset < len(charset) else char)
    return " ".join("".join(out[i:i + 3]) for i in range(0, len(out), 3))
|
|
|
|
|
|
def process_blacklisted_message(text: str) -> tuple[str, bool]:
    """Escape *text* for HTML and hide every blacklisted word in a spoiler.

    Each entry of ``state.blacklisted_words`` is searched as a literal,
    case-insensitive substring. When matches overlap, the one that starts
    first is kept (the longest one if several start at the same position).
    Every kept match is obfuscated token by token with ``censor_word``,
    HTML-escaped and wrapped in ``<tg-spoiler>`` tags; the text around the
    matches is passed through ``html.escape``.

    Args:
        text: Raw message text; may be empty or ``None``.

    Returns:
        A ``(markup, censored)`` pair: the escaped markup and ``True`` only
        if at least one blacklisted word was hidden.
    """
    if not state.blacklisted_words or not text:
        return html.escape(text or ""), False

    matches: list[tuple[int, int, str]] = []
    for word in state.blacklisted_words:
        # An empty entry would match at every position and a whitespace-only
        # one censors to nothing; both would only produce empty spoiler tags.
        if not word or not word.strip():
            continue
        pattern = re.escape(word)
        matches.extend(
            (m.start(), m.end(), m.group())
            for m in re.finditer(pattern, text, re.IGNORECASE)
        )
    if not matches:
        return html.escape(text), False

    # Earliest start first; among equal starts the longest match first, so a
    # longer blacklist entry wins over a shorter one it contains.
    matches.sort(key=lambda span: (span[0], span[0] - span[1]))

    parts: list[str] = []
    pos = 0
    for start, end, matched in matches:
        if start < pos:
            # Overlaps a match that has already been censored.
            continue
        parts.append(html.escape(text[pos:start]))
        censored = " ".join(censor_word(tok) for tok in matched.split())
        # censor_word leaves non-letters untouched, so the censored text can
        # still contain "<", ">" or "&". Escape after censoring so the
        # entities are not split by censor_word's three-character grouping.
        parts.append(f"<tg-spoiler>{html.escape(censored)}</tg-spoiler>")
        pos = end
    parts.append(html.escape(text[pos:]))
    return "".join(parts), True
|
|
|
|
|
|
def contains_link(text: str) -> bool:
    """Return ``True`` if any pattern in ``LINK_PATTERNS`` is found in *text*.

    Args:
        text: Message text to scan; empty or ``None`` yields ``False``.
    """
    if not text:
        return False
    return any(pattern.search(text) for pattern in LINK_PATTERNS)