Files
Telegram_ShBot/filters.py
2026-05-13 23:38:18 +02:00

94 lines
4.7 KiB
Python

"""Charset helpers, blacklist censoring, link detection."""
import html
import random
import re
import bot_state as state
# Alternative alphabets used to obfuscate censored words. Each string lists
# stylised glyphs for the letters A..Z in order; censor_word() indexes into the
# chosen string with ``ord(letter) - ord("a")``.
_CHARSET_1 = "🄰🄱🄲🄳🄴🄵🄶🄷🄸🄹🄺🄻🄼🄽🄾🄿🅀🅁🅂🅃🅄🅅🅆🅇🅈🅉"
_CHARSET_2 = "🅐🅑🅒🅓🅔🅕🅖🅗🅘🅙🅚🅛🅜🅝🅞🅟🅠🅡🅢🅣🅤🅥🅦🅧🅨🅩"
_CHARSET_3 = "ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ"
# NOTE(review): these look like regional-indicator symbols; adjacent pairs may
# render as flag emoji in some clients — confirm that is acceptable.
_CHARSET_4 = "🇦🇧🇨🇩🇪🇫🇬🇭🇮🇯🇰🇱🇲🇳🇴🇵🇶🇷🇸🇹🇺🇻🇼🇽🇾🇿"
# Pool from which censor_word() picks one charset at random per call.
_CHARSETS = [_CHARSET_1, _CHARSET_2, _CHARSET_3, _CHARSET_4]
# Fixed single-character substitutions. censor_word() lower-cases each character
# and checks this table first, before falling back to the random charset.
_REPLACEMENT: dict[str, str] = {
"a": "@", "e": "3",
"i": "1", "o": "0",
}
# Compiled, case-insensitive patterns; contains_link() reports a link as soon as
# any one of them is found anywhere in the text.
LINK_PATTERNS: list[re.Pattern] = [
# Explicit "http://" or "https://" scheme.
re.compile(r"https?://", re.IGNORECASE),
# "www" followed by a dot, tolerating whitespace in between.
re.compile(r"www\s*\.", re.IGNORECASE),
# "t", then one of "[", "(" or ".", then "me/" with optional whitespace
# between the pieces (e.g. "t.me/", "t (me/").
re.compile(r"t\s*[\[\(\.]\s*me\s*/", re.IGNORECASE),
# "t" separated from ".me" by at least one whitespace character (e.g. "t .me").
re.compile(r"t\s+\.\s*me", re.IGNORECASE),
# Obfuscated dots between two words: "[.]", "(.)" and "(dot)", each allowing
# whitespace inside and around the brackets.
re.compile(r"\w+\s*\[\s*\.\s*\]\s*\w+", re.IGNORECASE),
re.compile(r"\w+\s*\(\s*\.\s*\)\s*\w+", re.IGNORECASE),
re.compile(r"\w+\s*\(\s*dot\s*\)\s*\w+", re.IGNORECASE),
# A literal dot followed by one of the listed suffixes and a word boundary.
# Several suffixes appear more than once in the alternation (e.g. "systems",
# "media"); the repeats do not change what matches.
# NOTE(review): nothing is required before the dot, so text such as "ok.so"
# (no space after a full stop) also matches — confirm this is intended.
re.compile(
r"\.(?:com|net|org|info|biz|name|pro|xyz|online|site|website|space|store|shop|blog|tech|dev|app|cloud|"
r"digital|solutions|systems|services|agency|group|company|center|world|global|today|live|life|news|media|"
r"network|social|community|zone|one|link|io|ai|co|ly|me|gg|tv|to|sh|fm|ws|cc|so|vc|it|page|software|tools|"
r"design|studio|lab|labs|build|engineering|data|systems|academy|care|finance|capital|fund|money|loan|loans|"
r"credit|insurance|investments|tax|accountants|law|legal|attorney|consulting|partners|ventures|holdings|"
r"management|marketing|media|press|events|productions|photos|photography|pictures|video|film|music|audio|"
r"games|game|play|fun|chat|dating|love|fans|family|kids|school|education|college|university|training|"
r"courses|institute|health|clinic|hospital|doctor|dentist|fitness|gym|yoga|diet|food|restaurant|cafe|"
r"coffee|bar|beer|wine|recipes|kitchen|cooking|fashion|style|clothing|shoes|jewelry|beauty|hair|makeup|"
r"salon|travel|trips|tours|vacations|holiday|flights|tickets|hotel|hostel|rentals|cars|car|auto|"
r"motorcycles|bike|bikes|taxi|delivery|express|logistics|shipping|realty|realestate|homes|house|rent|"
r"apartments|property|construction|builders|contractors|repair|cleaning|security|energy|solar|green|eco|"
r"farm|garden|flowers|pets|pet|dog|cat|animals|science|research|space|earth|energy|finance|bank|exchange|"
r"trade|trading|market|markets|crypto|bitcoin|eth|nft|art|gallery|design|graphics|print|books|library|"
r"wiki|guide|help|support|tools|download|software|app|cloud|host|hosting|server|email|mail|tech|network|"
r"systems|solutions|world|global|international|express|plus|pro|max|now|top|best|cool|fun|zone|land|city|"
r"place|town|country|uk|us|ca|au|de|fr|ru|cn|jp|kr|in|br|za|es|it|nl|se|no|fi|dk|pl|ch|be|at|ie|nz|mx|"
r"ar|cl|co|pe|pt|gr|tr|ae|sa|il|sg|hk|id|my|th|vn|ph|pk|bd|ng|ke|gh)\b",
re.IGNORECASE,
),
]
def censor_word(word: str) -> str:
    """Obfuscate *word* and return it split into space-separated groups of three.

    One charset is drawn at random from ``_CHARSETS`` per call. Every character
    is lower-cased for lookup and then converted as follows:

    * a character with an entry in ``_REPLACEMENT`` becomes that entry;
    * any other alphabetic character whose offset from ``"a"`` falls inside
      the chosen charset becomes the glyph at that offset;
    * everything else is kept exactly as it appeared in *word*.

    The converted characters are grouped three at a time and the groups are
    joined with single spaces.
    """
    glyphs = random.choice(_CHARSETS)

    def _convert(symbol: str) -> str:
        folded = symbol.lower()
        if folded in _REPLACEMENT:
            return _REPLACEMENT[folded]
        if not folded.isalpha():
            return symbol
        offset = ord(folded) - ord("a")
        # Letters outside a..z (e.g. non-Latin alphabets) have no glyph.
        if offset < 0 or offset >= len(glyphs):
            return symbol
        return glyphs[offset]

    converted = [_convert(symbol) for symbol in word]

    groups = []
    for begin in range(0, len(converted), 3):
        groups.append("".join(converted[begin:begin + 3]))
    return " ".join(groups)
def process_blacklisted_message(text: str) -> tuple[str, bool]:
    """Hide blacklisted words in *text* and return HTML-safe output.

    Every case-insensitive occurrence of an entry in
    ``state.blacklisted_words`` is replaced by its ``censor_word`` form wrapped
    in a ``<tg-spoiler>`` tag. Entries are matched as plain substrings
    (``re.escape`` is applied), and when occurrences overlap the earliest one
    wins, with the longest preferred among those starting at the same index.

    Args:
        text: Raw message text; may be empty or ``None``.

    Returns:
        ``(html_text, censored)`` where ``html_text`` is the message with all
        uncensored parts and all censored replacements passed through
        ``html.escape``, and ``censored`` is ``True`` only if at least one
        blacklisted occurrence was replaced.
    """
    if not state.blacklisted_words or not text:
        return html.escape(text or ""), False

    matches: list[tuple[int, int, str]] = []
    for word in state.blacklisted_words:
        # An empty entry matches at every position and a whitespace-only entry
        # matches every space; both would flag every message and emit empty
        # spoiler tags, so they are ignored.
        if not word or not word.strip():
            continue
        for m in re.finditer(re.escape(word), text, re.IGNORECASE):
            matches.append((m.start(), m.end(), m.group()))

    if not matches:
        return html.escape(text), False

    # Earliest start first; for equal starts the longest match comes first so
    # it is the one kept by the overlap filter below.
    matches.sort(key=lambda x: (x[0], -(x[1] - x[0])))

    filtered: list[tuple[int, int, str]] = []
    last_end = 0
    for start, end, w in matches:
        if start >= last_end:
            filtered.append((start, end, w))
            last_end = end

    parts: list[str] = []
    pos = 0
    for start, end, w in filtered:
        parts.append(html.escape(text[pos:start]))
        censored = " ".join(censor_word(tok) for tok in w.split())
        # censor_word() leaves non-letters such as "<", ">" and "&" untouched,
        # so the replacement must be escaped like the rest of the message.
        parts.append(f"<tg-spoiler>{html.escape(censored)}</tg-spoiler>")
        pos = end
    parts.append(html.escape(text[pos:]))
    return "".join(parts), True
def contains_link(text: str) -> bool:
    """Return ``True`` if any pattern in ``LINK_PATTERNS`` is found in *text*.

    Empty or ``None`` text never counts as containing a link.
    """
    if text:
        for pattern in LINK_PATTERNS:
            if pattern.search(text) is not None:
                return True
    return False