"""Charset helpers, blacklist censoring, link detection."""

import html
import random
import re

import bot_state as state

_ALPHABET_SIZE = 26
_ORD_A = ord("a")


def _letter_run(first_code_point: int) -> str:
    """Return the 26 consecutive characters starting at *first_code_point*."""
    return "".join(
        chr(first_code_point + offset) for offset in range(_ALPHABET_SIZE)
    )


# Decorative A-Z alphabets, indexed by letter position (a=0 ... z=25).
# They are built from code points rather than typed as literals so that the
# tables cannot be damaged by a source-encoding round-trip; every table is
# exactly 26 characters long.
_CHARSET_1 = _letter_run(0x1F130)  # SQUARED LATIN CAPITAL LETTER A..Z
_CHARSET_2 = _letter_run(0x1F150)  # NEGATIVE CIRCLED LATIN CAPITAL LETTER A..Z
_CHARSET_3 = _letter_run(0x24B6)  # CIRCLED LATIN CAPITAL LETTER A..Z
_CHARSET_4 = _letter_run(0x1F1E6)  # REGIONAL INDICATOR SYMBOL LETTER A..Z
_CHARSETS = [_CHARSET_1, _CHARSET_2, _CHARSET_3, _CHARSET_4]

# Letters that are swapped for a look-alike symbol instead of a charset glyph.
_REPLACEMENT: dict[str, str] = {
    "a": "@",
    "e": "3",
    "i": "1",
    "o": "0",
}

# A text is treated as containing a link if ANY of these patterns matches.
LINK_PATTERNS: list[re.Pattern[str]] = [
    # Explicit scheme or "www." prefix.
    re.compile(r"https?://", re.IGNORECASE),
    re.compile(r"www\s*\.", re.IGNORECASE),
    # "t.me" written with optional spacing / bracket obfuscation.
    re.compile(r"t\s*[\[\(\.]\s*me\s*/", re.IGNORECASE),
    re.compile(r"t\s+\.\s*me", re.IGNORECASE),
    # Obfuscated dots: "name[.]tld", "name(.)tld", "name(dot)tld".
    re.compile(r"\w+\s*\[\s*\.\s*\]\s*\w+", re.IGNORECASE),
    re.compile(r"\w+\s*\(\s*\.\s*\)\s*\w+", re.IGNORECASE),
    re.compile(r"\w+\s*\(\s*dot\s*\)\s*\w+", re.IGNORECASE),
    # A dot immediately followed by one of the listed suffixes and a word
    # boundary.
    re.compile(
        r"\.(?:com|net|org|info|biz|name|pro|xyz|online|site|website|space|store|shop|blog|tech|dev|app|cloud|"
        r"digital|solutions|systems|services|agency|group|company|center|world|global|today|live|life|news|media|"
        r"network|social|community|zone|one|link|io|ai|co|ly|me|gg|tv|to|sh|fm|ws|cc|so|vc|it|page|software|tools|"
        r"design|studio|lab|labs|build|engineering|data|systems|academy|care|finance|capital|fund|money|loan|loans|"
        r"credit|insurance|investments|tax|accountants|law|legal|attorney|consulting|partners|ventures|holdings|"
        r"management|marketing|media|press|events|productions|photos|photography|pictures|video|film|music|audio|"
        r"games|game|play|fun|chat|dating|love|fans|family|kids|school|education|college|university|training|"
        r"courses|institute|health|clinic|hospital|doctor|dentist|fitness|gym|yoga|diet|food|restaurant|cafe|"
        r"coffee|bar|beer|wine|recipes|kitchen|cooking|fashion|style|clothing|shoes|jewelry|beauty|hair|makeup|"
        r"salon|travel|trips|tours|vacations|holiday|flights|tickets|hotel|hostel|rentals|cars|car|auto|"
        r"motorcycles|bike|bikes|taxi|delivery|express|logistics|shipping|realty|realestate|homes|house|rent|"
        r"apartments|property|construction|builders|contractors|repair|cleaning|security|energy|solar|green|eco|"
        r"farm|garden|flowers|pets|pet|dog|cat|animals|science|research|space|earth|energy|finance|bank|exchange|"
        r"trade|trading|market|markets|crypto|bitcoin|eth|nft|art|gallery|design|graphics|print|books|library|"
        r"wiki|guide|help|support|tools|download|software|app|cloud|host|hosting|server|email|mail|tech|network|"
        r"systems|solutions|world|global|international|express|plus|pro|max|now|top|best|cool|fun|zone|land|city|"
        r"place|town|country|uk|us|ca|au|de|fr|ru|cn|jp|kr|in|br|za|es|it|nl|se|no|fi|dk|pl|ch|be|at|ie|nz|mx|"
        r"ar|cl|co|pe|pt|gr|tr|ae|sa|il|sg|hk|id|my|th|vn|ph|pk|bd|ng|ke|gh)\b",
        re.IGNORECASE,
    ),
]


def censor_word(word: str) -> str:
    """Return an obfuscated rendering of *word*.

    One decorative charset is picked at random for the whole word. Each
    character is then mapped as follows (case-insensitively):

    * letters listed in ``_REPLACEMENT`` become their look-alike symbol;
    * other ASCII letters become the glyph at the same alphabet position in
      the chosen charset;
    * everything else is kept unchanged.

    The mapped characters are finally grouped three at a time and the groups
    are joined with single spaces.
    """
    charset = random.choice(_CHARSETS)
    mapped: list[str] = []
    for char in word:
        lower = char.lower()
        if lower in _REPLACEMENT:
            mapped.append(_REPLACEMENT[lower])
            continue
        # str.lower() may expand one character into several; ord() only
        # accepts a single character, so anything else is passed through.
        idx = ord(lower) - _ORD_A if len(lower) == 1 else -1
        mapped.append(charset[idx] if 0 <= idx < len(charset) else char)
    chunks = ["".join(mapped[i:i + 3]) for i in range(0, len(mapped), 3)]
    return " ".join(chunks)


def process_blacklisted_message(text: str) -> tuple[str, bool]:
    """Censor every blacklisted word found in *text*.

    Words are read from ``state.blacklisted_words`` and matched literally and
    case-insensitively. When matches overlap, the earliest one wins and, for
    equal start positions, the longest one.

    Returns:
        A ``(rendered, censored)`` pair. ``rendered`` is always HTML-escaped,
        including the censored replacements; ``censored`` is ``True`` only if
        at least one blacklisted word was replaced.
    """
    if not text:
        return html.escape(text or ""), False

    # Empty / whitespace-only entries would match at every position (or eat
    # the spaces of the message) and flag every message, so they are ignored.
    words = [w for w in (state.blacklisted_words or ()) if w and w.strip()]

    matches: list[tuple[int, int, str]] = []
    for word in words:
        for m in re.finditer(re.escape(word), text, re.IGNORECASE):
            matches.append((m.start(), m.end(), m.group()))
    if not matches:
        return html.escape(text), False

    # Earliest start first; among equal starts, the longest match first.
    matches.sort(key=lambda span: (span[0], -(span[1] - span[0])))

    # Greedily keep matches that do not overlap an already accepted one.
    accepted: list[tuple[int, int, str]] = []
    last_end = 0
    for start, end, matched in matches:
        if start >= last_end:
            accepted.append((start, end, matched))
            last_end = end

    parts: list[str] = []
    pos = 0
    for start, end, matched in accepted:
        parts.append(html.escape(text[pos:start]))
        censored = " ".join(censor_word(tok) for tok in matched.split())
        # censor_word() passes non-letters through, so the replacement needs
        # the same escaping as the surrounding text. Escaping happens after
        # chunking so that entities are never split by the inserted spaces.
        parts.append(html.escape(censored))
        pos = end
    parts.append(html.escape(text[pos:]))
    return "".join(parts), True


def contains_link(text: str) -> bool:
    """Return ``True`` if any pattern in ``LINK_PATTERNS`` matches *text*.

    Empty or ``None`` input is never considered a link.
    """
    if not text:
        return False
    return any(pattern.search(text) for pattern in LINK_PATTERNS)