# Telegram_ShBot/filters.py

"""Charset helpers, blacklist censoring, link detection."""
import html
import random
import re
import bot_state as state

# Decorative stand-ins for Latin letters, used by censor_word.
# censor_word indexes a charset with ord(letter) - ord("a"), so each string is
# expected to hold one glyph per letter in alphabetical order (a=0 ... z=25).
_CHARSET_1 = "🄰🄱🄲🄳🄴🄵🄶🄷🄸🄹🄺🄻🄼🄽🄾🄿🅀🅁🅂🅃🅄🅅🅆🅇🅈🅉"  # squared letters
_CHARSET_2 = "🅐🅑🅒🅓🅔🅕🅖🅗🅘🅙🅚🅛🅜🅝🅞🅟🅠🅡🅢🅣🅤🅥🅦🅧🅨🅩"  # filled circled letters
_CHARSET_3 = "ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ"  # outlined circled letters
_CHARSET_4 = "🇦🇧🇨🇩🇪🇫🇬🇭🇮🇯🇰🇱🇲🇳🇴🇵🇶🇷🇸🇹🇺🇻🇼🇽🇾🇿"  # regional-indicator letters
# Pool from which censor_word picks one charset at random per call.
_CHARSETS = [_CHARSET_1, _CHARSET_2, _CHARSET_3, _CHARSET_4]

_REPLACEMENT: dict[str, str] = {
    "a": "@", "e": "3",
    "i": "1", "o": "0",
}

# Heuristic patterns for spotting links, including common obfuscations.
# contains_link reports a link as soon as any one of these matches anywhere
# in the text. All patterns are case-insensitive.
LINK_PATTERNS: list[re.Pattern] = [
    # Explicit URL scheme: "http://" or "https://".
    re.compile(r"https?://", re.IGNORECASE),
    # "www" followed by a dot, optionally separated by whitespace ("www .").
    re.compile(r"www\s*\.", re.IGNORECASE),
    # "t" + one of "[", "(" or "." + "me" + "/", with optional whitespace
    # between the parts (e.g. "t.me/", "t . me /").
    re.compile(r"t\s*[\[\(\.]\s*me\s*/", re.IGNORECASE),
    # "t", at least one whitespace character, ".", then "me" (e.g. "t .me").
    re.compile(r"t\s+\.\s*me", re.IGNORECASE),
    # Bracketed dot between word characters (e.g. "example[.]com").
    re.compile(r"\w+\s*\[\s*\.\s*\]\s*\w+", re.IGNORECASE),
    # Parenthesised dot between word characters (e.g. "example(.)com").
    re.compile(r"\w+\s*\(\s*\.\s*\)\s*\w+", re.IGNORECASE),
    # Spelled-out "(dot)" between word characters (e.g. "example(dot)com").
    re.compile(r"\w+\s*\(\s*dot\s*\)\s*\w+", re.IGNORECASE),
    # A literal dot immediately followed by one of the listed domain suffixes,
    # ending at a word boundary. Some suffixes appear more than once in the
    # alternation; the duplicates do not affect what is matched.
    re.compile(
        r"\.(?:com|net|org|info|biz|name|pro|xyz|online|site|website|space|store|shop|blog|tech|dev|app|cloud|"
        r"digital|solutions|systems|services|agency|group|company|center|world|global|today|live|life|news|media|"
        r"network|social|community|zone|one|link|io|ai|co|ly|me|gg|tv|to|sh|fm|ws|cc|so|vc|it|page|software|tools|"
        r"design|studio|lab|labs|build|engineering|data|systems|academy|care|finance|capital|fund|money|loan|loans|"
        r"credit|insurance|investments|tax|accountants|law|legal|attorney|consulting|partners|ventures|holdings|"
        r"management|marketing|media|press|events|productions|photos|photography|pictures|video|film|music|audio|"
        r"games|game|play|fun|chat|dating|love|fans|family|kids|school|education|college|university|training|"
        r"courses|institute|health|clinic|hospital|doctor|dentist|fitness|gym|yoga|diet|food|restaurant|cafe|"
        r"coffee|bar|beer|wine|recipes|kitchen|cooking|fashion|style|clothing|shoes|jewelry|beauty|hair|makeup|"
        r"salon|travel|trips|tours|vacations|holiday|flights|tickets|hotel|hostel|rentals|cars|car|auto|"
        r"motorcycles|bike|bikes|taxi|delivery|express|logistics|shipping|realty|realestate|homes|house|rent|"
        r"apartments|property|construction|builders|contractors|repair|cleaning|security|energy|solar|green|eco|"
        r"farm|garden|flowers|pets|pet|dog|cat|animals|science|research|space|earth|energy|finance|bank|exchange|"
        r"trade|trading|market|markets|crypto|bitcoin|eth|nft|art|gallery|design|graphics|print|books|library|"
        r"wiki|guide|help|support|tools|download|software|app|cloud|host|hosting|server|email|mail|tech|network|"
        r"systems|solutions|world|global|international|express|plus|pro|max|now|top|best|cool|fun|zone|land|city|"
        r"place|town|country|uk|us|ca|au|de|fr|ru|cn|jp|kr|in|br|za|es|it|nl|se|no|fi|dk|pl|ch|be|at|ie|nz|mx|"
        r"ar|cl|co|pe|pt|gr|tr|ae|sa|il|sg|hk|id|my|th|vn|ph|pk|bd|ng|ke|gh)\b",
        re.IGNORECASE,
    ),
]


def censor_word(word: str) -> str:
    """Obfuscate *word* and return it as space-separated groups of three symbols.

    One charset is drawn at random per call. Each character is lowercased and
    then handled as follows:

    * listed in ``_REPLACEMENT``: the mapped substitute is used;
    * otherwise alphabetic: the charset glyph at its ``a``-based offset is
      used, or the original character when the offset is outside the charset;
    * anything else is kept unchanged.
    """
    glyphs = random.choice(_CHARSETS)
    base = ord("a")

    def _disguise(symbol: str) -> str:
        """Return the obfuscated form of a single character."""
        folded = symbol.lower()
        if folded in _REPLACEMENT:
            return _REPLACEMENT[folded]
        if not folded.isalpha():
            return symbol
        offset = ord(folded) - base
        if offset < 0 or offset >= len(glyphs):
            # Alphabetic but not a-z: no glyph available, keep as written.
            return symbol
        return glyphs[offset]

    disguised = [_disguise(symbol) for symbol in word]
    # Break the result into runs of three symbols separated by spaces.
    return " ".join(
        "".join(disguised[begin:begin + 3])
        for begin in range(0, len(disguised), 3)
    )


def process_blacklisted_message(text: str) -> tuple[str, bool]:
    """Censor blacklisted words in *text* and return HTML-safe markup.

    Every case-insensitive occurrence of an entry in
    ``state.blacklisted_words`` is replaced by a ``<tg-spoiler>`` element
    holding the obfuscated form produced by ``censor_word``. All other text is
    HTML-escaped. When matches overlap, the earliest one wins, and among
    matches starting at the same position the longest wins.

    Args:
        text: The raw message text. A falsy value is treated as empty.

    Returns:
        A ``(markup, censored)`` tuple; ``censored`` is True when at least one
        occurrence was replaced.
    """
    if not state.blacklisted_words or not text:
        return html.escape(text or ""), False

    matches: list[tuple[int, int, str]] = []
    for word in state.blacklisted_words:
        # A blank entry would match the empty string at every position (or
        # every whitespace run) and inject empty spoilers throughout the text.
        if not word or not word.strip():
            continue
        for m in re.finditer(re.escape(word), text, re.IGNORECASE):
            matches.append((m.start(), m.end(), m.group()))
    if not matches:
        return html.escape(text), False

    # Earliest start first; for equal starts, the longest match first.
    matches.sort(key=lambda x: (x[0], -(x[1] - x[0])))

    # Greedily keep matches that do not overlap one already kept.
    filtered, last_end = [], 0
    for start, end, w in matches:
        if start >= last_end:
            filtered.append((start, end, w))
            last_end = end

    parts, pos = [], 0
    for start, end, w in filtered:
        parts.append(html.escape(text[pos:start]))
        censored = " ".join(censor_word(tok) for tok in w.split())
        # censor_word passes non-letter characters through unchanged, so the
        # result can still contain "<", ">" or "&" and must be escaped like
        # the rest of the message before being embedded in markup.
        parts.append(f"<tg-spoiler>{html.escape(censored)}</tg-spoiler>")
        pos = end
    parts.append(html.escape(text[pos:]))
    return "".join(parts), True


def contains_link(text: str) -> bool:
    """Return True if *text* matches any pattern in ``LINK_PATTERNS``.

    Empty or otherwise falsy input is reported as containing no link.
    """
    if text:
        for pattern in LINK_PATTERNS:
            if pattern.search(text):
                return True
    return False