Initial commit
This commit is contained in:
94
filters.py
Normal file
94
filters.py
Normal file
@@ -0,0 +1,94 @@
|
||||
"""Charset helpers, blacklist censoring, link detection."""
|
||||
import html
|
||||
import random
|
||||
import re
|
||||
import bot_state as state
|
||||
|
||||
# Decorative stand-ins for the Latin alphabet.  Each string lists one glyph
# per letter in A-Z order, so censor_word() can index a string with
# ``ord(letter) - ord("a")``.

# Squared capital letters.
_CHARSET_1 = "🄰🄱🄲🄳🄴🄵🄶🄷🄸🄹🄺🄻🄼🄽🄾🄿🅀🅁🅂🅃🅄🅅🅆🅇🅈🅉"
# Negative (filled) circled capital letters.
_CHARSET_2 = "🅐🅑🅒🅓🅔🅕🅖🅗🅘🅙🅚🅛🅜🅝🅞🅟🅠🅡🅢🅣🅤🅥🅦🅧🅨🅩"
# Circled capital letters.
_CHARSET_3 = "ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ"
# Regional indicator symbols.
# NOTE(review): adjacent regional indicators may be rendered as a flag by
# some clients -- confirm this is acceptable for censored output.
_CHARSET_4 = "🇦🇧🇨🇩🇪🇫🇬🇭🇮🇯🇰🇱🇲🇳🇴🇵🇶🇷🇸🇹🇺🇻🇼🇽🇾🇿"

# Pool from which censor_word() picks one alphabet at random per call.
_CHARSETS = [_CHARSET_1, _CHARSET_2, _CHARSET_3, _CHARSET_4]
|
||||
|
||||
# Fixed look-alike substitutions keyed by lowercase letter.  censor_word()
# consults this table before falling back to the decorative charsets, so
# these four vowels are never rendered with a charset glyph.
_REPLACEMENT: dict[str, str] = {
    "a": "@", "e": "3",
    "i": "1", "o": "0",
}
|
||||
|
||||
# Compiled, case-insensitive patterns used by contains_link(); a text is
# treated as containing a link as soon as any one of them matches.
LINK_PATTERNS: list[re.Pattern] = [
    # Explicit "http://" or "https://" scheme.
    re.compile(r"https?://", re.IGNORECASE),
    # "www" followed by a dot, with optional whitespace in between.
    re.compile(r"www\s*\.", re.IGNORECASE),
    # "t", then one of "[", "(" or ".", then "me" and a slash, tolerating
    # whitespace between the pieces (e.g. "t.me/", "t . me /").
    re.compile(r"t\s*[\[\(\.]\s*me\s*/", re.IGNORECASE),
    # "t" separated from ".me" by at least one whitespace character; no
    # trailing slash is required here.
    re.compile(r"t\s+\.\s*me", re.IGNORECASE),
    # Obfuscated dot written as "[.]" between two word runs ("site[.]com").
    re.compile(r"\w+\s*\[\s*\.\s*\]\s*\w+", re.IGNORECASE),
    # Obfuscated dot written as "(.)" between two word runs.
    re.compile(r"\w+\s*\(\s*\.\s*\)\s*\w+", re.IGNORECASE),
    # Obfuscated dot spelled out as "(dot)" between two word runs.
    re.compile(r"\w+\s*\(\s*dot\s*\)\s*\w+", re.IGNORECASE),
    # A literal dot immediately followed by one of the listed TLD-like
    # suffixes, ending at a word boundary.  Nothing is required before the
    # dot, so the match does not depend on a host name being present.
    # Several alternatives occur more than once in the list (for example
    # "systems" and "media"); the repeats are redundant but do not change
    # what the pattern matches.
    re.compile(
        r"\.(?:com|net|org|info|biz|name|pro|xyz|online|site|website|space|store|shop|blog|tech|dev|app|cloud|"
        r"digital|solutions|systems|services|agency|group|company|center|world|global|today|live|life|news|media|"
        r"network|social|community|zone|one|link|io|ai|co|ly|me|gg|tv|to|sh|fm|ws|cc|so|vc|it|page|software|tools|"
        r"design|studio|lab|labs|build|engineering|data|systems|academy|care|finance|capital|fund|money|loan|loans|"
        r"credit|insurance|investments|tax|accountants|law|legal|attorney|consulting|partners|ventures|holdings|"
        r"management|marketing|media|press|events|productions|photos|photography|pictures|video|film|music|audio|"
        r"games|game|play|fun|chat|dating|love|fans|family|kids|school|education|college|university|training|"
        r"courses|institute|health|clinic|hospital|doctor|dentist|fitness|gym|yoga|diet|food|restaurant|cafe|"
        r"coffee|bar|beer|wine|recipes|kitchen|cooking|fashion|style|clothing|shoes|jewelry|beauty|hair|makeup|"
        r"salon|travel|trips|tours|vacations|holiday|flights|tickets|hotel|hostel|rentals|cars|car|auto|"
        r"motorcycles|bike|bikes|taxi|delivery|express|logistics|shipping|realty|realestate|homes|house|rent|"
        r"apartments|property|construction|builders|contractors|repair|cleaning|security|energy|solar|green|eco|"
        r"farm|garden|flowers|pets|pet|dog|cat|animals|science|research|space|earth|energy|finance|bank|exchange|"
        r"trade|trading|market|markets|crypto|bitcoin|eth|nft|art|gallery|design|graphics|print|books|library|"
        r"wiki|guide|help|support|tools|download|software|app|cloud|host|hosting|server|email|mail|tech|network|"
        r"systems|solutions|world|global|international|express|plus|pro|max|now|top|best|cool|fun|zone|land|city|"
        r"place|town|country|uk|us|ca|au|de|fr|ru|cn|jp|kr|in|br|za|es|it|nl|se|no|fi|dk|pl|ch|be|at|ie|nz|mx|"
        r"ar|cl|co|pe|pt|gr|tr|ae|sa|il|sg|hk|id|my|th|vn|ph|pk|bd|ng|ke|gh)\b",
        re.IGNORECASE,
    ),
]
|
||||
|
||||
|
||||
def censor_word(word: str) -> str:
    """Return *word* disguised with look-alike symbols, spaced in groups of three.

    One alphabet is drawn at random from ``_CHARSETS`` per call.  Each
    character is then mapped as follows:

    * letters listed in ``_REPLACEMENT`` (matched case-insensitively) become
      their fixed substitute;
    * any other alphabetic character whose lowercase form falls in ``a``-``z``
      becomes the glyph at the same alphabet position in the drawn charset;
    * everything else (digits, punctuation, letters outside ``a``-``z``) is
      kept unchanged.

    The mapped symbols are finally grouped three at a time and the groups are
    joined with single spaces.
    """
    glyphs = random.choice(_CHARSETS)

    def disguise(symbol: str) -> str:
        folded = symbol.lower()
        substitute = _REPLACEMENT.get(folded)
        if substitute is not None:
            return substitute
        if not folded.isalpha():
            return symbol
        # Alphabetic characters outside a-z yield an out-of-range offset
        # and are passed through untouched.
        offset = ord(folded) - ord("a")
        if 0 <= offset < len(glyphs):
            return glyphs[offset]
        return symbol

    mapped = [disguise(symbol) for symbol in word]
    groups = []
    for begin in range(0, len(mapped), 3):
        groups.append("".join(mapped[begin:begin + 3]))
    return " ".join(groups)
|
||||
|
||||
|
||||
def process_blacklisted_message(text: str) -> tuple[str, bool]:
    """Censor blacklisted terms in *text* and return HTML-escaped output.

    Each entry of ``state.blacklisted_words`` is searched for as a literal,
    case-insensitive substring.  When matches overlap, the earliest one wins
    and, for matches starting at the same position, the longest one wins.
    Every surviving match is split on whitespace, each token is passed
    through ``censor_word`` and the result is wrapped in a ``<tg-spoiler>``
    tag (presumably for Telegram's HTML parse mode -- confirm with caller).
    All text, inside and outside the spoiler tags, is HTML-escaped.

    Args:
        text: Raw message text.  ``None`` or an empty string is accepted.

    Returns:
        A ``(html_text, censored)`` tuple.  ``censored`` is ``True`` only
        when at least one blacklisted term was found and replaced.
    """
    if not state.blacklisted_words or not text:
        return html.escape(text or ""), False

    matches: list[tuple[int, int, str]] = []
    for word in state.blacklisted_words:
        # An empty pattern matches at every position and a whitespace-only
        # one at every space; both would only inject empty spoiler tags and
        # wrongly report the message as censored, so such entries are
        # ignored.
        if not word or not word.strip():
            continue
        for m in re.finditer(re.escape(word), text, re.IGNORECASE):
            matches.append((m.start(), m.end(), m.group()))

    if not matches:
        return html.escape(text), False

    # Earliest start first; on equal starts, prefer the longest match.
    matches.sort(key=lambda hit: (hit[0], -(hit[1] - hit[0])))

    # Drop every match that overlaps one already kept.
    filtered: list[tuple[int, int, str]] = []
    last_end = 0
    for start, end, matched in matches:
        if start >= last_end:
            filtered.append((start, end, matched))
            last_end = end

    parts: list[str] = []
    pos = 0
    for start, end, matched in filtered:
        parts.append(html.escape(text[pos:start]))
        censored = " ".join(censor_word(tok) for tok in matched.split())
        # censor_word() passes non-letters through unchanged, so a term
        # containing "<", ">" or "&" must be escaped like the rest of the
        # message before it is embedded in markup.
        parts.append(f"<tg-spoiler>{html.escape(censored)}</tg-spoiler>")
        pos = end
    parts.append(html.escape(text[pos:]))
    return "".join(parts), True
|
||||
|
||||
|
||||
def contains_link(text: str) -> bool:
    """Report whether *text* matches any pattern in ``LINK_PATTERNS``.

    Empty or ``None`` input is never considered to contain a link.
    """
    if text:
        for pattern in LINK_PATTERNS:
            if pattern.search(text) is not None:
                return True
    return False
|
||||
Reference in New Issue
Block a user