""" Media-hash helpers. Strategy -------- We never download files just to hash them. Instead we use Telegram's file_unique_id as a stable, server-side content identifier – two files that are byte-for-byte identical always share the same file_unique_id, regardless of who uploaded them or when. The cache (backup_ids.json) stores: { "": "" } """ import hashlib import logging from aiogram import Bot import bot_state as state from persistence import save_backup_ids logger = logging.getLogger(__name__) def _hash(file_unique_id: str) -> str: return hashlib.sha256(file_unique_id.encode()).hexdigest() def register_file(file_unique_id: str) -> bool: """Add a file to the in-memory cache and persist it. Returns True if the file was new, False if it was already known. """ if file_unique_id in state.backup_hashes: return False state.backup_hashes[file_unique_id] = _hash(file_unique_id) save_backup_ids() return True def is_duplicate(file_unique_id: str) -> bool: return file_unique_id in state.backup_hashes def check_media_list(media: list[dict]) -> tuple[list[dict], list[dict]]: """Split a media list into (unique, duplicates).""" unique, dupes = [], [] for item in media: if is_duplicate(item.get("file_unique_id", "")): dupes.append(item) else: unique.append(item) return unique, dupes async def preload_backup_hashes(bot: Bot, chat_id: int) -> None: """ Since the standard Bot API does not expose a bulk message history endpoint for groups, we: 1. Load whatever is already in backup_ids.json (done by load_backup_ids). 2. On each new message in the backup group, register via register_file. To do a full historical reindex, admins forward all older media back into the backup group — the bot registers each file automatically. """ logger.info( "Backup hash cache loaded from disk: %d known files. " "New files arriving in the backup group will be indexed automatically.", len(state.backup_hashes), )