Files
Telegram_ShBot/hashing.py
2026-05-13 23:38:18 +02:00

71 lines
2.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Media-hash helpers.
Strategy
--------
We never download files just to hash them. Instead we use Telegram's
file_unique_id as a stable, server-side content identifier — two files
that are byte-for-byte identical always share the same file_unique_id,
regardless of who uploaded them or when.
The cache (backup_ids.json) stores:
{ "<file_unique_id>": "<sha256_hex>" }
"""
import hashlib
import logging
from aiogram import Bot
import bot_state as state
from persistence import save_backup_ids
logger = logging.getLogger(__name__)
def _hash(file_unique_id: str) -> str:
return hashlib.sha256(file_unique_id.encode()).hexdigest()
def register_file(file_unique_id: str) -> bool:
    """Record *file_unique_id* in the in-memory cache and persist the cache.

    Returns:
        ``False`` when the identifier is already a key of
        ``state.backup_hashes`` (nothing is stored or saved); ``True`` once
        a new entry has been added and ``save_backup_ids()`` has been called.
    """
    cache = state.backup_hashes
    is_new = file_unique_id not in cache
    if is_new:
        # The stored value is the hash of the identifier itself, as produced
        # by _hash().
        cache[file_unique_id] = _hash(file_unique_id)
        save_backup_ids()
    return is_new
def is_duplicate(file_unique_id: str) -> bool:
    """Report whether *file_unique_id* is already a key of ``state.backup_hashes``."""
    known_ids = state.backup_hashes
    return file_unique_id in known_ids
def check_media_list(media: list[dict]) -> tuple[list[dict], list[dict]]:
    """Partition *media* into items not yet cached and items already cached.

    Each item is looked up by its ``"file_unique_id"`` key; an item without
    that key is checked under the empty string. Input order is preserved
    within each returned list.

    Returns:
        A ``(unique, duplicates)`` tuple of lists.
    """
    fresh: list[dict] = []
    seen: list[dict] = []
    for entry in media:
        uid = entry.get("file_unique_id", "")
        # Pick the destination list first, then append once.
        bucket = seen if is_duplicate(uid) else fresh
        bucket.append(entry)
    return fresh, seen
async def preload_backup_hashes(bot: Bot, chat_id: int) -> None:
    """Log how many files the backup hash cache currently holds.

    Nothing is fetched here: *bot* and *chat_id* are accepted but not used
    by this function. Per the design notes for this module, the standard
    Bot API does not expose a bulk message-history endpoint for groups, so
    the cache is populated in two other ways:

    1. Whatever is already in backup_ids.json is loaded beforehand
       (done by load_backup_ids).
    2. Each new message in the backup group is registered via
       register_file.

    For a full historical reindex, admins forward all older media back into
    the backup group so that each file gets registered.
    """
    known_count = len(state.backup_hashes)
    message = (
        "Backup hash cache loaded from disk: %d known files. "
        "New files arriving in the backup group will be indexed automatically."
    )
    # Pass the count as a lazy %-argument rather than pre-formatting it.
    logger.info(message, known_count)