71 lines
2.1 KiB
Python
71 lines
2.1 KiB
Python
"""
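Media-hash helpers.

Strategy
--------
We never download files just to hash them. Instead we use Telegram's
file_unique_id as a stable, server-side content identifier – two files
that are byte-for-byte identical always share the same file_unique_id,
regardless of who uploaded them or when.

NOTE(review): the Bot API reference only describes file_unique_id as being
the same over time and for different bots; it does not describe it as a
content hash. Confirm that independent re-uploads of identical bytes
really share an id before relying on the claim above.

The cache (backup_ids.json) stores:
    { "<file_unique_id>": "<sha256_hex>" }

where <sha256_hex> is the SHA-256 of the file_unique_id string itself
(see _hash), not of the file's bytes.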
"""
|
||
|
||
import hashlib
|
||
import logging
|
||
|
||
from aiogram import Bot
|
||
|
||
import bot_state as state
|
||
from persistence import save_backup_ids
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def _hash(file_unique_id: str) -> str:
    """Return the SHA-256 hex digest of a Telegram ``file_unique_id``.

    The digest is computed over the UTF-8 bytes of the identifier string
    itself, not over the contents of the file it refers to.
    """
    # Explicit encoding; identical to the ``str.encode()`` default.
    return hashlib.sha256(file_unique_id.encode("utf-8")).hexdigest()
|
||
|
||
|
||
def register_file(file_unique_id: str) -> bool:
    """Add a file to the in-memory cache and persist it.

    Args:
        file_unique_id: Telegram ``file_unique_id`` of the media to record.

    Returns:
        True if the file was new, False if it was already known.
    """
    if file_unique_id in state.backup_hashes:
        return False

    state.backup_hashes[file_unique_id] = _hash(file_unique_id)
    # Write-through: the cache is saved once for every newly registered id.
    save_backup_ids()
    return True
|
||
|
||
|
||
def is_duplicate(file_unique_id: str) -> bool:
    """Return True if ``file_unique_id`` is already in the backup cache."""
    return file_unique_id in state.backup_hashes
|
||
|
||
|
||
def check_media_list(media: list[dict]) -> tuple[list[dict], list[dict]]:
    """Split a media list into (unique, duplicates).

    Args:
        media: Media descriptors; each is looked up by its
            ``"file_unique_id"`` key.

    Returns:
        A ``(unique, duplicates)`` pair of lists, each preserving the input
        order. Items with a missing or empty ``"file_unique_id"`` are
        always placed in ``unique``.
    """
    unique: list[dict] = []
    dupes: list[dict] = []
    for item in media:
        file_unique_id = item.get("file_unique_id")
        # Without an id there is nothing to compare against. Skipping the
        # lookup keeps all id-less items from colliding on a shared "" key.
        if file_unique_id and is_duplicate(file_unique_id):
            dupes.append(item)
        else:
            unique.append(item)
    return unique, dupes
|
||
|
||
|
||
async def preload_backup_hashes(bot: Bot, chat_id: int) -> None:
    """Log how many backup files are already known from the on-disk cache.

    Since the standard Bot API does not expose a bulk message history
    endpoint for groups, we:
      1. Load whatever is already in backup_ids.json (done by load_backup_ids).
      2. On each new message in the backup group, register via register_file.

    To do a full historical reindex, admins forward all older media back
    into the backup group — the bot registers each file automatically.

    Args:
        bot: Not used by this function.
        chat_id: Not used by this function.
    """
    logger.info(
        "Backup hash cache loaded from disk: %d known files. "
        "New files arriving in the backup group will be indexed automatically.",
        len(state.backup_hashes),
    )