Initial commit
This commit is contained in:
71
hashing.py
Normal file
71
hashing.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""
|
||||
Media-hash helpers.
|
||||
|
||||
Strategy
|
||||
--------
|
||||
We never download files just to hash them. Instead we use Telegram's
|
||||
file_unique_id as a stable, server-side content identifier – two files
|
||||
that are byte-for-byte identical always share the same file_unique_id,
|
||||
regardless of who uploaded them or when.
|
||||
|
||||
The cache (backup_ids.json) stores:
|
||||
{ "<file_unique_id>": "<sha256_hex>" }
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
|
||||
from aiogram import Bot
|
||||
|
||||
import bot_state as state
|
||||
from persistence import save_backup_ids
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _hash(file_unique_id: str) -> str:
    """Return the SHA-256 hex digest of a ``file_unique_id`` string.

    The identifier text itself is hashed (encoded as UTF-8); no file
    content is read or downloaded.
    """
    return hashlib.sha256(file_unique_id.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def register_file(file_unique_id: str) -> bool:
    """Add a file to the in-memory cache and persist it.

    The new cache entry maps ``file_unique_id`` to the digest produced by
    ``_hash``. ``save_backup_ids()`` is called only when an entry was
    actually added, so already-known files cause no write.

    Returns True if the file was new, False if it was already known.
    """
    known = state.backup_hashes
    if file_unique_id in known:
        return False
    known[file_unique_id] = _hash(file_unique_id)
    save_backup_ids()
    return True
|
||||
|
||||
|
||||
def is_duplicate(file_unique_id: str) -> bool:
    """Return True if ``file_unique_id`` is already a key in the backup cache."""
    return file_unique_id in state.backup_hashes
|
||||
|
||||
|
||||
def check_media_list(media: list[dict]) -> tuple[list[dict], list[dict]]:
    """Split a media list into (unique, duplicates).

    An item counts as a duplicate when its ``file_unique_id`` is already in
    the backup cache, or when an earlier item in the same list carried the
    same id. Input order is preserved within each returned list.

    Items without a ``file_unique_id`` are looked up in the cache under the
    empty string and are never matched against one another, because their
    identity is unknown.
    """
    unique: list[dict] = []
    dupes: list[dict] = []
    # Ids already accepted as unique during this call; the persisted cache
    # alone cannot catch the same file appearing twice in one batch.
    seen_in_batch: set[str] = set()
    for item in media:
        file_id = item.get("file_unique_id", "")
        if is_duplicate(file_id) or file_id in seen_in_batch:
            dupes.append(item)
            continue
        unique.append(item)
        if file_id:
            seen_in_batch.add(file_id)
    return unique, dupes
|
||||
|
||||
|
||||
async def preload_backup_hashes(bot: Bot, chat_id: int) -> None:
    """Log how many files the backup hash cache currently holds.

    Since the standard Bot API does not expose a bulk message history
    endpoint for groups, we:
      1. Load whatever is already in backup_ids.json (done by load_backup_ids).
      2. On each new message in the backup group, register via register_file.

    To do a full historical reindex, admins forward all older media back
    into the backup group — the bot registers each file automatically.

    ``bot`` and ``chat_id`` are not used by this function; it only reports
    the size of ``state.backup_hashes``.
    """
    logger.info(
        "Backup hash cache loaded from disk: %d known files. "
        "New files arriving in the backup group will be indexed automatically.",
        len(state.backup_hashes),
    )
|
||||
Reference in New Issue
Block a user