Initial commit

This commit is contained in:
2026-05-13 23:38:18 +02:00
commit 8b053a7adb
21 changed files with 6642 additions and 0 deletions

71
hashing.py Normal file
View File

@@ -0,0 +1,71 @@
"""
Media-hash helpers.
Strategy
--------
We never download files just to hash them. Instead we use Telegram's
file_unique_id as a stable, server-side content identifier — two files
that are byte-for-byte identical always share the same file_unique_id,
regardless of who uploaded them or when.
The cache (backup_ids.json) stores:
{ "<file_unique_id>": "<sha256_hex>" }
"""
import hashlib
import logging
from aiogram import Bot
import bot_state as state
from persistence import save_backup_ids
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def _hash(file_unique_id: str) -> str:
return hashlib.sha256(file_unique_id.encode()).hexdigest()
def register_file(file_unique_id: str) -> bool:
    """Add a file to the in-memory cache and persist the cache.

    Args:
        file_unique_id: Identifier of the file to record.

    Returns:
        True if the identifier was new and has been stored; False if it
        was already known or is empty.
    """
    if not file_unique_id:
        # An empty ID identifies no file, so it is never cached.
        logger.warning("register_file called with an empty file_unique_id; ignored.")
        return False
    if file_unique_id in state.backup_hashes:
        return False
    state.backup_hashes[file_unique_id] = _hash(file_unique_id)
    # Only reached when an entry was actually added, so known IDs cause no save.
    save_backup_ids()
    return True
def is_duplicate(file_unique_id: str) -> bool:
    """Report whether ``file_unique_id`` is already in the backup cache.

    Args:
        file_unique_id: Identifier to look up.

    Returns:
        True if the identifier is a key of ``state.backup_hashes``.
        An empty identifier refers to no file and is never a duplicate.
    """
    if not file_unique_id:
        return False
    return file_unique_id in state.backup_hashes
def check_media_list(media: list[dict]) -> tuple[list[dict], list[dict]]:
    """Split a media list into ``(unique, duplicates)``.

    Items are checked against the cache only, not against each other.

    Args:
        media: Media items as dicts; each item's ``"file_unique_id"`` entry
            (when present) is looked up with ``is_duplicate``.

    Returns:
        Two lists, each preserving input order: items not found in the
        cache, then items that were found. Items whose
        ``"file_unique_id"`` is missing, empty or ``None`` always go in
        the first list.
    """
    unique: list[dict] = []
    dupes: list[dict] = []
    for item in media:
        file_unique_id = item.get("file_unique_id")
        # Without an ID there is nothing to look up, so such an item is
        # never classed as a duplicate.
        if file_unique_id and is_duplicate(file_unique_id):
            dupes.append(item)
        else:
            unique.append(item)
    return unique, dupes
async def preload_backup_hashes(bot: Bot, chat_id: int) -> None:
    """Log how many files the backup hash cache currently holds.

    The standard Bot API does not expose a bulk message history endpoint
    for groups, so nothing is fetched here. Instead:

    1. Whatever is already in backup_ids.json is loaded beforehand
       (done by load_backup_ids).
    2. Each new message in the backup group is registered via
       register_file.

    To do a full historical reindex, admins forward all older media back
    into the backup group — the bot registers each file automatically.

    Args:
        bot: Not used by the current implementation.
        chat_id: Not used by the current implementation.
    """
    logger.info(
        "Backup hash cache loaded from disk: %d known files. "
        "New files arriving in the backup group will be indexed automatically.",
        len(state.backup_hashes),
    )