#!/usr/bin/env python3
"""Report the progress of embedding processing.

Connects to the PostgreSQL database described by ``DB_CONFIG`` and prints
how many hotels, chunks and regions are present in ``hotel_website_chunks``,
a per-region breakdown, and the number of hotels per region that have
usable cleaned text but no chunks yet.
"""

from contextlib import closing
from urllib.parse import unquote

import psycopg2
from psycopg2.extras import RealDictCursor

# Connection settings.
# NOTE(review): host and credentials are hard-coded in source control;
# consider loading them from the environment or a secrets store.
DB_CONFIG = {
    'host': "147.45.189.234",
    'port': 5432,
    'database': "default_db",
    'user': "gen_user",
    # The password is stored percent-encoded and decoded at import time.
    'password': unquote("2~~9_%5EkVsU%3F2%5CS"),
}


def get_db_connection():
    """Open a new database connection whose cursors return dict-like rows."""
    return psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)


def _print_summary(cur):
    """Query and print overall and per-region chunk statistics.

    Args:
        cur: An open cursor that yields rows addressable by column name.
    """
    # Overall statistics.
    cur.execute("""
        SELECT
            COUNT(*) as total_chunks,
            COUNT(DISTINCT metadata->>'hotel_id') as processed_hotels,
            COUNT(DISTINCT metadata->>'region_name') as processed_regions
        FROM hotel_website_chunks;
    """)
    stats = cur.fetchone()

    # Per-region statistics, largest regions (by chunk count) first.
    cur.execute("""
        SELECT
            metadata->>'region_name' as region_name,
            COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
            COUNT(*) as chunks_count
        FROM hotel_website_chunks
        WHERE metadata->>'region_name' IS NOT NULL
        GROUP BY metadata->>'region_name'
        ORDER BY chunks_count DESC;
    """)
    regions_stats = cur.fetchall()

    # Total number of hotels known to the system.
    cur.execute("SELECT COUNT(*) as total_hotels FROM hotel_main;")
    total_hotels = cur.fetchone()['total_hotels']

    processed_hotels = stats['processed_hotels']
    # An empty hotel_main table must not crash the report with a
    # ZeroDivisionError; show 0.0% instead.
    percent = processed_hotels / total_hotels * 100 if total_hotels else 0.0

    print("📊 ПРОГРЕСС ОБРАБОТКИ ЭМБЕДДИНГОВ")
    print("=" * 50)
    print(f"🏨 Обработано отелей: {processed_hotels}/{total_hotels} ({percent:.1f}%)")
    print(f"📄 Всего chunks: {stats['total_chunks']}")
    print(f"🌍 Регионов: {stats['processed_regions']}")
    print()

    print("📈 ДЕТАЛЬНАЯ СТАТИСТИКА ПО РЕГИОНАМ:")
    print("-" * 50)
    for region in regions_stats:
        print(f"🏢 {region['region_name']}:")
        print(f" Отелей: {region['hotels_count']}")
        print(f" Chunks: {region['chunks_count']}")
        print()


def _print_unprocessed(cur):
    """Query and print, per region, the hotels that still have no chunks.

    Only hotels with at least one ``hotel_website_processed`` row whose
    ``cleaned_text`` is longer than 50 characters are counted.

    Args:
        cur: An open cursor that yields rows addressable by column name.
    """
    cur.execute("""
        SELECT
            h.region_name,
            COUNT(*) as unprocessed_hotels
        FROM hotel_main h
        LEFT JOIN hotel_website_chunks c
            ON h.id::text = c.metadata->>'hotel_id'
        WHERE c.id IS NULL
          AND EXISTS (
              SELECT 1
              FROM hotel_website_processed p
              WHERE p.hotel_id = h.id
                AND p.cleaned_text IS NOT NULL
                AND LENGTH(p.cleaned_text) > 50
          )
        GROUP BY h.region_name
        ORDER BY unprocessed_hotels DESC;
    """)
    unprocessed = cur.fetchall()

    if not unprocessed:
        print("✅ ВСЕ ОТЕЛИ ОБРАБОТАНЫ!")
        return

    print("⏳ ОСТАЛОСЬ ОБРАБОТАТЬ:")
    print("-" * 50)
    for region in unprocessed:
        print(f"🏢 {region['region_name']}: {region['unprocessed_hotels']} отелей")


def check_progress():
    """Print the embedding-processing progress report.

    Any exception raised while connecting or querying is caught and
    reported on stdout; nothing is re-raised and nothing is returned.
    """
    try:
        # closing() guarantees the connection is closed even when a query
        # fails; the cursor context manager does the same for the cursor.
        with closing(get_db_connection()) as conn, conn.cursor() as cur:
            _print_summary(cur)
            # Then check which hotels have not been processed yet.
            _print_unprocessed(cur)
    except Exception as e:
        # Top-level boundary of the script: report the failure and exit
        # normally instead of showing a traceback.
        print(f"❌ Ошибка: {e}")


if __name__ == "__main__":
    check_progress()