Files
hotels/check_progress.py

118 lines
4.1 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""
Скрипт для проверки прогресса обработки эмбеддингов
"""
import os
from urllib.parse import unquote

import psycopg2
from psycopg2.extras import RealDictCursor
# Configuration
# Keyword arguments passed to psycopg2.connect(). Each value can be overridden
# through an optional HOTELS_DB_* environment variable; when the variable is
# unset or empty, the original built-in value is used, so existing deployments
# behave exactly as before.
# NOTE(review): the fallback credentials below are committed in source —
# consider rotating them and supplying the real ones via the environment only.
DB_CONFIG = {
    'host': os.environ.get("HOTELS_DB_HOST") or "147.45.189.234",
    'port': int(os.environ.get("HOTELS_DB_PORT") or 5432),
    'database': os.environ.get("HOTELS_DB_NAME") or "default_db",
    'user': os.environ.get("HOTELS_DB_USER") or "gen_user",
    # The built-in password is stored percent-encoded and decoded here; a
    # password supplied through the environment is used verbatim.
    'password': (
        os.environ.get("HOTELS_DB_PASSWORD")
        or unquote("2~~9_%5EkVsU%3F2%5CS")
    ),
}
def get_db_connection():
    """Open and return a new database connection.

    The connection parameters come from the module-level ``DB_CONFIG``
    mapping. ``RealDictCursor`` is installed as the cursor factory, so
    cursors created from the returned connection yield rows that are
    addressed by column name.

    Returns:
        A new connection object produced by ``psycopg2.connect``.
    """
    connect_kwargs = dict(DB_CONFIG, cursor_factory=RealDictCursor)
    return psycopg2.connect(**connect_kwargs)
def _fetch_processed_stats(cur):
    """Run the summary queries and return ``(stats, regions_stats, total_hotels)``.

    ``stats`` is the single overall-totals row, ``regions_stats`` is the list
    of per-region rows ordered by chunk count (descending), and
    ``total_hotels`` is the row count of ``hotel_main``.
    """
    # Overall totals across all stored chunks.
    cur.execute("""
        SELECT
            COUNT(*) as total_chunks,
            COUNT(DISTINCT metadata->>'hotel_id') as processed_hotels,
            COUNT(DISTINCT metadata->>'region_name') as processed_regions
        FROM hotel_website_chunks;
    """)
    stats = cur.fetchone()

    # Per-region breakdown; chunks without a region name are excluded.
    cur.execute("""
        SELECT
            metadata->>'region_name' as region_name,
            COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
            COUNT(*) as chunks_count
        FROM hotel_website_chunks
        WHERE metadata->>'region_name' IS NOT NULL
        GROUP BY metadata->>'region_name'
        ORDER BY chunks_count DESC;
    """)
    regions_stats = cur.fetchall()

    # Total number of hotels known to the system.
    cur.execute("SELECT COUNT(*) as total_hotels FROM hotel_main;")
    total_hotels = cur.fetchone()['total_hotels']

    return stats, regions_stats, total_hotels


def _print_processed_stats(stats, regions_stats, total_hotels):
    """Print the overall totals followed by the per-region breakdown."""
    processed_hotels = stats['processed_hotels']
    # An empty hotel_main table would otherwise raise ZeroDivisionError and
    # abort the whole report; show 0.0% instead.
    percent = processed_hotels / total_hotels * 100 if total_hotels else 0.0

    print("📊 ПРОГРЕСС ОБРАБОТКИ ЭМБЕДДИНГОВ")
    print("=" * 50)
    print(f"🏨 Обработано отелей: {processed_hotels}/{total_hotels} ({percent:.1f}%)")
    print(f"📄 Всего chunks: {stats['total_chunks']}")
    print(f"🌍 Регионов: {stats['processed_regions']}")
    print()
    print("📈 ДЕТАЛЬНАЯ СТАТИСТИКА ПО РЕГИОНАМ:")
    print("-" * 50)
    for region in regions_stats:
        print(f"🏢 {region['region_name']}:")
        print(f" Отелей: {region['hotels_count']}")
        print(f" Chunks: {region['chunks_count']}")
        print()


def _fetch_unprocessed_by_region(cur):
    """Return per-region counts of hotels that still have no chunks.

    A hotel is counted when it has no row in ``hotel_website_chunks`` but has
    at least one ``hotel_website_processed`` row whose ``cleaned_text`` is
    non-NULL and longer than 50 characters.
    """
    cur.execute("""
        SELECT
            h.region_name,
            COUNT(*) as unprocessed_hotels
        FROM hotel_main h
        LEFT JOIN hotel_website_chunks c ON h.id::text = c.metadata->>'hotel_id'
        WHERE c.id IS NULL
        AND EXISTS (
            SELECT 1 FROM hotel_website_processed p
            WHERE p.hotel_id = h.id
            AND p.cleaned_text IS NOT NULL
            AND LENGTH(p.cleaned_text) > 50
        )
        GROUP BY h.region_name
        ORDER BY unprocessed_hotels DESC;
    """)
    return cur.fetchall()


def _print_unprocessed(unprocessed):
    """Print the remaining-work section, or a completion message if empty."""
    if unprocessed:
        print("⏳ ОСТАЛОСЬ ОБРАБОТАТЬ:")
        print("-" * 50)
        for region in unprocessed:
            print(f"🏢 {region['region_name']}: {region['unprocessed_hotels']} отелей")
    else:
        print("ВСЕ ОТЕЛИ ОБРАБОТАНЫ!")


def check_progress():
    """Print a report on embedding-processing progress to stdout.

    The report has three parts: overall totals from ``hotel_website_chunks``
    compared against the number of hotels in ``hotel_main``, a per-region
    breakdown, and the number of hotels per region that have usable page
    text but no chunks yet.

    Returns:
        None. Any exception raised while connecting or querying is caught
        and reported as a single error line rather than propagated.
    """
    try:
        conn = get_db_connection()
        try:
            # One connection serves every query. The ``with`` block closes
            # the cursor and the ``finally`` closes the connection, so
            # neither leaks when a query fails part-way through.
            with conn.cursor() as cur:
                stats, regions_stats, total_hotels = _fetch_processed_stats(cur)
                # Print the summary before the last query so it is still
                # shown if that query fails.
                _print_processed_stats(stats, regions_stats, total_hotels)

                # Check which hotels have not been processed yet.
                unprocessed = _fetch_unprocessed_by_region(cur)
                _print_unprocessed(unprocessed)
        finally:
            conn.close()
    except Exception as e:
        # Top-level boundary of the script: report the failure, do not crash.
        print(f"❌ Ошибка: {e}")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    check_progress()