118 lines
4.1 KiB
Python
118 lines
4.1 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Скрипт для проверки прогресса обработки эмбеддингов
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
from urllib.parse import unquote

import psycopg2
from psycopg2.extras import RealDictCursor
|
|||
|
|
|
|||
|
|
# Configuration.
#
# Each setting can be overridden with the conventional libpq environment
# variable (PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD); an unset or empty
# variable falls back to the value that was previously hard-coded here, so
# existing invocations keep working unchanged.
#
# NOTE(review): the fallbacks below — including the password — are committed
# in source. Prefer supplying them through the environment and rotate the
# password; the literals are kept only for backward compatibility.
DB_CONFIG = {
    'host': os.environ.get("PGHOST") or "147.45.189.234",
    # Environment values are strings, while the original config used an int.
    'port': int(os.environ.get("PGPORT") or 5432),
    'database': os.environ.get("PGDATABASE") or "default_db",
    'user': os.environ.get("PGUSER") or "gen_user",
    # The fallback literal is percent-encoded, hence unquote(); a password
    # taken from the environment is used verbatim.
    'password': os.environ.get("PGPASSWORD") or unquote("2~~9_%5EkVsU%3F2%5CS"),
}
|
|||
|
|
|
|||
|
|
def get_db_connection():
    """Open a new database connection using the settings in ``DB_CONFIG``.

    The connection is created with ``RealDictCursor`` as its cursor factory.
    The caller is responsible for closing the returned connection.
    """
    connect = psycopg2.connect
    return connect(cursor_factory=RealDictCursor, **DB_CONFIG)
|
|||
|
|
|
|||
|
|
def _format_percent(part, total):
    """Return ``part`` as a percentage of ``total``, or ``"n/a"`` if ``total`` is 0."""
    if not total:
        return "n/a"
    return f"{part / total * 100:.1f}%"


def check_progress():
    """Print a report on embedding-processing progress.

    Runs four read-only queries over a single connection and prints:

    * the total number of rows in ``hotel_website_chunks`` and the number of
      distinct ``hotel_id`` / ``region_name`` values found in their
      ``metadata``, compared with the row count of ``hotel_main``;
    * per-region hotel and chunk counts, largest chunk count first;
    * per-region counts of hotels that have no chunk yet although
      ``hotel_website_processed`` holds at least one page for them whose
      ``cleaned_text`` is longer than 50 characters.

    Any exception is reported on stdout instead of being raised, because this
    function is the script's top-level boundary. The connection is closed
    whether or not an error occurred.
    """
    conn = None
    try:
        conn = get_db_connection()
        # One connection and one cursor serve every query; the cursor is
        # closed by the ``with`` block, the connection by ``finally`` below.
        with conn.cursor() as cur:
            # Overall statistics.
            cur.execute("""
                SELECT
                    COUNT(*) as total_chunks,
                    COUNT(DISTINCT metadata->>'hotel_id') as processed_hotels,
                    COUNT(DISTINCT metadata->>'region_name') as processed_regions
                FROM hotel_website_chunks;
            """)
            stats = cur.fetchone()

            # Per-region statistics.
            cur.execute("""
                SELECT
                    metadata->>'region_name' as region_name,
                    COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
                    COUNT(*) as chunks_count
                FROM hotel_website_chunks
                WHERE metadata->>'region_name' IS NOT NULL
                GROUP BY metadata->>'region_name'
                ORDER BY chunks_count DESC;
            """)
            regions_stats = cur.fetchall()

            # Total number of hotels in the system.
            cur.execute("SELECT COUNT(*) as total_hotels FROM hotel_main;")
            total_hotels = cur.fetchone()['total_hotels']

            processed_hotels = stats['processed_hotels']
            # An empty hotel_main would otherwise raise ZeroDivisionError.
            percent = _format_percent(processed_hotels, total_hotels)

            print("📊 ПРОГРЕСС ОБРАБОТКИ ЭМБЕДДИНГОВ")
            print("=" * 50)
            print(f"🏨 Обработано отелей: {processed_hotels}/{total_hotels} ({percent})")
            print(f"📄 Всего chunks: {stats['total_chunks']}")
            print(f"🌍 Регионов: {stats['processed_regions']}")
            print()
            print("📈 ДЕТАЛЬНАЯ СТАТИСТИКА ПО РЕГИОНАМ:")
            print("-" * 50)

            for region in regions_stats:
                print(f"🏢 {region['region_name']}:")
                print(f" Отелей: {region['hotels_count']}")
                print(f" Chunks: {region['chunks_count']}")
                print()

            # Find hotels that still have to be processed: no chunk row
            # matches them, yet a usable processed page exists.
            cur.execute("""
                SELECT
                    h.region_name,
                    COUNT(*) as unprocessed_hotels
                FROM hotel_main h
                LEFT JOIN hotel_website_chunks c ON h.id::text = c.metadata->>'hotel_id'
                WHERE c.id IS NULL
                AND EXISTS (
                    SELECT 1 FROM hotel_website_processed p
                    WHERE p.hotel_id = h.id
                    AND p.cleaned_text IS NOT NULL
                    AND LENGTH(p.cleaned_text) > 50
                )
                GROUP BY h.region_name
                ORDER BY unprocessed_hotels DESC;
            """)
            unprocessed = cur.fetchall()

            if unprocessed:
                print("⏳ ОСТАЛОСЬ ОБРАБОТАТЬ:")
                print("-" * 50)
                for region in unprocessed:
                    print(f"🏢 {region['region_name']}: {region['unprocessed_hotels']} отелей")
            else:
                print("✅ ВСЕ ОТЕЛИ ОБРАБОТАНЫ!")

    except Exception as e:
        # Top-level boundary of the script: report the failure, do not raise.
        print(f"❌ Ошибка: {e}")
    finally:
        # Release the connection even when a query or a print failed.
        if conn is not None:
            conn.close()
|
|||
|
|
|
|||
|
|
# Entry point: produce the report only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    check_progress()
|