- Краулеры: smart_crawler.py, regional_crawler.py - Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py - РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py - Отчёты: create_orel_horizontal_report.py - Обработка: process_all_hotels_embeddings.py - Документация: README.md, DB_SCHEMA_REFERENCE.md
118 lines
4.1 KiB
Python
118 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
||
"""
Script for checking the progress of embedding processing.
"""
|
||
|
||
import os
from urllib.parse import unquote

import psycopg2
from psycopg2.extras import RealDictCursor
|
||
|
||
# Configuration
#
# Keyword arguments passed to psycopg2.connect().  Every value can be
# overridden with the standard libpq environment variable of the same
# meaning (PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD) so that
# credentials do not have to be edited in, or read from, the source file.
# The literals are kept only as backward-compatible fallbacks.
# NOTE(review): the fallback password is a secret committed to the
# repository - rotate it and drop the default once PGPASSWORD is deployed.
DB_CONFIG = {
    'host': os.environ.get("PGHOST") or "147.45.189.234",
    # `or` (rather than a .get() default) also covers an empty PGPORT value.
    'port': int(os.environ.get("PGPORT") or 5432),
    'database': os.environ.get("PGDATABASE") or "default_db",
    'user': os.environ.get("PGUSER") or "gen_user",
    # The fallback is stored percent-encoded, hence unquote(); a value taken
    # from the environment is used verbatim.
    'password': os.environ.get("PGPASSWORD") or unquote("2~~9_%5EkVsU%3F2%5CS"),
}
|
||
|
||
def get_db_connection():
    """Open and return a new psycopg2 connection built from DB_CONFIG.

    RealDictCursor is passed as the connection's ``cursor_factory``.
    """
    connect_kwargs = dict(DB_CONFIG)
    connect_kwargs['cursor_factory'] = RealDictCursor
    return psycopg2.connect(**connect_kwargs)
|
||
|
||
def check_progress():
    """Print a progress report for embedding processing.

    Queries the database obtained from ``get_db_connection()`` and prints:

    * the number of distinct hotel ids found in ``hotel_website_chunks``
      against the row count of ``hotel_main``, plus the total chunk count
      and the number of distinct regions;
    * per-region hotel and chunk counts, largest chunk count first;
    * per-region counts of hotels that have no chunks yet although
      ``hotel_website_processed`` holds cleaned text longer than 50
      characters for them.

    Rows are read by column name, so the connection must produce
    mapping-style rows.  Any exception is reported on stdout instead of
    being propagated, and the connection is closed in every case.
    """
    conn = None
    try:
        conn = get_db_connection()
        # One connection and one cursor serve all queries; the `with` block
        # closes the cursor even if a query raises.
        with conn.cursor() as cur:
            # Overall statistics
            cur.execute("""
                SELECT
                    COUNT(*) as total_chunks,
                    COUNT(DISTINCT metadata->>'hotel_id') as processed_hotels,
                    COUNT(DISTINCT metadata->>'region_name') as processed_regions
                FROM hotel_website_chunks;
            """)
            stats = cur.fetchone()

            # Statistics per region
            cur.execute("""
                SELECT
                    metadata->>'region_name' as region_name,
                    COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
                    COUNT(*) as chunks_count
                FROM hotel_website_chunks
                WHERE metadata->>'region_name' IS NOT NULL
                GROUP BY metadata->>'region_name'
                ORDER BY chunks_count DESC;
            """)
            regions_stats = cur.fetchall()

            # Total number of hotels in the system
            cur.execute("SELECT COUNT(*) as total_hotels FROM hotel_main;")
            total_hotels = cur.fetchone()['total_hotels']

            processed_hotels = stats['processed_hotels']
            # An empty hotel_main table would otherwise raise
            # ZeroDivisionError and abort the whole report.
            percent = processed_hotels / total_hotels * 100 if total_hotels else 0.0

            print("📊 ПРОГРЕСС ОБРАБОТКИ ЭМБЕДДИНГОВ")
            print("=" * 50)
            print(f"🏨 Обработано отелей: {processed_hotels}/{total_hotels} ({percent:.1f}%)")
            print(f"📄 Всего chunks: {stats['total_chunks']}")
            print(f"🌍 Регионов: {stats['processed_regions']}")
            print()
            print("📈 ДЕТАЛЬНАЯ СТАТИСТИКА ПО РЕГИОНАМ:")
            print("-" * 50)

            for region in regions_stats:
                print(f"🏢 {region['region_name']}:")
                print(f" Отелей: {region['hotels_count']}")
                print(f" Chunks: {region['chunks_count']}")
                print()

            # Find hotels that have usable cleaned text but no chunks yet.
            # The LEFT JOIN + "c.id IS NULL" keeps only hotels without any
            # matching chunk row.
            cur.execute("""
                SELECT
                    h.region_name,
                    COUNT(*) as unprocessed_hotels
                FROM hotel_main h
                LEFT JOIN hotel_website_chunks c ON h.id::text = c.metadata->>'hotel_id'
                WHERE c.id IS NULL
                AND EXISTS (
                    SELECT 1 FROM hotel_website_processed p
                    WHERE p.hotel_id = h.id
                    AND p.cleaned_text IS NOT NULL
                    AND LENGTH(p.cleaned_text) > 50
                )
                GROUP BY h.region_name
                ORDER BY unprocessed_hotels DESC;
            """)
            unprocessed = cur.fetchall()

            if unprocessed:
                print("⏳ ОСТАЛОСЬ ОБРАБОТАТЬ:")
                print("-" * 50)
                for region in unprocessed:
                    print(f"🏢 {region['region_name']}: {region['unprocessed_hotels']} отелей")
            else:
                print("✅ ВСЕ ОТЕЛИ ОБРАБОТАНЫ!")

    except Exception as e:
        # Top-level boundary of a CLI report: show the failure, do not crash.
        print(f"❌ Ошибка: {e}")
    finally:
        # Close the connection on both the success and the failure path.
        if conn is not None:
            conn.close()
|
||
|
||
# Produce the report only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    check_progress()
|