Files
hotels/check_progress.py
Фёдор 0cf3297290 Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py
- Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py
- РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py
- Отчёты: create_orel_horizontal_report.py
- Обработка: process_all_hotels_embeddings.py
- Документация: README.md, DB_SCHEMA_REFERENCE.md
2025-10-16 10:52:09 +03:00

118 lines
4.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Скрипт для проверки прогресса обработки эмбеддингов
"""
import os
from urllib.parse import unquote

import psycopg2
from psycopg2.extras import RealDictCursor
# Database connection settings.
# Every value can be overridden through an environment variable so that
# credentials do not have to be edited in (or committed with) this file.
# The literals are kept only as fallbacks so existing setups keep working.
# NOTE(review): a real password is still embedded below as a fallback;
# rotate it and drop the literal once HOTELS_DB_PASSWORD is provisioned.
DB_CONFIG = {
    'host': os.environ.get("HOTELS_DB_HOST", "147.45.189.234"),
    'port': int(os.environ.get("HOTELS_DB_PORT", "5432")),
    'database': os.environ.get("HOTELS_DB_NAME", "default_db"),
    'user': os.environ.get("HOTELS_DB_USER", "gen_user"),
    # The fallback is stored percent-encoded and decoded here; a value taken
    # from the environment is used verbatim (no decoding).
    'password': os.environ.get(
        "HOTELS_DB_PASSWORD", unquote("2~~9_%5EkVsU%3F2%5CS")
    ),
}
def get_db_connection():
    """Open and return a new database connection built from ``DB_CONFIG``.

    The connection is created with ``RealDictCursor`` as its cursor factory.
    """
    connect_kwargs = dict(DB_CONFIG, cursor_factory=RealDictCursor)
    return psycopg2.connect(**connect_kwargs)
def check_progress():
    """Print a progress report for embedding processing.

    Runs a handful of aggregate queries and prints, in order:

    * overall totals: chunks, distinct hotels and distinct regions found in
      ``hotel_website_chunks`` against the hotel count in ``hotel_main``;
    * a per-region breakdown of hotels and chunks;
    * per region, how many hotels have cleaned page text longer than 50
      characters in ``hotel_website_processed`` but no chunks yet.

    Returns nothing. Any exception is caught and reported on stdout, and the
    connection is closed whether or not the queries succeed.
    """
    conn = None
    try:
        # One connection serves every query; rows are read by column name.
        conn = get_db_connection()
        with conn.cursor() as cur:
            # Overall statistics.
            cur.execute("""
                SELECT
                    COUNT(*) as total_chunks,
                    COUNT(DISTINCT metadata->>'hotel_id') as processed_hotels,
                    COUNT(DISTINCT metadata->>'region_name') as processed_regions
                FROM hotel_website_chunks;
            """)
            stats = cur.fetchone()

            # Per-region statistics.
            cur.execute("""
                SELECT
                    metadata->>'region_name' as region_name,
                    COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
                    COUNT(*) as chunks_count
                FROM hotel_website_chunks
                WHERE metadata->>'region_name' IS NOT NULL
                GROUP BY metadata->>'region_name'
                ORDER BY chunks_count DESC;
            """)
            regions_stats = cur.fetchall()

            # Total number of hotels known to the system.
            cur.execute("SELECT COUNT(*) as total_hotels FROM hotel_main;")
            total_hotels = cur.fetchone()['total_hotels']

            processed_hotels = stats['processed_hotels']
            # An empty hotel_main must not abort the report with a
            # ZeroDivisionError; show 0.0% instead.
            if total_hotels:
                percent = processed_hotels / total_hotels * 100
            else:
                percent = 0.0

            print("📊 ПРОГРЕСС ОБРАБОТКИ ЭМБЕДДИНГОВ")
            print("=" * 50)
            print(f"🏨 Обработано отелей: {processed_hotels}/{total_hotels} ({percent:.1f}%)")
            print(f"📄 Всего chunks: {stats['total_chunks']}")
            print(f"🌍 Регионов: {stats['processed_regions']}")
            print()
            print("📈 ДЕТАЛЬНАЯ СТАТИСТИКА ПО РЕГИОНАМ:")
            print("-" * 50)
            for region in regions_stats:
                print(f"🏢 {region['region_name']}:")
                print(f" Отелей: {region['hotels_count']}")
                print(f" Chunks: {region['chunks_count']}")
                print()

            # Hotels that have usable cleaned text but no chunks yet,
            # grouped by region.
            cur.execute("""
                SELECT
                    h.region_name,
                    COUNT(*) as unprocessed_hotels
                FROM hotel_main h
                LEFT JOIN hotel_website_chunks c ON h.id::text = c.metadata->>'hotel_id'
                WHERE c.id IS NULL
                AND EXISTS (
                    SELECT 1 FROM hotel_website_processed p
                    WHERE p.hotel_id = h.id
                    AND p.cleaned_text IS NOT NULL
                    AND LENGTH(p.cleaned_text) > 50
                )
                GROUP BY h.region_name
                ORDER BY unprocessed_hotels DESC;
            """)
            unprocessed = cur.fetchall()

            if unprocessed:
                print("⏳ ОСТАЛОСЬ ОБРАБОТАТЬ:")
                print("-" * 50)
                for region in unprocessed:
                    print(f"🏢 {region['region_name']}: {region['unprocessed_hotels']} отелей")
            else:
                print("ВСЕ ОТЕЛИ ОБРАБОТАНЫ!")
    except Exception as e:
        # Top-level boundary of the script: report the failure, do not crash.
        print(f"❌ Ошибка: {e}")
    finally:
        # `with conn:` in psycopg2 only ends the transaction, so the
        # connection has to be closed explicitly.
        if conn is not None:
            conn.close()
# Script entry point: print the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    check_progress()