Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py
- Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py
- РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py
- Отчёты: create_orel_horizontal_report.py
- Обработка: process_all_hotels_embeddings.py
- Документация: README.md, DB_SCHEMA_REFERENCE.md
This commit is contained in:
117
check_progress.py
Normal file
117
check_progress.py
Normal file
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Скрипт для проверки прогресса обработки эмбеддингов
|
||||
"""
|
||||
|
||||
import os
from urllib.parse import unquote

import psycopg2
from psycopg2.extras import RealDictCursor
|
||||
|
||||
# Connection settings for the PostgreSQL database.
# Every value can be overridden through a HOTEL_DB_* environment variable so
# that credentials do not have to live in the source tree; the literals below
# are kept only as backward-compatible defaults.
# NOTE(review): a real password is committed here - rotate it and drop the
# default once every deployment sets HOTEL_DB_PASSWORD.
DB_CONFIG = {
    'host': os.environ.get("HOTEL_DB_HOST", "147.45.189.234"),
    'port': int(os.environ.get("HOTEL_DB_PORT", 5432)),
    'database': os.environ.get("HOTEL_DB_NAME", "default_db"),
    'user': os.environ.get("HOTEL_DB_USER", "gen_user"),
    # The default is stored percent-encoded and decoded here; a value taken
    # from the environment is used verbatim (an empty value counts as unset).
    'password': (
        os.environ.get("HOTEL_DB_PASSWORD")
        or unquote("2~~9_%5EkVsU%3F2%5CS")
    ),
}
|
||||
|
||||
def get_db_connection():
    """Open and return a new psycopg2 connection built from ``DB_CONFIG``.

    The connection is created with ``RealDictCursor`` as its cursor factory.
    The caller is responsible for closing it.
    """
    connection = psycopg2.connect(cursor_factory=RealDictCursor, **DB_CONFIG)
    return connection
|
||||
|
||||
def check_progress():
    """Print a progress report for the hotel embedding processing.

    Queries ``hotel_website_chunks`` for overall and per-region hotel/chunk
    counts and ``hotel_main`` for the total number of hotels, prints the
    summary, then lists per region the hotels that have usable page text in
    ``hotel_website_processed`` (``cleaned_text`` longer than 50 characters)
    but no chunks yet.

    A single connection is used for the whole report and is always closed,
    including when a query fails. Any exception is caught and reported on
    stdout, so the function does not raise to its caller.

    Returns:
        None. All output goes to stdout.
    """
    conn = None
    try:
        conn = get_db_connection()
        # psycopg2 cursors are context managers: the cursor is closed on exit
        # even if a query or a print raises.
        with conn.cursor() as cur:
            # Overall statistics
            cur.execute("""
                SELECT
                    COUNT(*) as total_chunks,
                    COUNT(DISTINCT metadata->>'hotel_id') as processed_hotels,
                    COUNT(DISTINCT metadata->>'region_name') as processed_regions
                FROM hotel_website_chunks;
            """)
            stats = cur.fetchone()

            # Per-region statistics
            cur.execute("""
                SELECT
                    metadata->>'region_name' as region_name,
                    COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
                    COUNT(*) as chunks_count
                FROM hotel_website_chunks
                WHERE metadata->>'region_name' IS NOT NULL
                GROUP BY metadata->>'region_name'
                ORDER BY chunks_count DESC;
            """)
            regions_stats = cur.fetchall()

            # Total number of hotels in the system
            cur.execute("SELECT COUNT(*) as total_hotels FROM hotel_main;")
            total_hotels = cur.fetchone()['total_hotels']

            # An empty hotel_main table must not abort the report with a
            # ZeroDivisionError; report 0.0% instead.
            if total_hotels:
                processed_pct = stats['processed_hotels'] / total_hotels * 100
            else:
                processed_pct = 0.0

            print("📊 ПРОГРЕСС ОБРАБОТКИ ЭМБЕДДИНГОВ")
            print("=" * 50)
            print(f"🏨 Обработано отелей: {stats['processed_hotels']}/{total_hotels} ({processed_pct:.1f}%)")
            print(f"📄 Всего chunks: {stats['total_chunks']}")
            print(f"🌍 Регионов: {stats['processed_regions']}")
            print()
            print("📈 ДЕТАЛЬНАЯ СТАТИСТИКА ПО РЕГИОНАМ:")
            print("-" * 50)

            for region in regions_stats:
                print(f"🏢 {region['region_name']}:")
                print(f" Отелей: {region['hotels_count']}")
                print(f" Chunks: {region['chunks_count']}")
                print()

            # Find hotels that have processed page text but no chunks yet.
            # Run after the summary is printed so that a failure here still
            # leaves the summary on screen.
            cur.execute("""
                SELECT
                    h.region_name,
                    COUNT(*) as unprocessed_hotels
                FROM hotel_main h
                LEFT JOIN hotel_website_chunks c ON h.id::text = c.metadata->>'hotel_id'
                WHERE c.id IS NULL
                AND EXISTS (
                    SELECT 1 FROM hotel_website_processed p
                    WHERE p.hotel_id = h.id
                    AND p.cleaned_text IS NOT NULL
                    AND LENGTH(p.cleaned_text) > 50
                )
                GROUP BY h.region_name
                ORDER BY unprocessed_hotels DESC;
            """)
            unprocessed = cur.fetchall()

        if unprocessed:
            print("⏳ ОСТАЛОСЬ ОБРАБОТАТЬ:")
            print("-" * 50)
            for region in unprocessed:
                print(f"🏢 {region['region_name']}: {region['unprocessed_hotels']} отелей")
        else:
            print("✅ ВСЕ ОТЕЛИ ОБРАБОТАНЫ!")

    except Exception as e:
        # Top-level boundary of a CLI script: report the failure, do not raise.
        print(f"❌ Ошибка: {e}")
    finally:
        # Release the connection on every path, including query failures.
        if conn is not None:
            conn.close()
|
||||
|
||||
# Run the report only when the file is executed as a script, not on import.
if __name__ == "__main__":
    check_progress()
|
||||
Reference in New Issue
Block a user