69 lines
2.5 KiB
Python
69 lines
2.5 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
import psycopg2
|
|||
|
|
import os
|
|||
|
|
|
|||
|
|
# Подключение к БД (используем тот же пароль что в mass_crawler.py)
|
|||
|
|
from urllib.parse import unquote
|
|||
|
|
def progress_percent(done, total):
    """Return ``done`` as a percentage of ``total``.

    Returns 0.0 when ``total`` is zero (for example an empty ``hotel_main``
    table) instead of raising ``ZeroDivisionError``.
    """
    if not total:
        return 0.0
    return done / total * 100


def fetch_scalar(cur, query):
    """Execute ``query`` on ``cur`` and return the first column of its first row."""
    cur.execute(query)
    return cur.fetchone()[0]


def summary_lines(total_hotels, hotels_with_raw, hotels_with_processed,
                  total_raw_pages, total_processed_pages, recently_processed):
    """Build the lines of the overall statistics report.

    Args:
        total_hotels: Row count of ``hotel_main``.
        hotels_with_raw: Distinct ``hotel_id`` count in ``hotel_website_raw``.
        hotels_with_processed: Distinct ``hotel_id`` count in
            ``hotel_website_processed``.
        total_raw_pages: Row count of ``hotel_website_raw``.
        total_processed_pages: Row count of ``hotel_website_processed``.
        recently_processed: Distinct hotels processed in the last 24 hours.

    Returns:
        A list of strings, one per output line, in print order.
    """
    percent = progress_percent(hotels_with_processed, total_hotels)
    return [
        '📊 СТАТИСТИКА ПАРСИНГА:',
        f' 🏨 Всего отелей: {total_hotels}',
        f' 🌐 Отелей с raw данными: {hotels_with_raw}',
        f' ✅ Отелей с processed данными: {hotels_with_processed}',
        f' 📄 Всего raw страниц: {total_raw_pages:,}',
        f' 📄 Всего processed страниц: {total_processed_pages:,}',
        f' ⏰ За последние 24ч: {recently_processed}',
        f' 📈 Общий прогресс: {hotels_with_processed}/{total_hotels} ({percent:.1f}%)',
    ]


def recent_lines(recent_hotels):
    """Build the lines of the "processed within the last hour" section.

    Args:
        recent_hotels: Iterable of ``(hotel_id, pages_count, last_update)``
            rows, where ``last_update`` supports ``strftime``.

    Returns:
        A list of strings; empty when there are no rows, so nothing is printed.
    """
    if not recent_hotels:
        return []
    lines = ['\n🔄 ПОСЛЕДНИЕ ОБРАБОТАННЫЕ ОТЕЛИ (за час):']
    for hotel_id, pages_count, last_update in recent_hotels:
        lines.append(
            f' {hotel_id}: {pages_count} страниц в {last_update.strftime("%H:%M:%S")}'
        )
    return lines


def main():
    """Connect to the database and print crawl/processing statistics."""
    # Database connection (same password as used in mass_crawler.py).
    # NOTE(review): host and credentials are hard-coded in source; they should
    # be moved to environment variables or a secrets store and rotated.
    conn = psycopg2.connect(
        host='147.45.189.234',
        database='default_db',
        user='gen_user',
        password=unquote("2~~9_%5EkVsU%3F2%5CS")
    )
    try:
        # The cursor context manager closes the cursor even if a query fails.
        with conn.cursor() as cur:
            # Total number of hotels.
            total_hotels = fetch_scalar(cur, 'SELECT COUNT(*) FROM hotel_main')

            # Hotels that have raw website data.
            hotels_with_raw = fetch_scalar(
                cur, 'SELECT COUNT(DISTINCT hotel_id) FROM hotel_website_raw')

            # Hotels that have processed data.
            hotels_with_processed = fetch_scalar(
                cur, 'SELECT COUNT(DISTINCT hotel_id) FROM hotel_website_processed')

            # Total number of pages (raw and processed).
            total_raw_pages = fetch_scalar(
                cur, 'SELECT COUNT(*) FROM hotel_website_raw')
            total_processed_pages = fetch_scalar(
                cur, 'SELECT COUNT(*) FROM hotel_website_processed')

            # Hotels processed recently (within the last 24 hours).
            recently_processed = fetch_scalar(cur, """
                SELECT COUNT(DISTINCT hotel_id)
                FROM hotel_website_processed
                WHERE processed_at > NOW() - INTERVAL '24 hours'
            """)

            for line in summary_lines(
                total_hotels,
                hotels_with_raw,
                hotels_with_processed,
                total_raw_pages,
                total_processed_pages,
                recently_processed,
            ):
                print(line)

            # Check crawler activity: the five hotels most recently updated
            # within the last hour.
            cur.execute("""
                SELECT hotel_id, COUNT(*) as pages_count, MAX(processed_at) as last_update
                FROM hotel_website_processed
                WHERE processed_at > NOW() - INTERVAL '1 hour'
                GROUP BY hotel_id
                ORDER BY last_update DESC
                LIMIT 5
            """)

            for line in recent_lines(cur.fetchall()):
                print(line)
    finally:
        # Always release the connection, including on query errors.
        conn.close()


if __name__ == '__main__':
    main()
|