#!/usr/bin/env python3
"""
Re-crawl of hotels whose crawl status is 'failed'.

Softer settings: HTTP fallback, ignoring SSL errors.

NOTE(review): this file does not apply these settings itself - it only
selects the failed hotels, logs error statistics and writes the list to a
file for the crawler scripts.
"""
|
|||
|
|
|
|||
|
|
import logging
import os
from datetime import datetime
from urllib.parse import unquote

import psycopg2
from psycopg2.extras import RealDictCursor
|
|||
|
|
|
|||
|
|
# Logging setup: every record goes both to a timestamped log file created in
# the current working directory and to the console (StreamHandler -> stderr).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: the messages contain Cyrillic text and emoji, which
        # cannot be encoded with some platform default encodings (e.g. cp1251).
        logging.FileHandler(
            f'retry_failed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            encoding='utf-8',
        ),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
# Database connection settings, passed as keyword arguments to
# psycopg2.connect().
# Every value can be overridden through the matching HOTEL_DB_* environment
# variable; the literals below are only the fallback defaults.
# NOTE(review): the fallback credentials are committed in plain text - they
# should be rotated and supplied through the environment only.
DB_CONFIG = {
    'host': os.environ.get('HOTEL_DB_HOST', '147.45.189.234'),
    'port': int(os.environ.get('HOTEL_DB_PORT', '5432')),
    'database': os.environ.get('HOTEL_DB_NAME', 'default_db'),
    'user': os.environ.get('HOTEL_DB_USER', 'gen_user'),
    # The default is stored percent-encoded, hence unquote(); a value taken
    # from the environment is used verbatim.
    'password': os.environ.get(
        'HOTEL_DB_PASSWORD', unquote('2~~9_%5EkVsU%3F2%5CS')
    ),
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_failed_hotels(region_name=None):
    """Fetch the hotels whose website crawl status is 'failed'.

    Args:
        region_name: Optional region to filter by (compared with
            ``hotel_main.region_name``). ``None`` or an empty string
            selects hotels from all regions.

    Returns:
        The result of ``cursor.fetchall()``: a list of rows produced by
        ``RealDictCursor`` with the keys ``id``, ``full_name``,
        ``website_address`` and ``error_message``.

    Raises:
        Any exception raised by psycopg2 while connecting or querying is
        propagated; the cursor and the connection are closed first.
    """
    query = """
        SELECT h.id, h.full_name, h.website_address, hwm.error_message
        FROM hotel_main h
        INNER JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
        WHERE hwm.crawl_status = 'failed'
    """
    # None means "no parameters", so the query text is sent as-is.
    params = None
    if region_name:
        query += " AND h.region_name = %s"
        params = (region_name,)

    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    try:
        # The cursor context manager closes the cursor on exit.
        with conn.cursor() as cur:
            cur.execute(query, params)
            return cur.fetchall()
    finally:
        # `with conn:` would only end the transaction, not close the
        # connection, so close it explicitly even when the query fails.
        conn.close()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Log a summary of the failed hotels and export them to a text file.

    An optional region name is read from ``sys.argv[1]``; without it (or
    with an empty value) hotels from all regions are selected.

    The function logs the number of failed hotels and a per-error-type
    count (the error type is the part of ``error_message`` before the first
    ``':'``, or ``'Unknown'`` when the message is empty), writes one
    tab-separated ``id, full_name, website_address`` line per hotel to
    ``failed_hotels_<region or 'all'>_<timestamp>.txt`` and logs the first
    ten hotels. It returns early when there are no failed hotels and does
    not crawl anything itself.
    """
    import sys
    from collections import Counter

    region = sys.argv[1] if len(sys.argv) > 1 else None

    banner = "=" * 70
    logger.info(banner)
    logger.info("🔄 ПЕРЕКРАУЛИНГ FAILED ОТЕЛЕЙ")
    if region:
        logger.info(f"📍 Регион: {region}")
    else:
        logger.info("📍 Регион: ВСЕ")
    logger.info(banner)

    # Load the failed hotels.
    hotels = get_failed_hotels(region)
    logger.info(f"\n📊 Найдено {len(hotels)} failed отелей")

    if not hotels:
        logger.info("✅ Нет failed отелей!")
        return

    # Error statistics. split(':')[0] already returns the whole message when
    # it contains no ':', so no separate membership check is needed.
    error_counts = Counter(
        (hotel['error_message'] or 'Unknown').split(':')[0]
        for hotel in hotels
    )

    logger.info("\n📊 СТАТИСТИКА ОШИБОК:")
    # most_common() sorts by count, descending, keeping first-seen order
    # for equal counts.
    for error_type, count in error_counts.most_common():
        logger.info(f" {error_type}: {count}")

    # Save the list to a file for the crawler.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"failed_hotels_{region or 'all'}_{timestamp}.txt"

    # Explicit UTF-8: hotel names may contain characters that the platform
    # default encoding cannot represent.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{hotel['id']}\t{hotel['full_name']}\t{hotel['website_address']}\n"
            for hotel in hotels
        )

    logger.info(f"\n💾 Список сохранён в: {filename}")
    logger.info("\n📋 ЗАПУСК КРАУЛЕРА:")
    logger.info(" Можно запустить smart_crawler.py с этим списком")
    logger.info(" Или использовать single_hotel_crawler.py для каждого отеля")

    # Show the first 10 hotels.
    logger.info("\n📋 ПЕРВЫЕ 10 ОТЕЛЕЙ:")
    for i, hotel in enumerate(hotels[:10], 1):
        logger.info(f" {i}. {hotel['full_name']}")
        logger.info(f" Сайт: {hotel['website_address']}")
        logger.info(f" ID: {hotel['id']}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run main() only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|||
|
|
|
|||
|
|
|
|||
|
|
|