#!/usr/bin/env python3 """ Перекраулинг failed отелей Питера с более мягкими настройками """ import psycopg2 from psycopg2.extras import RealDictCursor from urllib.parse import unquote from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout import logging import sys from datetime import datetime import re logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler(f'retry_spb_failed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'), logging.StreamHandler(sys.stdout) ] ) DB_CONFIG = { 'host': '147.45.189.234', 'port': 5432, 'database': 'default_db', 'user': 'gen_user', 'password': unquote('2~~9_%5EkVsU%3F2%5CS') } def normalize_url(url): """Нормализовать URL""" if not url: return None url = url.strip() if not url.startswith(('http://', 'https://')): # Попробуем сначала https return f'https://{url}' return url def try_http_fallback(url): """Попробовать HTTP если HTTPS не работает""" if url.startswith('https://'): return url.replace('https://', 'http://') return None def crawl_hotel(hotel_id, hotel_name, website_address): """Краулинг одного отеля""" url = normalize_url(website_address) if not url: logging.warning(f" ⚠️ Нет URL") return False conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor) cur = conn.cursor() try: with sync_playwright() as p: browser = p.chromium.launch(headless=True) context = browser.new_context( user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', ignore_https_errors=True, # Игнорировать SSL ошибки java_script_enabled=True ) page = context.new_page() # Пробуем HTTPS try: logging.info(f" 🌐 Пробуем: {url}") page.goto(url, wait_until='domcontentloaded', timeout=60000) # 60 секунд html = page.content() if html and len(html) > 100: # Успешно! cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,)) cur.execute(""" INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at) VALUES (%s, %s, %s, %s) """, (hotel_id, url, html, datetime.now())) cur.execute(""" INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (hotel_id) DO UPDATE SET crawl_status = EXCLUDED.crawl_status, pages_crawled = EXCLUDED.pages_crawled, total_size_bytes = EXCLUDED.total_size_bytes, crawl_started_at = EXCLUDED.crawl_started_at, crawl_finished_at = EXCLUDED.crawl_finished_at, error_message = NULL """, (hotel_id, 'completed', 1, len(html), datetime.now(), datetime.now())) conn.commit() logging.info(f" ✅ Успешно! {len(html):,} байт") browser.close() cur.close() conn.close() return True except Exception as e: # Пробуем HTTP http_url = try_http_fallback(url) if http_url: try: logging.info(f" 🔄 Пробуем HTTP: {http_url}") page.goto(http_url, wait_until='domcontentloaded', timeout=60000) html = page.content() if html and len(html) > 100: cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,)) cur.execute(""" INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at) VALUES (%s, %s, %s, %s) """, (hotel_id, http_url, html, datetime.now())) cur.execute(""" INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (hotel_id) DO UPDATE SET crawl_status = EXCLUDED.crawl_status, pages_crawled = EXCLUDED.pages_crawled, total_size_bytes = EXCLUDED.total_size_bytes, crawl_started_at = EXCLUDED.crawl_started_at, crawl_finished_at = EXCLUDED.crawl_finished_at, error_message = NULL """, (hotel_id, 'completed', 1, len(html), datetime.now(), datetime.now())) conn.commit() logging.info(f" ✅ HTTP сработал! {len(html):,} байт") browser.close() cur.close() conn.close() return True except Exception as e2: logging.error(f" ❌ HTTP тоже не сработал: {str(e2)[:100]}") raise e # Вернём оригинальную ошибку else: raise browser.close() except Exception as e: error_msg = str(e)[:500] logging.error(f" ❌ Ошибка: {error_msg}") # Обновить статус как failed cur.execute(""" INSERT INTO hotel_website_meta (hotel_id, crawl_status, error_message, crawl_started_at, crawl_finished_at) VALUES (%s, %s, %s, %s, %s) ON CONFLICT (hotel_id) DO UPDATE SET crawl_status = EXCLUDED.crawl_status, error_message = EXCLUDED.error_message, crawl_started_at = EXCLUDED.crawl_started_at, crawl_finished_at = EXCLUDED.crawl_finished_at """, (hotel_id, 'failed', error_msg, datetime.now(), datetime.now())) conn.commit() finally: cur.close() conn.close() return False def main(): conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor) cur = conn.cursor() # Получить failed отели cur.execute(""" SELECT h.id, h.full_name, h.website_address FROM hotel_main h JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id WHERE h.region_name = 'г. Санкт-Петербург' AND hwm.crawl_status = 'failed' ORDER BY h.full_name """) hotels = cur.fetchall() total = len(hotels) cur.close() conn.close() logging.info("=" * 60) logging.info("🔄 ПЕРЕКРАУЛИНГ FAILED ОТЕЛЕЙ ПИТЕРА") logging.info("=" * 60) logging.info(f"Всего отелей: {total}") logging.info("") success = 0 failed = 0 for i, hotel in enumerate(hotels, 1): logging.info(f"🏨 [{i}/{total}] {hotel['full_name']}") if crawl_hotel(hotel['id'], hotel['full_name'], hotel['website_address']): success += 1 else: failed += 1 if i % 10 == 0: logging.info(f" 📊 Прогресс: {success} успешно, {failed} ошибок") logging.info("") logging.info("=" * 60) logging.info("🎉 ПЕРЕКРАУЛИНГ ЗАВЕРШЁН") logging.info("=" * 60) logging.info(f"✅ Успешно: {success}") logging.info(f"❌ Ошибок: {failed}") logging.info(f"📊 Успех: {success*100//total if total else 0}%") if __name__ == '__main__': main()