#!/usr/bin/env python3
"""Multi-threaded hotel website crawler driven by the Browserless API.

For each hotel with a failed previous crawl, tries several URL variants
(http/https, www/no-www), renders the page through a remote Browserless
instance, and stores the raw HTML plus crawl metadata in PostgreSQL.

Usage: script.py [region|None] [limit|None] [workers|None]
"""

import logging
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from urllib.parse import unquote

import psycopg2
import requests
from psycopg2.extras import RealDictCursor

# Logging: one timestamped file per run plus console; thread name included
# so interleaved worker output stays attributable.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - [%(threadName)s] - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(
            f'browserless_parallel_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
        ),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

# Configuration
BROWSERLESS_URL = "http://147.45.146.17:3000/function?token=9ahhnpjkchxtcho9"
MAX_WORKERS = 5  # default number of parallel worker threads

DB_CONFIG = {
    'host': '147.45.189.234',
    'port': 5432,
    'database': 'default_db',
    'user': 'gen_user',
    'password': unquote('2~~9_%5EkVsU%3F2%5CS'),
}

# Thread-safe global counters; every mutation/read must hold stats_lock.
stats_lock = threading.Lock()
stats = {'success': 0, 'failed': 0, 'processed': 0}

# JavaScript function executed remotely by Browserless (/function endpoint).
# Returns {html, title, url, status} on success, status 0 + error otherwise.
BROWSER_FUNCTION = """
export default async function ({ page, context }) {
  const targetUrl = context.target_url;

  await page.setViewport({ width: 1920, height: 1080 });
  await page.setExtraHTTPHeaders({
    "Accept-Language": "ru,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Upgrade-Insecure-Requests": "1",
  });
  await page.setUserAgent(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
  );

  try {
    await page.goto(targetUrl, { waitUntil: "networkidle2", timeout: 30000 });

    // Best-effort dismissal of cookie banners; failures are ignored.
    try {
      await page.waitForSelector(
        ".cookie-accept, .cookie-close, [class*='cookie'] button",
        { timeout: 2000 }
      );
      const btns = await page.$$(".cookie-accept, .cookie-close, [class*='cookie'] button");
      if (btns[0]) await btns[0].click();
    } catch (_) {}

    await page.waitForTimeout(1000);

    const data = await page.evaluate(() => {
      return {
        html: document.documentElement.outerHTML,
        title: document.title,
        url: window.location.href,
        status: 200
      };
    });
    return data;
  } catch (error) {
    return { html: null, title: null, url: targetUrl, status: 0, error: error.message };
  }
}
"""


def normalize_url(url: str) -> list:
    """Build an ordered, de-duplicated list of URL variants to try.

    If the address already carries a scheme, only the http/https pair is
    produced; otherwise both schemes and both www/no-www forms are tried,
    preferring https first. Order encodes priority for the caller.
    """
    url = url.strip()
    variants = []

    if url.startswith('http://') or url.startswith('https://'):
        variants.append(url)
        if url.startswith('https://'):
            variants.append(url.replace('https://', 'http://'))
        else:
            variants.append(url.replace('http://', 'https://'))
    else:
        variants.append(f"https://{url}")
        variants.append(f"http://{url}")
        if url.startswith('www.'):
            url_no_www = url[4:]
            variants.append(f"https://{url_no_www}")
            variants.append(f"http://{url_no_www}")
        else:
            variants.append(f"https://www.{url}")
            variants.append(f"http://www.{url}")

    # Drop accidental duplicates while preserving priority order.
    return list(dict.fromkeys(variants))


def crawl_with_browserless(url: str) -> dict:
    """Render *url* through the Browserless /function endpoint.

    Returns the Browserless JSON payload on HTTP 200; otherwise a dict
    with ``html=None`` and an ``error`` message. Never raises.
    """
    try:
        payload = {
            "code": BROWSER_FUNCTION,
            "context": {"target_url": url},
        }
        # 60s outer timeout > 30s in-page navigation timeout.
        response = requests.post(BROWSERLESS_URL, json=payload, timeout=60)
        if response.status_code == 200:
            return response.json()
        return {"html": None, "error": f"HTTP {response.status_code}"}
    except Exception as e:
        return {"html": None, "error": str(e)}


def save_to_db(hotel_id: str, url: str, result: dict) -> bool:
    """Persist a crawl result (each worker opens its own connection).

    On success writes the raw HTML row and upserts 'completed' metadata;
    otherwise upserts 'failed' metadata with the error message.
    Returns True iff HTML was stored.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # Context-managed cursor: always closed, even if cursor()/execute()
        # raises (the original could hit an unbound `cur` in `finally`).
        with conn.cursor() as cur:
            cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))

            if result and result.get('html') and result.get('html') != 'null':
                cur.execute("""
                    INSERT INTO hotel_website_raw
                        (hotel_id, url, html, page_title, crawled_at, status_code)
                    VALUES (%s, %s, %s, %s, %s, %s)
                """, (
                    hotel_id,
                    result.get('url', url),
                    result['html'],
                    result.get('title'),
                    datetime.now(),
                    result.get('status', 200),
                ))
                cur.execute("""
                    INSERT INTO hotel_website_meta
                        (hotel_id, main_url, pages_crawled, total_size_bytes,
                         crawl_status, crawl_started_at, crawl_finished_at)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        main_url = EXCLUDED.main_url,
                        pages_crawled = EXCLUDED.pages_crawled,
                        total_size_bytes = EXCLUDED.total_size_bytes,
                        crawl_status = EXCLUDED.crawl_status,
                        crawl_finished_at = EXCLUDED.crawl_finished_at,
                        error_message = NULL
                """, (
                    hotel_id, url, 1, len(result['html']),
                    'completed', datetime.now(), datetime.now(),
                ))
                conn.commit()
                return True

            error_msg = result.get('error', 'No HTML') if result else 'No response'
            cur.execute("""
                INSERT INTO hotel_website_meta
                    (hotel_id, main_url, crawl_status, error_message,
                     crawl_started_at, crawl_finished_at)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (hotel_id) DO UPDATE SET
                    crawl_status = EXCLUDED.crawl_status,
                    error_message = EXCLUDED.error_message,
                    crawl_finished_at = EXCLUDED.crawl_finished_at
            """, (hotel_id, url, 'failed', error_msg, datetime.now(), datetime.now()))
            conn.commit()
            return False
    finally:
        conn.close()


def _record_outcome(success: bool) -> None:
    """Thread-safely bump the shared success/failed and processed counters."""
    with stats_lock:
        stats['success' if success else 'failed'] += 1
        stats['processed'] += 1


def process_hotel(hotel: dict, total: int, index: int) -> bool:
    """Crawl a single hotel: try URL variants until one yields HTML.

    Returns True when HTML was obtained and stored, False otherwise.
    Designed to never raise (runs inside a thread pool).
    """
    try:
        logger.info(f"[{index}/{total}] {hotel['full_name'][:50]}")

        url_variants = normalize_url(hotel['website_address'])
        result = None
        working_url = None

        for variant in url_variants:
            result = crawl_with_browserless(variant)
            if result and result.get('html') and result.get('html') != 'null':
                working_url = variant
                logger.info(f"  ✅ Найден: {variant}")
                break
            time.sleep(0.3)  # small pause between variant attempts

        if working_url and result:
            if save_to_db(hotel['id'], working_url, result):
                _record_outcome(success=True)
                return True

        # No variant produced HTML (or the save failed): record the failure.
        save_to_db(hotel['id'], hotel['website_address'],
                   {"html": None, "error": "All variants failed"})
        _record_outcome(success=False)
        return False

    except Exception as e:
        logger.error(f"  ❌ Ошибка: {e}")
        _record_outcome(success=False)
        return False


def _fetch_hotels(region, limit):
    """Load hotels whose previous crawl failed, optionally filtered/limited."""
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    try:
        with conn.cursor() as cur:
            query = """
                SELECT h.id, h.full_name, h.website_address
                FROM hotel_main h
                INNER JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
                WHERE hwm.crawl_status = 'failed'
                  AND h.website_address IS NOT NULL
                  AND h.website_address != ''
            """
            if region:
                query += " AND h.region_name = %s"
                cur.execute(query, (region,))
            else:
                cur.execute(query)
            hotels = cur.fetchall()
    finally:
        conn.close()
    return hotels[:limit] if limit else hotels


def main():
    """CLI entry point: argv = [region|None] [limit|None] [workers|None]."""
    region = sys.argv[1] if len(sys.argv) > 1 and sys.argv[1] != 'None' else None
    limit = int(sys.argv[2]) if len(sys.argv) > 2 and sys.argv[2] != 'None' else None
    workers = int(sys.argv[3]) if len(sys.argv) > 3 and sys.argv[3] != 'None' else MAX_WORKERS

    hotels = _fetch_hotels(region, limit)

    logger.info("=" * 70)
    logger.info("🚀 МНОГОПОТОЧНЫЙ BROWSERLESS КРАУЛЕР")
    if region:
        logger.info(f"📍 Регион: {region}")
    logger.info(f"📊 Отелей: {len(hotels)}")
    logger.info(f"🔧 Потоков: {workers}")
    logger.info("=" * 70)

    start_time = time.time()

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {
            executor.submit(process_hotel, hotel, len(hotels), i): hotel
            for i, hotel in enumerate(hotels, 1)
        }
        for future in as_completed(futures):
            try:
                future.result()

                # Snapshot counters under the lock: workers mutate them
                # concurrently, so an unlocked multi-field read could tear.
                with stats_lock:
                    processed = stats['processed']
                    success = stats['success']
                    failed = stats['failed']

                # Progress report every 50 hotels.
                if processed % 50 == 0:
                    elapsed = time.time() - start_time
                    rate = processed / elapsed if elapsed > 0 else 0
                    remaining = (len(hotels) - processed) / rate if rate > 0 else 0
                    logger.info("")
                    logger.info("📊 ПРОМЕЖУТОЧНАЯ СТАТИСТИКА:")
                    logger.info(f"   Обработано: {processed}/{len(hotels)}")
                    logger.info(f"   Успешно: {success}")
                    logger.info(f"   Ошибок: {failed}")
                    logger.info(f"   Скорость: {rate:.2f} отелей/сек")
                    logger.info(f"   Осталось: ~{remaining/60:.1f} мин")
                    logger.info("")
            except Exception as e:
                logger.error(f"Future error: {e}")

    elapsed = time.time() - start_time
    # Guard against division by zero when the hotel list is empty.
    final_rate = len(hotels) / elapsed if elapsed > 0 else 0.0
    logger.info("\n" + "=" * 70)
    logger.info("✅ ЗАВЕРШЕНО!")
    logger.info(f"   Успешно: {stats['success']}")
    logger.info(f"   Ошибок: {stats['failed']}")
    logger.info(f"   Время: {elapsed/60:.1f} мин")
    logger.info(f"   Скорость: {final_rate:.2f} отелей/сек")
    logger.info("=" * 70)


if __name__ == "__main__":
    main()