218 lines
8.5 KiB
Python
218 lines
8.5 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Перекраулинг failed отелей Питера с более мягкими настройками
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import psycopg2
|
|||
|
|
from psycopg2.extras import RealDictCursor
|
|||
|
|
from urllib.parse import unquote
|
|||
|
|
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
|
|||
|
|
import logging
|
|||
|
|
import sys
|
|||
|
|
from datetime import datetime
|
|||
|
|
import re
|
|||
|
|
|
|||
|
|
logging.basicConfig(
|
|||
|
|
level=logging.INFO,
|
|||
|
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
|||
|
|
handlers=[
|
|||
|
|
logging.FileHandler(f'retry_spb_failed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
|
|||
|
|
logging.StreamHandler(sys.stdout)
|
|||
|
|
]
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
DB_CONFIG = {
|
|||
|
|
'host': '147.45.189.234',
|
|||
|
|
'port': 5432,
|
|||
|
|
'database': 'default_db',
|
|||
|
|
'user': 'gen_user',
|
|||
|
|
'password': unquote('2~~9_%5EkVsU%3F2%5CS')
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
def normalize_url(url):
|
|||
|
|
"""Нормализовать URL"""
|
|||
|
|
if not url:
|
|||
|
|
return None
|
|||
|
|
url = url.strip()
|
|||
|
|
if not url.startswith(('http://', 'https://')):
|
|||
|
|
# Попробуем сначала https
|
|||
|
|
return f'https://{url}'
|
|||
|
|
return url
|
|||
|
|
|
|||
|
|
def try_http_fallback(url):
|
|||
|
|
"""Попробовать HTTP если HTTPS не работает"""
|
|||
|
|
if url.startswith('https://'):
|
|||
|
|
return url.replace('https://', 'http://')
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
def crawl_hotel(hotel_id, hotel_name, website_address):
|
|||
|
|
"""Краулинг одного отеля"""
|
|||
|
|
url = normalize_url(website_address)
|
|||
|
|
if not url:
|
|||
|
|
logging.warning(f" ⚠️ Нет URL")
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
|
|||
|
|
cur = conn.cursor()
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
with sync_playwright() as p:
|
|||
|
|
browser = p.chromium.launch(headless=True)
|
|||
|
|
context = browser.new_context(
|
|||
|
|
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
|||
|
|
ignore_https_errors=True, # Игнорировать SSL ошибки
|
|||
|
|
java_script_enabled=True
|
|||
|
|
)
|
|||
|
|
page = context.new_page()
|
|||
|
|
|
|||
|
|
# Пробуем HTTPS
|
|||
|
|
try:
|
|||
|
|
logging.info(f" 🌐 Пробуем: {url}")
|
|||
|
|
page.goto(url, wait_until='domcontentloaded', timeout=60000) # 60 секунд
|
|||
|
|
html = page.content()
|
|||
|
|
|
|||
|
|
if html and len(html) > 100:
|
|||
|
|
# Успешно!
|
|||
|
|
cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
|
|||
|
|
cur.execute("""
|
|||
|
|
INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at)
|
|||
|
|
VALUES (%s, %s, %s, %s)
|
|||
|
|
""", (hotel_id, url, html, datetime.now()))
|
|||
|
|
|
|||
|
|
cur.execute("""
|
|||
|
|
INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at)
|
|||
|
|
VALUES (%s, %s, %s, %s, %s, %s)
|
|||
|
|
ON CONFLICT (hotel_id) DO UPDATE SET
|
|||
|
|
crawl_status = EXCLUDED.crawl_status,
|
|||
|
|
pages_crawled = EXCLUDED.pages_crawled,
|
|||
|
|
total_size_bytes = EXCLUDED.total_size_bytes,
|
|||
|
|
crawl_started_at = EXCLUDED.crawl_started_at,
|
|||
|
|
crawl_finished_at = EXCLUDED.crawl_finished_at,
|
|||
|
|
error_message = NULL
|
|||
|
|
""", (hotel_id, 'completed', 1, len(html), datetime.now(), datetime.now()))
|
|||
|
|
|
|||
|
|
conn.commit()
|
|||
|
|
logging.info(f" ✅ Успешно! {len(html):,} байт")
|
|||
|
|
browser.close()
|
|||
|
|
cur.close()
|
|||
|
|
conn.close()
|
|||
|
|
return True
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
# Пробуем HTTP
|
|||
|
|
http_url = try_http_fallback(url)
|
|||
|
|
if http_url:
|
|||
|
|
try:
|
|||
|
|
logging.info(f" 🔄 Пробуем HTTP: {http_url}")
|
|||
|
|
page.goto(http_url, wait_until='domcontentloaded', timeout=60000)
|
|||
|
|
html = page.content()
|
|||
|
|
|
|||
|
|
if html and len(html) > 100:
|
|||
|
|
cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
|
|||
|
|
cur.execute("""
|
|||
|
|
INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at)
|
|||
|
|
VALUES (%s, %s, %s, %s)
|
|||
|
|
""", (hotel_id, http_url, html, datetime.now()))
|
|||
|
|
|
|||
|
|
cur.execute("""
|
|||
|
|
INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at)
|
|||
|
|
VALUES (%s, %s, %s, %s, %s, %s)
|
|||
|
|
ON CONFLICT (hotel_id) DO UPDATE SET
|
|||
|
|
crawl_status = EXCLUDED.crawl_status,
|
|||
|
|
pages_crawled = EXCLUDED.pages_crawled,
|
|||
|
|
total_size_bytes = EXCLUDED.total_size_bytes,
|
|||
|
|
crawl_started_at = EXCLUDED.crawl_started_at,
|
|||
|
|
crawl_finished_at = EXCLUDED.crawl_finished_at,
|
|||
|
|
error_message = NULL
|
|||
|
|
""", (hotel_id, 'completed', 1, len(html), datetime.now(), datetime.now()))
|
|||
|
|
|
|||
|
|
conn.commit()
|
|||
|
|
logging.info(f" ✅ HTTP сработал! {len(html):,} байт")
|
|||
|
|
browser.close()
|
|||
|
|
cur.close()
|
|||
|
|
conn.close()
|
|||
|
|
return True
|
|||
|
|
except Exception as e2:
|
|||
|
|
logging.error(f" ❌ HTTP тоже не сработал: {str(e2)[:100]}")
|
|||
|
|
raise e # Вернём оригинальную ошибку
|
|||
|
|
else:
|
|||
|
|
raise
|
|||
|
|
|
|||
|
|
browser.close()
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
error_msg = str(e)[:500]
|
|||
|
|
logging.error(f" ❌ Ошибка: {error_msg}")
|
|||
|
|
|
|||
|
|
# Обновить статус как failed
|
|||
|
|
cur.execute("""
|
|||
|
|
INSERT INTO hotel_website_meta (hotel_id, crawl_status, error_message, crawl_started_at, crawl_finished_at)
|
|||
|
|
VALUES (%s, %s, %s, %s, %s)
|
|||
|
|
ON CONFLICT (hotel_id) DO UPDATE SET
|
|||
|
|
crawl_status = EXCLUDED.crawl_status,
|
|||
|
|
error_message = EXCLUDED.error_message,
|
|||
|
|
crawl_started_at = EXCLUDED.crawl_started_at,
|
|||
|
|
crawl_finished_at = EXCLUDED.crawl_finished_at
|
|||
|
|
""", (hotel_id, 'failed', error_msg, datetime.now(), datetime.now()))
|
|||
|
|
conn.commit()
|
|||
|
|
|
|||
|
|
finally:
|
|||
|
|
cur.close()
|
|||
|
|
conn.close()
|
|||
|
|
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
def main():
|
|||
|
|
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
|
|||
|
|
cur = conn.cursor()
|
|||
|
|
|
|||
|
|
# Получить failed отели
|
|||
|
|
cur.execute("""
|
|||
|
|
SELECT h.id, h.full_name, h.website_address
|
|||
|
|
FROM hotel_main h
|
|||
|
|
JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
|
|||
|
|
WHERE h.region_name = 'г. Санкт-Петербург'
|
|||
|
|
AND hwm.crawl_status = 'failed'
|
|||
|
|
ORDER BY h.full_name
|
|||
|
|
""")
|
|||
|
|
hotels = cur.fetchall()
|
|||
|
|
total = len(hotels)
|
|||
|
|
|
|||
|
|
cur.close()
|
|||
|
|
conn.close()
|
|||
|
|
|
|||
|
|
logging.info("=" * 60)
|
|||
|
|
logging.info("🔄 ПЕРЕКРАУЛИНГ FAILED ОТЕЛЕЙ ПИТЕРА")
|
|||
|
|
logging.info("=" * 60)
|
|||
|
|
logging.info(f"Всего отелей: {total}")
|
|||
|
|
logging.info("")
|
|||
|
|
|
|||
|
|
success = 0
|
|||
|
|
failed = 0
|
|||
|
|
|
|||
|
|
for i, hotel in enumerate(hotels, 1):
|
|||
|
|
logging.info(f"🏨 [{i}/{total}] {hotel['full_name']}")
|
|||
|
|
|
|||
|
|
if crawl_hotel(hotel['id'], hotel['full_name'], hotel['website_address']):
|
|||
|
|
success += 1
|
|||
|
|
else:
|
|||
|
|
failed += 1
|
|||
|
|
|
|||
|
|
if i % 10 == 0:
|
|||
|
|
logging.info(f" 📊 Прогресс: {success} успешно, {failed} ошибок")
|
|||
|
|
|
|||
|
|
logging.info("")
|
|||
|
|
logging.info("=" * 60)
|
|||
|
|
logging.info("🎉 ПЕРЕКРАУЛИНГ ЗАВЕРШЁН")
|
|||
|
|
logging.info("=" * 60)
|
|||
|
|
logging.info(f"✅ Успешно: {success}")
|
|||
|
|
logging.info(f"❌ Ошибок: {failed}")
|
|||
|
|
logging.info(f"📊 Успех: {success*100//total if total else 0}%")
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
|
|||
|
|
main()
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|