#!/usr/bin/env python3
"""
Hotel crawler driven through the Browserless API.

Uses http://147.45.146.17:3000/function for more reliable page scraping.
"""

import requests
import psycopg2
from psycopg2.extras import RealDictCursor
from urllib.parse import unquote
import logging
from datetime import datetime
import json
import time

# Logging setup: INFO level, mirrored to a timestamped log file and stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'browserless_crawler_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Configuration.
# NOTE(review): the Browserless token and database credentials are
# hard-coded here; consider moving them to environment variables or a
# secrets store before sharing this file.
BROWSERLESS_URL = "http://147.45.146.17:3000/function?token=9ahhnpjkchxtcho9"
DB_CONFIG = {
    'host': '147.45.189.234',
    'port': 5432,
    'database': 'default_db',
    'user': 'gen_user',
    # The password is stored URL-encoded; unquote() decodes it at import time.
    'password': unquote('2~~9_%5EkVsU%3F2%5CS')
}

# JavaScript function executed remotely by the Browserless /function endpoint.
# The string below is sent verbatim as the request payload — its contents
# (including the JS comments) are part of the runtime payload, so they are
# intentionally left untouched.
BROWSER_FUNCTION = """
export default async function ({ page, context }) {
  const targetUrl = context.target_url;

  // Настройка браузера для обхода блокировок
  await page.setViewport({ width: 1920, height: 1080 });
  await page.setExtraHTTPHeaders({
    "Accept-Language": "ru,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Upgrade-Insecure-Requests": "1",
  });
  await page.setUserAgent(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
  );

  try {
    // Попытка загрузки страницы
    await page.goto(targetUrl, {
      waitUntil: "networkidle2",
      timeout: 30000
    });

    // Закрытие cookie баннеров
    try {
      await page.waitForSelector(
        ".cookie-accept, .cookie-close, .accept-cookies, [class*='cookie'] button",
        { timeout: 2000 }
      );
      const btns = await page.$$(
        ".cookie-accept, .cookie-close, .accept-cookies, [class*='cookie'] button"
      );
      if (btns[0]) await btns[0].click();
    } catch (_) {}

    // Ждём загрузки контента
    await page.waitForTimeout(1000);

    // Извлекаем HTML и метаданные
    const data = await page.evaluate(() => {
      return {
        html: document.documentElement.outerHTML,
        title: document.title,
        url: window.location.href,
        status: 200
      };
    });

    return data;

  } catch (error) {
    return {
      html: null,
      title: null,
      url: targetUrl,
      status: 0,
      error: error.message
    };
  }
}
"""
def crawl_with_browserless(url: str, hotel_id: str) -> dict:
    """Fetch a single page through the Browserless /function API.

    Posts BROWSER_FUNCTION together with the target URL and returns the
    JSON payload produced by the remote browser (keys: html, title, url,
    status, and possibly error). On any failure a dict with html=None and
    an 'error' message is returned instead of raising.
    """
    payload = {
        "code": BROWSER_FUNCTION,
        "context": {"target_url": url},
    }

    try:
        logger.info(f" 🌐 Отправка запроса в Browserless...")
        response = requests.post(BROWSERLESS_URL, json=payload, timeout=60)
        logger.info(f" 📡 Статус: {response.status_code}")

        # Non-200 from Browserless itself: surface a short error payload.
        if response.status_code != 200:
            logger.error(f" ❌ Browserless error: {response.status_code}")
            logger.error(f" {response.text[:200]}")
            return {"html": None, "error": f"HTTP {response.status_code}"}

        result = response.json()
        logger.info(f" 📄 Получено: {len(str(result.get('html', '')))} байт")
        return result

    except Exception as e:
        # Connection problems, timeouts, bad JSON — all reported uniformly.
        logger.error(f" ❌ Exception: {e}")
        return {"html": None, "error": str(e)}
def save_to_db(hotel_id: str, url: str, result: dict) -> bool:
    """Persist a crawl result for one hotel.

    When *result* contains HTML, the hotel's raw page is replaced and a
    'completed' row is upserted into hotel_website_meta; otherwise a
    'failed' row carrying the error message is upserted.

    Args:
        hotel_id: primary key of the hotel (hotel_main.id).
        url: the URL that was crawled (stored as main_url).
        result: dict from crawl_with_browserless(); expected keys are
            html, title, url, status and optionally error.

    Returns:
        True when the HTML was stored, False when a failure record was written.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # Create the cursor inside the outer try so conn.close() still runs
        # if cursor creation fails (the original referenced an undefined
        # `cur` in its finally clause in that case).
        cur = conn.cursor()
        try:
            # Remove previously stored raw pages for this hotel.
            cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))

            if result and result.get('html'):
                # Store the freshly crawled page.
                cur.execute("""
                    INSERT INTO hotel_website_raw (hotel_id, url, html, page_title, crawled_at, status_code)
                    VALUES (%s, %s, %s, %s, %s, %s)
                """, (
                    hotel_id,
                    result.get('url', url),
                    result['html'],
                    result.get('title'),
                    datetime.now(),
                    result.get('status', 200)
                ))

                # Upsert crawl metadata as 'completed' and clear any old error.
                cur.execute("""
                    INSERT INTO hotel_website_meta
                    (hotel_id, main_url, pages_crawled, total_size_bytes, crawl_status,
                     crawl_started_at, crawl_finished_at)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        main_url = EXCLUDED.main_url,
                        pages_crawled = EXCLUDED.pages_crawled,
                        total_size_bytes = EXCLUDED.total_size_bytes,
                        crawl_status = EXCLUDED.crawl_status,
                        crawl_finished_at = EXCLUDED.crawl_finished_at,
                        error_message = NULL
                """, (
                    hotel_id,
                    url,
                    1,
                    len(result['html']),
                    'completed',
                    datetime.now(),
                    datetime.now()
                ))

                conn.commit()
                return True
            else:
                # Crawl failed: record the failure reason in the meta table.
                error_msg = result.get('error', 'Unknown error') if result else 'No response'
                cur.execute("""
                    INSERT INTO hotel_website_meta
                    (hotel_id, main_url, crawl_status, error_message,
                     crawl_started_at, crawl_finished_at)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        crawl_status = EXCLUDED.crawl_status,
                        error_message = EXCLUDED.error_message,
                        crawl_finished_at = EXCLUDED.crawl_finished_at
                """, (
                    hotel_id,
                    url,
                    'failed',
                    error_msg,
                    datetime.now(),
                    datetime.now()
                ))
                conn.commit()
                return False
        except Exception:
            # Roll back the partial transaction before re-raising so the
            # connection is not closed mid-transaction in an aborted state.
            conn.rollback()
            raise
        finally:
            cur.close()
    finally:
        conn.close()
def normalize_url(url: str) -> list:
    """Build an ordered list of candidate URLs to try for a site.

    Produces scheme variants (https/http) and a www-toggled variant of the
    host, preferring https and the spelling given in the input.

    Bug fixed vs. the original: the www-toggle used to operate on the full
    URL *including* its scheme, so 'https://example.com' generated the
    nonsense candidate 'https://www.https://example.com'. The scheme is now
    stripped before toggling 'www.'. Duplicates are also removed while
    preserving order.

    Args:
        url: website address, with or without a scheme.

    Returns:
        List of unique absolute URLs, in the order they should be tried.
    """
    url = url.strip()

    # Bare host/path with any scheme removed — the www toggle works on this.
    bare = url
    for prefix in ('https://', 'http://'):
        if bare.startswith(prefix):
            bare = bare[len(prefix):]
            break

    candidates = []

    if url.startswith(('http://', 'https://')):
        # Keep the given URL first, then the alternative scheme.
        candidates.append(url)
        if url.startswith('https://'):
            candidates.append(f"http://{bare}")
        else:
            candidates.append(f"https://{bare}")
    else:
        # No scheme given: try https first, then http.
        candidates.append(f"https://{bare}")
        candidates.append(f"http://{bare}")

    # Toggle the www. prefix on the bare host.
    if bare.startswith('www.'):
        no_www = bare[4:]
        candidates.append(f"https://{no_www}")
        candidates.append(f"http://{no_www}")
    else:
        candidates.append(f"https://www.{bare}")
        candidates.append(f"http://www.{bare}")

    # De-duplicate while preserving first-seen order.
    seen = set()
    urls = []
    for candidate in candidates:
        if candidate not in seen:
            seen.add(candidate)
            urls.append(candidate)

    return urls
def process_failed_hotels(region_name=None, limit=None):
    """Re-crawl hotels whose previous crawl ended with status 'failed'.

    For each failed hotel, tries every URL variant from normalize_url()
    through Browserless until one returns HTML, then persists the result
    (or a failure record) via save_to_db().

    Args:
        region_name: optional filter on hotel_main.region_name.
        limit: optional cap on the number of hotels processed
            (applied client-side after fetching the full list).
    """
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    cur = conn.cursor()

    # Select hotels that previously failed and still have a website address.
    query = """
        SELECT h.id, h.full_name, h.website_address
        FROM hotel_main h
        INNER JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
        WHERE hwm.crawl_status = 'failed'
        AND h.website_address IS NOT NULL
        AND h.website_address != ''
    """

    if region_name:
        query += " AND h.region_name = %s"
        cur.execute(query, (region_name,))
    else:
        cur.execute(query)

    hotels = cur.fetchall()

    # Limit is applied in Python rather than via SQL LIMIT.
    if limit:
        hotels = hotels[:limit]

    cur.close()
    conn.close()

    logger.info("=" * 70)
    logger.info("🚀 BROWSERLESS КРАУЛЕР")
    if region_name:
        logger.info(f"📍 Регион: {region_name}")
    logger.info(f"📊 Отелей для обработки: {len(hotels)}")
    logger.info("=" * 70)

    success = 0
    failed = 0

    for i, hotel in enumerate(hotels, 1):
        try:
            logger.info(f"\n[{i}/{len(hotels)}] {hotel['full_name']}")
            logger.info(f" URL: {hotel['website_address']}")

            # All scheme/www variants of the stored address.
            url_variants = normalize_url(hotel['website_address'])
            logger.info(f" 🔄 Пробуем {len(url_variants)} вариантов URL")

            result = None
            working_url = None

            # Try each variant until one yields HTML.
            for variant in url_variants:
                logger.info(f" 🌐 Пробую: {variant}")
                result = crawl_with_browserless(variant, hotel['id'])

                # Got HTML — this variant works.
                if result and result.get('html') and result.get('html') != 'null':
                    working_url = variant
                    logger.info(f" ✅ Рабочий URL найден!")
                    break

                # Short delay between attempts.
                time.sleep(0.5)

            # Persist the outcome.
            if working_url and result:
                if save_to_db(hotel['id'], working_url, result):
                    logger.info(" ✅ Успешно спарсено и сохранено")
                    success += 1
                else:
                    logger.info(" ⚠️ Спарсено но не сохранено")
                    failed += 1
            else:
                logger.info(" ❌ Все варианты URL не сработали")
                # Record the failed status so the meta table stays current.
                save_to_db(hotel['id'], hotel['website_address'],
                           {"html": None, "error": "All URL variants failed"})
                failed += 1

            # Delay between hotels to avoid hammering Browserless.
            time.sleep(1)

        except Exception as e:
            # Keep going even when a single hotel blows up.
            logger.error(f" 💥 КРИТИЧЕСКАЯ ОШИБКА: {e}")
            failed += 1
            continue

    logger.info("\n" + "=" * 70)
    logger.info(f"✅ Успешно: {success}")
    logger.info(f"❌ Ошибок: {failed}")
    logger.info("=" * 70)
if __name__ == "__main__":
    import sys

    # CLI usage: script.py [region_name] [limit]
    cli_args = sys.argv[1:]
    region = cli_args[0] if cli_args else None
    limit = int(cli_args[1]) if len(cli_args) > 1 else None

    process_failed_hotels(region, limit)