Files
hotels/browserless_crawler.py
Фёдор 684fada337 🚀 Full project sync: Hotels RAG & Audit System
 Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
2025-10-27 22:49:42 +03:00

344 lines
11 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Краулер отелей через Browserless API
Использует http://147.45.146.17:3000/function для более надёжного парсинга
"""
import requests
import psycopg2
from psycopg2.extras import RealDictCursor
from urllib.parse import unquote
import logging
from datetime import datetime
import json
import time
# Logging setup: INFO level, every message mirrored to stdout and to a
# timestamped log file created in the current working directory at import time.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # One fresh file per run, named by start time.
        logging.FileHandler(f'browserless_crawler_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
# Configuration.
# NOTE(review): the Browserless token and database credentials are hard-coded
# in source — consider moving them to environment variables or a secrets store.
BROWSERLESS_URL = "http://147.45.146.17:3000/function?token=9ahhnpjkchxtcho9"
DB_CONFIG = {
    'host': '147.45.189.234',
    'port': 5432,
    'database': 'default_db',
    'user': 'gen_user',
    # Password is stored URL-encoded; unquote() decodes the %XX escapes once
    # at import time.
    'password': unquote('2~~9_%5EkVsU%3F2%5CS')
}
# JavaScript function executed server-side by the Browserless /function
# endpoint. It navigates to context.target_url with a desktop Chrome
# user-agent, tries to dismiss cookie banners, and returns the rendered HTML
# plus title/url/status (or status 0 with an error message on failure).
# NOTE: this string is sent verbatim as the request payload, so its inline
# (Russian) comments are part of the payload and are left untouched.
BROWSER_FUNCTION = """
export default async function ({ page, context }) {
const targetUrl = context.target_url;
// Настройка браузера для обхода блокировок
await page.setViewport({ width: 1920, height: 1080 });
await page.setExtraHTTPHeaders({
"Accept-Language": "ru,en;q=0.9",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Upgrade-Insecure-Requests": "1",
});
await page.setUserAgent(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
);
try {
// Попытка загрузки страницы
await page.goto(targetUrl, {
waitUntil: "networkidle2",
timeout: 30000
});
// Закрытие cookie баннеров
try {
await page.waitForSelector(
".cookie-accept, .cookie-close, .accept-cookies, [class*='cookie'] button",
{ timeout: 2000 }
);
const btns = await page.$$(
".cookie-accept, .cookie-close, .accept-cookies, [class*='cookie'] button"
);
if (btns[0]) await btns[0].click();
} catch (_) {}
// Ждём загрузки контента
await page.waitForTimeout(1000);
// Извлекаем HTML и метаданные
const data = await page.evaluate(() => {
return {
html: document.documentElement.outerHTML,
title: document.title,
url: window.location.href,
status: 200
};
});
return data;
} catch (error) {
return {
html: null,
title: null,
url: targetUrl,
status: 0,
error: error.message
};
}
}
"""
def crawl_with_browserless(url: str, hotel_id: str) -> dict:
    """Fetch one page through the Browserless /function API.

    Sends BROWSER_FUNCTION together with the target URL and returns the
    JSON payload produced by it (keys: html, title, url, status, and
    optionally error). On any HTTP or transport failure a dict of the form
    {"html": None, "error": <message>} is returned instead of raising.
    """
    request_body = {
        "code": BROWSER_FUNCTION,
        "context": {"target_url": url},
    }
    try:
        logger.info(f" 🌐 Отправка запроса в Browserless...")
        resp = requests.post(BROWSERLESS_URL, json=request_body, timeout=60)
        logger.info(f" 📡 Статус: {resp.status_code}")

        # Anything other than 200 is treated as a hard failure.
        if resp.status_code != 200:
            logger.error(f" ❌ Browserless error: {resp.status_code}")
            logger.error(f" {resp.text[:200]}")
            return {"html": None, "error": f"HTTP {resp.status_code}"}

        data = resp.json()
        logger.info(f" 📄 Получено: {len(str(data.get('html', '')))} байт")
        return data
    except Exception as e:
        # Network errors, timeouts, and bad JSON all land here.
        logger.error(f" ❌ Exception: {e}")
        return {"html": None, "error": str(e)}
def save_to_db(hotel_id: str, url: str, result: dict) -> bool:
    """Persist a single crawl result to the database.

    On success (result contains HTML) the raw page is written to
    hotel_website_raw and hotel_website_meta is upserted with status
    'completed'; otherwise only the meta row is upserted with status
    'failed' plus the error message.

    Args:
        hotel_id: primary key of the hotel.
        url: the URL that was crawled (fallback if result lacks its own).
        result: dict from crawl_with_browserless(); may be None/empty.

    Returns:
        True if HTML was saved, False if a failure status was recorded.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # Fix vs. original: `cur` was created inside try but closed in
        # finally, so a failure in conn.cursor() raised NameError, and no
        # rollback ever happened on error. `with conn` commits on normal
        # exit and rolls back on exception; the cursor context manager
        # guarantees the cursor is closed.
        with conn, conn.cursor() as cur:
            # Drop any previous raw snapshot for this hotel first.
            cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
            if result and result.get('html'):
                cur.execute("""
INSERT INTO hotel_website_raw (hotel_id, url, html, page_title, crawled_at, status_code)
VALUES (%s, %s, %s, %s, %s, %s)
""", (
                    hotel_id,
                    result.get('url', url),
                    result['html'],
                    result.get('title'),
                    datetime.now(),
                    result.get('status', 200)
                ))
                # Upsert crawl metadata as completed and clear stale errors.
                cur.execute("""
INSERT INTO hotel_website_meta
(hotel_id, main_url, pages_crawled, total_size_bytes, crawl_status,
crawl_started_at, crawl_finished_at)
VALUES (%s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (hotel_id) DO UPDATE SET
main_url = EXCLUDED.main_url,
pages_crawled = EXCLUDED.pages_crawled,
total_size_bytes = EXCLUDED.total_size_bytes,
crawl_status = EXCLUDED.crawl_status,
crawl_finished_at = EXCLUDED.crawl_finished_at,
error_message = NULL
""", (
                    hotel_id,
                    url,
                    1,
                    len(result['html']),
                    'completed',
                    datetime.now(),
                    datetime.now()
                ))
                return True

            # Crawl failed: record the failure status and error message.
            error_msg = result.get('error', 'Unknown error') if result else 'No response'
            cur.execute("""
INSERT INTO hotel_website_meta
(hotel_id, main_url, crawl_status, error_message,
crawl_started_at, crawl_finished_at)
VALUES (%s, %s, %s, %s, %s, %s)
ON CONFLICT (hotel_id) DO UPDATE SET
crawl_status = EXCLUDED.crawl_status,
error_message = EXCLUDED.error_message,
crawl_finished_at = EXCLUDED.crawl_finished_at
""", (
                hotel_id,
                url,
                'failed',
                error_msg,
                datetime.now(),
                datetime.now()
            ))
            return False
    finally:
        conn.close()
def normalize_url(url: str) -> list:
    """Build candidate URLs (scheme and www/no-www variants) for crawling.

    Fixes a bug in the original implementation: when *url* already carried a
    scheme, the www-branch concatenated onto the full URL, producing broken
    candidates such as "https://www.https://example.com".

    Args:
        url: the raw website address, with or without scheme/"www.".

    Returns:
        An ordered, de-duplicated list of URL variants. If *url* has an
        explicit scheme it is tried first; otherwise https is preferred.
        The address as given is tried before its www/no-www alternative.
    """
    url = url.strip()

    # Split off an explicit scheme so the www variants are built from the
    # bare host, not the full URL.
    if url.startswith(('http://', 'https://')):
        scheme, _, host = url.partition('://')
        schemes = [scheme, 'http' if scheme == 'https' else 'https']
    else:
        host = url
        schemes = ['https', 'http']

    # The host as given first, then its www/no-www counterpart.
    hosts = [host, host[4:] if host.startswith('www.') else f'www.{host}']

    variants = [f'{s}://{h}' for h in hosts for s in schemes]
    # Preserve order while dropping any accidental duplicates.
    return list(dict.fromkeys(variants))
def process_failed_hotels(region_name=None, limit=None):
    """Re-crawl hotels whose previous crawl attempt ended with status 'failed'.

    Args:
        region_name: optional region filter (hotel_main.region_name);
            None processes all regions.
        limit: optional cap on the number of hotels processed this run.
    """
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    cur = conn.cursor()
    # Select failed hotels that still have a non-empty website address.
    query = """
SELECT h.id, h.full_name, h.website_address
FROM hotel_main h
INNER JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
WHERE hwm.crawl_status = 'failed'
AND h.website_address IS NOT NULL
AND h.website_address != ''
"""
    if region_name:
        query += " AND h.region_name = %s"
        cur.execute(query, (region_name,))
    else:
        cur.execute(query)
    hotels = cur.fetchall()
    # NOTE: the limit is applied client-side, after fetching all rows.
    if limit:
        hotels = hotels[:limit]
    cur.close()
    conn.close()
    logger.info("=" * 70)
    logger.info("🚀 BROWSERLESS КРАУЛЕР")
    if region_name:
        logger.info(f"📍 Регион: {region_name}")
    logger.info(f"📊 Отелей для обработки: {len(hotels)}")
    logger.info("=" * 70)
    success = 0
    failed = 0
    for i, hotel in enumerate(hotels, 1):
        try:
            logger.info(f"\n[{i}/{len(hotels)}] {hotel['full_name']}")
            logger.info(f" URL: {hotel['website_address']}")
            # Build all scheme/www permutations of the stored address.
            url_variants = normalize_url(hotel['website_address'])
            logger.info(f" 🔄 Пробуем {len(url_variants)} вариантов URL")
            result = None
            working_url = None
            # Try each variant until one returns HTML.
            for variant in url_variants:
                logger.info(f" 🌐 Пробую: {variant}")
                result = crawl_with_browserless(variant, hotel['id'])
                # Non-empty HTML counts as success; the 'null' check guards
                # against a JS null serialized as the literal string "null".
                if result and result.get('html') and result.get('html') != 'null':
                    working_url = variant
                    logger.info(f" ✅ Рабочий URL найден!")
                    break
                # Short pause between variant attempts.
                time.sleep(0.5)
            # Persist the outcome.
            if working_url and result:
                if save_to_db(hotel['id'], working_url, result):
                    logger.info(" ✅ Успешно спарсено и сохранено")
                    success += 1
                else:
                    logger.info(" ⚠️ Спарсено но не сохранено")
                    failed += 1
            else:
                logger.info("Все варианты URL не сработали")
                # Record the failure so the hotel keeps its 'failed' status.
                save_to_db(hotel['id'], hotel['website_address'],
                           {"html": None, "error": "All URL variants failed"})
                failed += 1
            # Politeness delay between hotels.
            time.sleep(1)
        except Exception as e:
            logger.error(f" 💥 КРИТИЧЕСКАЯ ОШИБКА: {e}")
            failed += 1
            # Keep processing the remaining hotels even after an error.
            continue
    logger.info("\n" + "=" * 70)
    logger.info(f"✅ Успешно: {success}")
    logger.info(f"❌ Ошибок: {failed}")
    logger.info("=" * 70)
if __name__ == "__main__":
    import sys

    # CLI: python browserless_crawler.py [region_name] [limit]
    cli_args = sys.argv[1:]
    region = cli_args[0] if cli_args else None
    limit = int(cli_args[1]) if len(cli_args) > 1 else None
    process_failed_hotels(region, limit)