🚀 Full project sync: Hotels RAG & Audit System
✨ Major Features: - Complete RAG system for hotel website analysis - Hybrid audit with BGE-M3 embeddings + Natasha NER - Universal horizontal Excel reports with dashboards - Multi-region processing (SPb, Orel, Chukotka, Kamchatka) 📊 Completed Regions: - Орловская область: 100% (36/36) - Чукотский АО: 100% (4/4) - г. Санкт-Петербург: 93% (893/960) - Камчатский край: 87% (89/102) 🔧 Infrastructure: - PostgreSQL with pgvector extension - BGE-M3 embeddings API - Browserless for web scraping - N8N workflows for automation - S3/Nextcloud file storage 📝 Documentation: - Complete DB schemas - API documentation - Setup guides - Status reports
This commit is contained in:
331
browserless_crawler_parallel.py
Executable file
331
browserless_crawler_parallel.py
Executable file
@@ -0,0 +1,331 @@
|
||||
#!/usr/bin/env python3
"""
Multi-threaded hotel-website crawler driven through the Browserless API.

Re-crawls hotels whose previous crawl attempt is marked 'failed' in the
database, trying several URL variants per hotel from a thread pool.
"""
|
||||
|
||||
import logging
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from urllib.parse import unquote

import psycopg2
import requests
from psycopg2.extras import RealDictCursor
|
||||
|
||||
# Logging: a timestamped file per run plus console output; the thread name
# in the format string distinguishes workers in the parallel phase.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - [%(threadName)s] - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'browserless_parallel_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Configuration.
# NOTE(security): the endpoint token and DB credentials were hard-coded in
# source. They remain only as fallbacks for backward compatibility; supply
# them via environment variables and rotate the leaked values.
BROWSERLESS_URL = os.environ.get(
    'BROWSERLESS_URL',
    "http://147.45.146.17:3000/function?token=9ahhnpjkchxtcho9"
)
MAX_WORKERS = 5  # Default number of parallel worker threads.
DB_CONFIG = {
    'host': os.environ.get('DB_HOST', '147.45.189.234'),
    'port': int(os.environ.get('DB_PORT', '5432')),
    'database': os.environ.get('DB_NAME', 'default_db'),
    'user': os.environ.get('DB_USER', 'gen_user'),
    # Fallback password is URL-encoded; unquote() restores the raw form.
    'password': os.environ.get('DB_PASSWORD', unquote('2~~9_%5EkVsU%3F2%5CS'))
}

# Run counters shared by all worker threads; guard every update with
# stats_lock (dict writes are not atomic across the read-modify-write).
stats_lock = threading.Lock()
stats = {'success': 0, 'failed': 0, 'processed': 0}

# JavaScript executed by the Browserless /function endpoint for each fetch.
# It sets a desktop UA/viewport, dismisses common cookie banners, and
# returns the rendered HTML (or an error payload with status 0).
BROWSER_FUNCTION = """
export default async function ({ page, context }) {
  const targetUrl = context.target_url;

  await page.setViewport({ width: 1920, height: 1080 });
  await page.setExtraHTTPHeaders({
    "Accept-Language": "ru,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Upgrade-Insecure-Requests": "1",
  });
  await page.setUserAgent(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
  );

  try {
    await page.goto(targetUrl, { waitUntil: "networkidle2", timeout: 30000 });

    try {
      await page.waitForSelector(
        ".cookie-accept, .cookie-close, [class*='cookie'] button",
        { timeout: 2000 }
      );
      const btns = await page.$$(".cookie-accept, .cookie-close, [class*='cookie'] button");
      if (btns[0]) await btns[0].click();
    } catch (_) {}

    await page.waitForTimeout(1000);

    const data = await page.evaluate(() => {
      return {
        html: document.documentElement.outerHTML,
        title: document.title,
        url: window.location.href,
        status: 200
      };
    });

    return data;

  } catch (error) {
    return {
      html: null,
      title: null,
      url: targetUrl,
      status: 0,
      error: error.message
    };
  }
}
"""
|
||||
|
||||
|
||||
def normalize_url(url: str) -> list:
    """Build an ordered, de-duplicated list of URL variants to try.

    Variants cover both schemes (preferring the one the input carried,
    https first otherwise) and both www/non-www forms of the host.

    Fixes a bug in the original: for inputs that already carried a scheme,
    the www-toggle was applied to the full URL, producing garbage like
    'https://www.https://example.com'.
    """
    url = url.strip()

    # Split off any explicit scheme so the www toggle works on the host only.
    if url.startswith('https://'):
        scheme, host = 'https', url[len('https://'):]
    elif url.startswith('http://'):
        scheme, host = 'http', url[len('http://'):]
    else:
        scheme, host = None, url

    # Try the given host first, then its www/non-www counterpart.
    hosts = [host, host[4:] if host.startswith('www.') else f"www.{host}"]

    # Keep the caller's scheme preference; default to https first.
    schemes = ['http', 'https'] if scheme == 'http' else ['https', 'http']

    variants = []
    for h in hosts:
        for s in schemes:
            candidate = f"{s}://{h}"
            if candidate not in variants:
                variants.append(candidate)
    return variants
|
||||
|
||||
|
||||
def crawl_with_browserless(url: str) -> dict:
    """Fetch one page through the Browserless /function endpoint.

    Returns the JSON payload produced by BROWSER_FUNCTION on success, or a
    dict with html=None and an 'error' description on any failure — this
    function never raises, so callers can treat every outcome uniformly.
    """
    try:
        response = requests.post(
            BROWSERLESS_URL,
            json={"code": BROWSER_FUNCTION, "context": {"target_url": url}},
            timeout=60,
        )
        if response.status_code != 200:
            return {"html": None, "error": f"HTTP {response.status_code}"}
        return response.json()
    except Exception as e:
        return {"html": None, "error": str(e)}
|
||||
|
||||
|
||||
def save_to_db(hotel_id: str, url: str, result: dict) -> bool:
    """Persist one crawl result: raw HTML row plus metadata upsert.

    Opens its own connection because psycopg2 connections must not be
    shared across the worker threads. Returns True when HTML was stored,
    False when a failure record was written instead.

    Fixes the original cleanup bug: `cur` was created inside the try block
    but closed in `finally`, so a failure in `conn.cursor()` raised
    UnboundLocalError and masked the real error. The cursor is now managed
    by its own context manager.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            # Re-crawl semantics: drop any previously stored HTML first.
            cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))

            # Browserless may return the literal string 'null' for html.
            html = result.get('html') if result else None
            if html and html != 'null':
                cur.execute("""
                    INSERT INTO hotel_website_raw (hotel_id, url, html, page_title, crawled_at, status_code)
                    VALUES (%s, %s, %s, %s, %s, %s)
                """, (
                    hotel_id,
                    result.get('url', url),
                    html,
                    result.get('title'),
                    datetime.now(),
                    result.get('status', 200)
                ))

                cur.execute("""
                    INSERT INTO hotel_website_meta
                    (hotel_id, main_url, pages_crawled, total_size_bytes, crawl_status,
                     crawl_started_at, crawl_finished_at)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        main_url = EXCLUDED.main_url,
                        pages_crawled = EXCLUDED.pages_crawled,
                        total_size_bytes = EXCLUDED.total_size_bytes,
                        crawl_status = EXCLUDED.crawl_status,
                        crawl_finished_at = EXCLUDED.crawl_finished_at,
                        error_message = NULL
                """, (
                    hotel_id, url, 1, len(html), 'completed',
                    datetime.now(), datetime.now()
                ))
                conn.commit()
                return True

            # No usable HTML: record the failure reason in the metadata row.
            error_msg = result.get('error', 'No HTML') if result else 'No response'
            cur.execute("""
                INSERT INTO hotel_website_meta
                (hotel_id, main_url, crawl_status, error_message,
                 crawl_started_at, crawl_finished_at)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (hotel_id) DO UPDATE SET
                    crawl_status = EXCLUDED.crawl_status,
                    error_message = EXCLUDED.error_message,
                    crawl_finished_at = EXCLUDED.crawl_finished_at
            """, (hotel_id, url, 'failed', error_msg, datetime.now(), datetime.now()))
            conn.commit()
            return False
    finally:
        conn.close()
|
||||
|
||||
|
||||
def process_hotel(hotel: dict, total: int, index: int):
    """Crawl a single hotel, trying each URL variant until one yields HTML.

    Persists either the successful page or a failure record, updates the
    shared `stats` counters under stats_lock, and returns True on success.
    """

    def _tally(outcome: str):
        # Single place for the lock-protected counter bookkeeping.
        with stats_lock:
            stats[outcome] += 1
            stats['processed'] += 1

    try:
        logger.info(f"[{index}/{total}] {hotel['full_name'][:50]}")

        result = None
        working_url = None
        for candidate in normalize_url(hotel['website_address']):
            result = crawl_with_browserless(candidate)
            if result and result.get('html') and result.get('html') != 'null':
                working_url = candidate
                logger.info(f" ✅ Найден: {candidate}")
                break
            # Small pause between variants to avoid hammering the endpoint.
            time.sleep(0.3)

        if working_url and result and save_to_db(hotel['id'], working_url, result):
            _tally('success')
            return True

        # Every variant failed (or the save did): record the failure.
        save_to_db(hotel['id'], hotel['website_address'],
                   {"html": None, "error": "All variants failed"})
        _tally('failed')
        return False

    except Exception as e:
        logger.error(f" ❌ Ошибка: {e}")
        _tally('failed')
        return False
|
||||
|
||||
|
||||
def main():
    """CLI entry point: re-crawl hotels whose previous crawl failed.

    Usage: browserless_crawler_parallel.py [region] [limit] [workers]
    Each positional argument may be the literal string 'None' to skip it.

    Fixes the original's leak of the selection connection when the query
    raised (no try/finally), and guards the final rate log line against a
    zero-length run.
    """
    import sys

    region = sys.argv[1] if len(sys.argv) > 1 and sys.argv[1] != 'None' else None
    limit = int(sys.argv[2]) if len(sys.argv) > 2 and sys.argv[2] != 'None' else None
    workers = int(sys.argv[3]) if len(sys.argv) > 3 and sys.argv[3] != 'None' else MAX_WORKERS

    # Fetch the work list; always release the connection, even on error.
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    try:
        with conn.cursor() as cur:
            query = """
        SELECT h.id, h.full_name, h.website_address
        FROM hotel_main h
        INNER JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
        WHERE hwm.crawl_status = 'failed'
        AND h.website_address IS NOT NULL
        AND h.website_address != ''
    """
            if region:
                query += " AND h.region_name = %s"
                cur.execute(query, (region,))
            else:
                cur.execute(query)
            hotels = cur.fetchall()
    finally:
        conn.close()

    if limit:
        hotels = hotels[:limit]

    logger.info("=" * 70)
    logger.info("🚀 МНОГОПОТОЧНЫЙ BROWSERLESS КРАУЛЕР")
    if region:
        logger.info(f"📍 Регион: {region}")
    logger.info(f"📊 Отелей: {len(hotels)}")
    logger.info(f"🔧 Потоков: {workers}")
    logger.info("=" * 70)

    start_time = time.time()

    # Fan the hotels out over the worker pool.
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {
            executor.submit(process_hotel, hotel, len(hotels), i): hotel
            for i, hotel in enumerate(hotels, 1)
        }

        for future in as_completed(futures):
            try:
                future.result()

                # Progress report every 50 hotels. Reading `stats` without
                # the lock is intentional: a stale read only shifts when the
                # report prints, it cannot corrupt the counters.
                if stats['processed'] % 50 == 0:
                    elapsed = time.time() - start_time
                    rate = stats['processed'] / elapsed if elapsed > 0 else 0
                    remaining = (len(hotels) - stats['processed']) / rate if rate > 0 else 0

                    logger.info("")
                    logger.info("📊 ПРОМЕЖУТОЧНАЯ СТАТИСТИКА:")
                    logger.info(f" Обработано: {stats['processed']}/{len(hotels)}")
                    logger.info(f" Успешно: {stats['success']}")
                    logger.info(f" Ошибок: {stats['failed']}")
                    logger.info(f" Скорость: {rate:.2f} отелей/сек")
                    logger.info(f" Осталось: ~{remaining/60:.1f} мин")
                    logger.info("")

            except Exception as e:
                logger.error(f"Future error: {e}")

    elapsed = time.time() - start_time

    logger.info("\n" + "=" * 70)
    logger.info("✅ ЗАВЕРШЕНО!")
    logger.info(f" Успешно: {stats['success']}")
    logger.info(f" Ошибок: {stats['failed']}")
    logger.info(f" Время: {elapsed/60:.1f} мин")
    if elapsed > 0:
        logger.info(f" Скорость: {len(hotels)/elapsed:.2f} отелей/сек")
    logger.info("=" * 70)


if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user