🚀 Full project sync: Hotels RAG & Audit System

 Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
This commit is contained in:
Фёдор
2025-10-27 22:49:42 +03:00
parent 0cf3297290
commit 684fada337
94 changed files with 14891 additions and 911 deletions

331
browserless_crawler_parallel.py Executable file
View File

@@ -0,0 +1,331 @@
#!/usr/bin/env python3
"""
Многопоточный краулер отелей через Browserless API
"""
import requests
import psycopg2
from psycopg2.extras import RealDictCursor
from urllib.parse import unquote
import logging
from datetime import datetime
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
# Logging setup: each run writes a timestamped log file and mirrors to stdout;
# the thread name is included because work is fanned out to a ThreadPoolExecutor.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - [%(threadName)s] - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'browserless_parallel_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Configuration
# NOTE(review): the Browserless token and DB credentials are hardcoded in
# source — move them to environment variables / a secrets store.
BROWSERLESS_URL = "http://147.45.146.17:3000/function?token=9ahhnpjkchxtcho9"
MAX_WORKERS = 5  # default number of parallel worker threads
DB_CONFIG = {
    'host': '147.45.189.234',
    'port': 5432,
    'database': 'default_db',
    'user': 'gen_user',
    # The stored password is percent-encoded; unquote() restores the raw value.
    'password': unquote('2~~9_%5EkVsU%3F2%5CS')
}

# Shared run counters; every read-modify-write goes through stats_lock.
stats_lock = threading.Lock()
stats = {'success': 0, 'failed': 0, 'processed': 0}
# JavaScript function executed server-side by the Browserless /function
# endpoint: sets a desktop viewport and UA, navigates to context.target_url,
# best-effort dismisses a cookie banner, and returns {html, title, url,
# status} — or an error payload with status 0 when navigation fails.
BROWSER_FUNCTION = """
export default async function ({ page, context }) {
const targetUrl = context.target_url;
await page.setViewport({ width: 1920, height: 1080 });
await page.setExtraHTTPHeaders({
"Accept-Language": "ru,en;q=0.9",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Upgrade-Insecure-Requests": "1",
});
await page.setUserAgent(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
);
try {
await page.goto(targetUrl, { waitUntil: "networkidle2", timeout: 30000 });
try {
await page.waitForSelector(
".cookie-accept, .cookie-close, [class*='cookie'] button",
{ timeout: 2000 }
);
const btns = await page.$$(".cookie-accept, .cookie-close, [class*='cookie'] button");
if (btns[0]) await btns[0].click();
} catch (_) {}
await page.waitForTimeout(1000);
const data = await page.evaluate(() => {
return {
html: document.documentElement.outerHTML,
title: document.title,
url: window.location.href,
status: 200
};
});
return data;
} catch (error) {
return {
html: null,
title: null,
url: targetUrl,
status: 0,
error: error.message
};
}
}
"""
def normalize_url(url: str) -> list:
    """Build an ordered, de-duplicated list of URL variants to try.

    Generates scheme (https/http) and www/non-www combinations, preferring
    the caller's original scheme (https first for bare domains) and the
    original host form first.

    Bug fixed: the previous version toggled the ``www.`` prefix by testing
    the *full* URL, so for scheme-prefixed input (which starts with "http")
    it emitted malformed variants like ``https://www.https://example.com``.

    Args:
        url: Raw website address, with or without a scheme or www prefix.

    Returns:
        list[str]: unique URL variants in the order they should be tried.
    """
    url = url.strip()

    # Split off the scheme (if any) so host-level variants stay well-formed,
    # and try the supplied scheme before its alternative.
    if url.startswith('https://'):
        host = url[len('https://'):]
        schemes = ['https://', 'http://']
    elif url.startswith('http://'):
        host = url[len('http://'):]
        schemes = ['http://', 'https://']
    else:
        host = url
        schemes = ['https://', 'http://']

    # Original host form first, then its www/non-www counterpart.
    if host.startswith('www.'):
        hosts = [host, host[4:]]
    else:
        hosts = [host, f"www.{host}"]

    variants = []
    seen = set()
    for h in hosts:
        for scheme in schemes:
            candidate = f"{scheme}{h}"
            if candidate not in seen:
                seen.add(candidate)
                variants.append(candidate)
    return variants
def crawl_with_browserless(url: str) -> dict:
    """Fetch one page through the Browserless /function API.

    Posts BROWSER_FUNCTION together with the target URL and returns the
    decoded JSON payload.  Any transport, HTTP-status, or decoding problem
    is reported as ``{"html": None, "error": ...}`` instead of raising.
    """
    request_body = {
        "code": BROWSER_FUNCTION,
        "context": {"target_url": url},
    }
    try:
        response = requests.post(BROWSERLESS_URL, json=request_body, timeout=60)
        if response.status_code != 200:
            return {"html": None, "error": f"HTTP {response.status_code}"}
        return response.json()
    except Exception as exc:
        return {"html": None, "error": str(exc)}
def save_to_db(hotel_id: str, url: str, result: dict):
    """Persist a crawl result for one hotel (fresh per-thread connection).

    On success (``result`` carries HTML) the raw page replaces any previous
    row in ``hotel_website_raw`` and ``hotel_website_meta`` is upserted as
    'completed'.  Otherwise only ``hotel_website_meta`` is upserted as
    'failed' with the error message.

    Bug fixed: the old code created the cursor inside ``try`` but referenced
    ``cur`` unconditionally in ``finally`` — if ``conn.cursor()`` (or an
    early execute) raised, the ``finally`` block hit an unbound ``cur`` and
    masked the real error.  A context-managed cursor closes safely instead.

    Returns:
        True if HTML was stored, False if the failure path was recorded.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            # Drop any stale page for this hotel before inserting the new one.
            cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
            if result and result.get('html') and result.get('html') != 'null':
                cur.execute("""
                    INSERT INTO hotel_website_raw (hotel_id, url, html, page_title, crawled_at, status_code)
                    VALUES (%s, %s, %s, %s, %s, %s)
                """, (
                    hotel_id,
                    result.get('url', url),
                    result['html'],
                    result.get('title'),
                    datetime.now(),
                    result.get('status', 200)
                ))
                # Upsert crawl metadata and clear any previous error message.
                cur.execute("""
                    INSERT INTO hotel_website_meta
                        (hotel_id, main_url, pages_crawled, total_size_bytes, crawl_status,
                         crawl_started_at, crawl_finished_at)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        main_url = EXCLUDED.main_url,
                        pages_crawled = EXCLUDED.pages_crawled,
                        total_size_bytes = EXCLUDED.total_size_bytes,
                        crawl_status = EXCLUDED.crawl_status,
                        crawl_finished_at = EXCLUDED.crawl_finished_at,
                        error_message = NULL
                """, (
                    hotel_id, url, 1, len(result['html']), 'completed',
                    datetime.now(), datetime.now()
                ))
                conn.commit()
                return True
            # Failure path: record the error in the metadata table only.
            error_msg = result.get('error', 'No HTML') if result else 'No response'
            cur.execute("""
                INSERT INTO hotel_website_meta
                    (hotel_id, main_url, crawl_status, error_message,
                     crawl_started_at, crawl_finished_at)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (hotel_id) DO UPDATE SET
                    crawl_status = EXCLUDED.crawl_status,
                    error_message = EXCLUDED.error_message,
                    crawl_finished_at = EXCLUDED.crawl_finished_at
            """, (hotel_id, url, 'failed', error_msg, datetime.now(), datetime.now()))
            conn.commit()
            return False
    finally:
        conn.close()
def process_hotel(hotel: dict, total: int, index: int):
    """Crawl a single hotel: try each URL variant, store the first hit.

    Updates the shared ``stats`` counters under ``stats_lock``.  Returns
    True when a page was crawled and saved, False otherwise.
    """
    def _bump(outcome: str):
        # Record one finished hotel under the shared lock.
        with stats_lock:
            stats[outcome] += 1
            stats['processed'] += 1

    try:
        logger.info(f"[{index}/{total}] {hotel['full_name'][:50]}")
        result = None
        working_url = None
        for candidate in normalize_url(hotel['website_address']):
            result = crawl_with_browserless(candidate)
            if result and result.get('html') and result.get('html') != 'null':
                working_url = candidate
                logger.info(f" ✅ Найден: {candidate}")
                break
            time.sleep(0.3)
        if working_url and result and save_to_db(hotel['id'], working_url, result):
            _bump('success')
            return True
        # No variant produced HTML (or the save reported failure):
        # record the failure against the original address.
        save_to_db(hotel['id'], hotel['website_address'],
                   {"html": None, "error": "All variants failed"})
        _bump('failed')
        return False
    except Exception as e:
        logger.error(f" ❌ Ошибка: {e}")
        _bump('failed')
        return False
def main():
    """CLI entry point for the parallel re-crawl.

    Usage: ``script [region] [limit] [workers]`` — each positional argument
    may be the literal string 'None' to mean "not set".  Selects hotels
    whose previous crawl_status is 'failed' and fans the work out over a
    thread pool, logging periodic and final statistics.

    Fix vs. previous version: the final rate computation divided by
    ``elapsed`` unguarded (ZeroDivisionError on an instantly-empty run);
    the periodic stats snapshot is now also taken under ``stats_lock``.
    """
    import sys
    region = sys.argv[1] if len(sys.argv) > 1 and sys.argv[1] != 'None' else None
    limit = int(sys.argv[2]) if len(sys.argv) > 2 and sys.argv[2] != 'None' else None
    workers = int(sys.argv[3]) if len(sys.argv) > 3 and sys.argv[3] != 'None' else MAX_WORKERS

    # Build the retry queue: hotels with a website whose last crawl failed.
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    cur = conn.cursor()
    query = """
        SELECT h.id, h.full_name, h.website_address
        FROM hotel_main h
        INNER JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
        WHERE hwm.crawl_status = 'failed'
          AND h.website_address IS NOT NULL
          AND h.website_address != ''
    """
    if region:
        # Region filter is parameterized — never interpolated into the SQL.
        query += " AND h.region_name = %s"
        cur.execute(query, (region,))
    else:
        cur.execute(query)
    hotels = cur.fetchall()
    if limit:
        hotels = hotels[:limit]
    cur.close()
    conn.close()

    logger.info("=" * 70)
    logger.info("🚀 МНОГОПОТОЧНЫЙ BROWSERLESS КРАУЛЕР")
    if region:
        logger.info(f"📍 Регион: {region}")
    logger.info(f"📊 Отелей: {len(hotels)}")
    logger.info(f"🔧 Потоков: {workers}")
    logger.info("=" * 70)

    start_time = time.time()

    # Fan out one task per hotel; workers update the shared stats counters.
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {
            executor.submit(process_hotel, hotel, len(hotels), i): hotel
            for i, hotel in enumerate(hotels, 1)
        }
        for future in as_completed(futures):
            try:
                future.result()
                # Interim statistics every 50 hotels; snapshot under the
                # lock so the numbers are mutually consistent.
                with stats_lock:
                    processed = stats['processed']
                    success = stats['success']
                    failed = stats['failed']
                if processed % 50 == 0:
                    elapsed = time.time() - start_time
                    rate = processed / elapsed if elapsed > 0 else 0
                    remaining = (len(hotels) - processed) / rate if rate > 0 else 0
                    logger.info("")
                    logger.info("📊 ПРОМЕЖУТОЧНАЯ СТАТИСТИКА:")
                    logger.info(f" Обработано: {processed}/{len(hotels)}")
                    logger.info(f" Успешно: {success}")
                    logger.info(f" Ошибок: {failed}")
                    logger.info(f" Скорость: {rate:.2f} отелей/сек")
                    logger.info(f" Осталось: ~{remaining/60:.1f} мин")
                    logger.info("")
            except Exception as e:
                logger.error(f"Future error: {e}")

    elapsed = time.time() - start_time
    # Guard the final rate against a zero-length run (e.g. empty queue).
    final_rate = len(hotels) / elapsed if elapsed > 0 else 0.0
    logger.info("\n" + "=" * 70)
    logger.info("✅ ЗАВЕРШЕНО!")
    logger.info(f" Успешно: {stats['success']}")
    logger.info(f" Ошибок: {stats['failed']}")
    logger.info(f" Время: {elapsed/60:.1f} мин")
    logger.info(f" Скорость: {final_rate:.2f} отелей/сек")
    logger.info("=" * 70)
if __name__ == "__main__":
    main()