✨ Major Features: - Complete RAG system for hotel website analysis - Hybrid audit with BGE-M3 embeddings + Natasha NER - Universal horizontal Excel reports with dashboards - Multi-region processing (SPb, Orel, Chukotka, Kamchatka) 📊 Completed Regions: - Орловская область: 100% (36/36) - Чукотский АО: 100% (4/4) - г. Санкт-Петербург: 93% (893/960) - Камчатский край: 87% (89/102) 🔧 Infrastructure: - PostgreSQL with pgvector extension - BGE-M3 embeddings API - Browserless for web scraping - N8N workflows for automation - S3/Nextcloud file storage 📝 Documentation: - Complete DB schemas - API documentation - Setup guides - Status reports
218 lines
8.5 KiB
Python
Executable File
#!/usr/bin/env python3
"""Re-crawl failed Saint Petersburg hotels with more lenient settings.

Retries every hotel whose previous crawl ended with crawl_status = 'failed',
using a long timeout, relaxed SSL checking and a plain-HTTP fallback.
"""

import logging
import os
import re  # NOTE(review): not used in this script — kept to avoid breaking anything
import sys
from datetime import datetime
from urllib.parse import unquote

import psycopg2
from psycopg2.extras import RealDictCursor
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout

# Log both to a timestamped file and to stdout so progress is visible live.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'retry_spb_failed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler(sys.stdout)
    ]
)

# SECURITY NOTE(review): database credentials were hard-coded in this file.
# They can now be overridden via environment variables; the committed values
# remain as defaults for backward compatibility, but the password should be
# rotated and supplied via DB_PASSWORD in deployments.
DB_CONFIG = {
    'host': os.environ.get('DB_HOST', '147.45.189.234'),
    'port': int(os.environ.get('DB_PORT', '5432')),
    'database': os.environ.get('DB_NAME', 'default_db'),
    'user': os.environ.get('DB_USER', 'gen_user'),
    # Stored percent-encoded; unquote() yields the literal password.
    'password': os.environ.get('DB_PASSWORD', unquote('2~~9_%5EkVsU%3F2%5CS')),
}
def normalize_url(url):
    """Return *url* trimmed and guaranteed to carry a scheme, or None.

    A bare host such as ``example.com`` is upgraded to
    ``https://example.com``; the plain-HTTP retry is handled separately
    by try_http_fallback().
    """
    if not url:
        return None
    cleaned = url.strip()
    if cleaned.startswith(('http://', 'https://')):
        return cleaned
    # Try https first; http is attempted later if this fails.
    return f'https://{cleaned}'
def try_http_fallback(url):
    """Return the plain-HTTP twin of an HTTPS *url*, or None for non-HTTPS.

    Only the scheme prefix is rewritten. The previous implementation used
    str.replace, which would also have rewritten any literal "https://"
    appearing later in the URL (e.g. inside a redirect query parameter).
    """
    if url.startswith('https://'):
        return 'http://' + url[len('https://'):]
    return None
def _save_success(conn, cur, hotel_id, url, html):
    """Persist crawled HTML and mark the hotel's crawl as completed.

    Replaces any previously stored raw page for the hotel, then upserts
    the metadata row and commits the transaction.
    """
    cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
    cur.execute("""
        INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at)
        VALUES (%s, %s, %s, %s)
    """, (hotel_id, url, html, datetime.now()))
    cur.execute("""
        INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (hotel_id) DO UPDATE SET
            crawl_status = EXCLUDED.crawl_status,
            pages_crawled = EXCLUDED.pages_crawled,
            total_size_bytes = EXCLUDED.total_size_bytes,
            crawl_started_at = EXCLUDED.crawl_started_at,
            crawl_finished_at = EXCLUDED.crawl_finished_at,
            error_message = NULL
    """, (hotel_id, 'completed', 1, len(html), datetime.now(), datetime.now()))
    conn.commit()


def _mark_failed(conn, cur, hotel_id, error_msg):
    """Upsert a 'failed' status with the captured error message."""
    cur.execute("""
        INSERT INTO hotel_website_meta (hotel_id, crawl_status, error_message, crawl_started_at, crawl_finished_at)
        VALUES (%s, %s, %s, %s, %s)
        ON CONFLICT (hotel_id) DO UPDATE SET
            crawl_status = EXCLUDED.crawl_status,
            error_message = EXCLUDED.error_message,
            crawl_started_at = EXCLUDED.crawl_started_at,
            crawl_finished_at = EXCLUDED.crawl_finished_at
    """, (hotel_id, 'failed', error_msg, datetime.now(), datetime.now()))
    conn.commit()


def crawl_hotel(hotel_id, hotel_name, website_address):
    """Crawl one hotel's website and record the outcome in the database.

    Tries the normalized (HTTPS) URL first; on any error retries over plain
    HTTP. A fetch counts as successful when the page HTML is longer than
    100 bytes.

    Args:
        hotel_id: Primary key of the hotel in hotel_main.
        hotel_name: Display name (currently unused here; kept for interface
            compatibility with callers).
        website_address: Raw website value from the DB; may lack a scheme.

    Returns:
        True if a page was fetched and stored, False otherwise.
    """
    url = normalize_url(website_address)
    if not url:
        logging.warning(f" ⚠️ Нет URL")
        return False

    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    cur = conn.cursor()

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    ignore_https_errors=True,  # tolerate self-signed/expired certs
                    java_script_enabled=True
                )
                page = context.new_page()

                try:
                    # First attempt: HTTPS with a generous 60 s timeout.
                    logging.info(f" 🌐 Пробуем: {url}")
                    page.goto(url, wait_until='domcontentloaded', timeout=60000)
                    html = page.content()
                    if html and len(html) > 100:
                        _save_success(conn, cur, hotel_id, url, html)
                        logging.info(f" ✅ Успешно! {len(html):,} байт")
                        return True
                except Exception as e:
                    # Second attempt: downgrade the scheme to plain HTTP.
                    http_url = try_http_fallback(url)
                    if not http_url:
                        raise
                    try:
                        logging.info(f" 🔄 Пробуем HTTP: {http_url}")
                        page.goto(http_url, wait_until='domcontentloaded', timeout=60000)
                        html = page.content()
                        if html and len(html) > 100:
                            _save_success(conn, cur, hotel_id, http_url, html)
                            logging.info(f" ✅ HTTP сработал! {len(html):,} байт")
                            return True
                    except Exception as e2:
                        logging.error(f" ❌ HTTP тоже не сработал: {str(e2)[:100]}")
                        raise e  # surface the original HTTPS error
            finally:
                # Fix: the original leaked the browser on exception paths —
                # it only closed it on success or clean fall-through.
                browser.close()

    except Exception as e:
        error_msg = str(e)[:500]
        logging.error(f" ❌ Ошибка: {error_msg}")
        # Fix: if the failure came from a DB statement the transaction is
        # aborted; roll back before writing the 'failed' status, otherwise
        # that write would also fail.
        conn.rollback()
        _mark_failed(conn, cur, hotel_id, error_msg)

    finally:
        # Single close point (the original closed cur/conn twice on success).
        cur.close()
        conn.close()

    return False
def main():
    """Entry point: re-crawl every Saint Petersburg hotel marked 'failed'."""
    # Fetch the whole work list up front, then release the connection —
    # crawl_hotel() opens its own connection per hotel.
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    cur = conn.cursor()
    cur.execute("""
        SELECT h.id, h.full_name, h.website_address
        FROM hotel_main h
        JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
        WHERE h.region_name = 'г. Санкт-Петербург'
        AND hwm.crawl_status = 'failed'
        ORDER BY h.full_name
    """)
    rows = cur.fetchall()
    total = len(rows)
    cur.close()
    conn.close()

    logging.info("=" * 60)
    logging.info("🔄 ПЕРЕКРАУЛИНГ FAILED ОТЕЛЕЙ ПИТЕРА")
    logging.info("=" * 60)
    logging.info(f"Всего отелей: {total}")
    logging.info("")

    ok = 0
    bad = 0
    for idx, row in enumerate(rows, start=1):
        logging.info(f"🏨 [{idx}/{total}] {row['full_name']}")
        if crawl_hotel(row['id'], row['full_name'], row['website_address']):
            ok += 1
        else:
            bad += 1
        # Progress summary every 10 hotels.
        if idx % 10 == 0:
            logging.info(f" 📊 Прогресс: {ok} успешно, {bad} ошибок")

    logging.info("")
    logging.info("=" * 60)
    logging.info("🎉 ПЕРЕКРАУЛИНГ ЗАВЕРШЁН")
    logging.info("=" * 60)
    logging.info(f"✅ Успешно: {ok}")
    logging.info(f"❌ Ошибок: {bad}")
    logging.info(f"📊 Успех: {ok*100//total if total else 0}%")
if __name__ == '__main__':
    # Run only when executed as a script, not when imported.
    main()