🚀 Full project sync: Hotels RAG & Audit System
✨ Major Features: - Complete RAG system for hotel website analysis - Hybrid audit with BGE-M3 embeddings + Natasha NER - Universal horizontal Excel reports with dashboards - Multi-region processing (SPb, Orel, Chukotka, Kamchatka) 📊 Completed Regions: - Орловская область: 100% (36/36) - Чукотский АО: 100% (4/4) - г. Санкт-Петербург: 93% (893/960) - Камчатский край: 87% (89/102) 🔧 Infrastructure: - PostgreSQL with pgvector extension - BGE-M3 embeddings API - Browserless for web scraping - N8N workflows for automation - S3/Nextcloud file storage 📝 Documentation: - Complete DB schemas - API documentation - Setup guides - Status reports
This commit is contained in:
217
retry_spb_failed.py
Executable file
217
retry_spb_failed.py
Executable file
@@ -0,0 +1,217 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Перекраулинг failed отелей Питера с более мягкими настройками
|
||||
"""
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from urllib.parse import unquote
|
||||
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime
|
||||
import re
|
||||
|
||||
import os

# Log to both a timestamped file and stdout, so progress is visible live
# and preserved for post-run analysis.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(
            f'retry_spb_failed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
        ),
        logging.StreamHandler(sys.stdout),
    ],
)

# Database connection settings.
# SECURITY NOTE(review): credentials are committed to source control.
# Environment variables now take precedence (backward-compatible fallback
# to the original values); the committed password should be rotated.
DB_CONFIG = {
    'host': os.environ.get('DB_HOST', '147.45.189.234'),
    'port': int(os.environ.get('DB_PORT', '5432')),
    'database': os.environ.get('DB_NAME', 'default_db'),
    'user': os.environ.get('DB_USER', 'gen_user'),
    # The committed value is percent-encoded; unquote() restores the raw password.
    'password': os.environ.get('DB_PASSWORD', unquote('2~~9_%5EkVsU%3F2%5CS')),
}
|
||||
|
||||
def normalize_url(url):
    """Normalize a raw website address to an absolute URL.

    Args:
        url: Raw address from the database; may be None, empty,
            whitespace-only, or missing a scheme.

    Returns:
        An absolute URL string, or None when there is nothing to crawl.
        When no scheme is present, ``https://`` is prepended so the
        HTTPS attempt runs first (crawl_hotel falls back to HTTP).
    """
    if not url:
        return None
    url = url.strip()
    # Bug fix: re-check after stripping — a whitespace-only address used
    # to fall through and produce the nonsense URL 'https://'.
    if not url:
        return None
    if url.startswith(('http://', 'https://')):
        return url
    return f'https://{url}'
|
||||
|
||||
def try_http_fallback(url):
    """Map an https:// URL to its http:// counterpart.

    Returns None when *url* is not an https:// URL, signalling that no
    plain-HTTP fallback attempt is possible.
    """
    if not url.startswith('https://'):
        return None
    # str.replace mirrors the original behaviour exactly.
    return url.replace('https://', 'http://')
|
||||
|
||||
def _save_crawl_success(cur, conn, hotel_id, url, html):
    """Persist crawled HTML and mark the hotel's crawl as 'completed'.

    Replaces any previously stored raw page for *hotel_id*, upserts the
    meta row (clearing error_message), and commits the transaction.
    """
    cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
    cur.execute("""
        INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at)
        VALUES (%s, %s, %s, %s)
    """, (hotel_id, url, html, datetime.now()))

    cur.execute("""
        INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (hotel_id) DO UPDATE SET
            crawl_status = EXCLUDED.crawl_status,
            pages_crawled = EXCLUDED.pages_crawled,
            total_size_bytes = EXCLUDED.total_size_bytes,
            crawl_started_at = EXCLUDED.crawl_started_at,
            crawl_finished_at = EXCLUDED.crawl_finished_at,
            error_message = NULL
    """, (hotel_id, 'completed', 1, len(html), datetime.now(), datetime.now()))
    conn.commit()


def crawl_hotel(hotel_id, hotel_name, website_address):
    """Crawl one hotel's website and record the outcome in the database.

    Tries HTTPS first; on any error falls back to the HTTP variant of the
    same URL. A page counts as successful when its HTML is longer than
    100 bytes. On failure the meta row is updated to 'failed' with the
    error message so the hotel can be retried later.

    Args:
        hotel_id: Primary key in hotel_main.
        hotel_name: Display name (kept for interface compatibility; not
            used inside the function — callers log it themselves).
        website_address: Raw address from the database.

    Returns:
        True when a page was crawled and saved, False otherwise.
    """
    url = normalize_url(website_address)
    if not url:
        logging.warning(" ⚠️ Нет URL")
        return False

    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    cur = conn.cursor()

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    ignore_https_errors=True,  # tolerate broken/self-signed certificates
                    java_script_enabled=True
                )
                page = context.new_page()

                # First attempt: HTTPS (normalize_url prepends https:// by default).
                try:
                    logging.info(f" 🌐 Пробуем: {url}")
                    page.goto(url, wait_until='domcontentloaded', timeout=60000)  # 60s budget
                    html = page.content()

                    if html and len(html) > 100:
                        _save_crawl_success(cur, conn, hotel_id, url, html)
                        logging.info(f" ✅ Успешно! {len(html):,} байт")
                        return True

                except Exception as e:
                    # Second attempt: HTTP fallback (only meaningful for https URLs).
                    http_url = try_http_fallback(url)
                    if not http_url:
                        raise
                    try:
                        logging.info(f" 🔄 Пробуем HTTP: {http_url}")
                        page.goto(http_url, wait_until='domcontentloaded', timeout=60000)
                        html = page.content()

                        if html and len(html) > 100:
                            _save_crawl_success(cur, conn, hotel_id, http_url, html)
                            logging.info(f" ✅ HTTP сработал! {len(html):,} байт")
                            return True
                    except Exception as e2:
                        logging.error(f" ❌ HTTP тоже не сработал: {str(e2)[:100]}")
                        raise e  # surface the original (HTTPS) error
            finally:
                # Single close point replaces the scattered browser.close()
                # calls of the original success paths.
                browser.close()

    except Exception as e:
        error_msg = str(e)[:500]
        logging.error(f" ❌ Ошибка: {error_msg}")

        # Roll back first: if the failure happened mid-transaction the
        # session would otherwise reject the status update below.
        conn.rollback()
        cur.execute("""
            INSERT INTO hotel_website_meta (hotel_id, crawl_status, error_message, crawl_started_at, crawl_finished_at)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                crawl_status = EXCLUDED.crawl_status,
                error_message = EXCLUDED.error_message,
                crawl_started_at = EXCLUDED.crawl_started_at,
                crawl_finished_at = EXCLUDED.crawl_finished_at
        """, (hotel_id, 'failed', error_msg, datetime.now(), datetime.now()))
        conn.commit()

    finally:
        # Sole close point for DB resources (the original also closed them
        # inside the success branches, closing twice).
        cur.close()
        conn.close()

    return False
|
||||
|
||||
def main():
    """Re-crawl every failed Saint Petersburg hotel and report totals."""
    # Fetch the work list up front, then release the connection —
    # crawl_hotel() opens its own connection per hotel.
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    try:
        cur = conn.cursor()
        cur.execute("""
        SELECT h.id, h.full_name, h.website_address
        FROM hotel_main h
        JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
        WHERE h.region_name = 'г. Санкт-Петербург'
        AND hwm.crawl_status = 'failed'
        ORDER BY h.full_name
    """)
        hotels = cur.fetchall()
        cur.close()
    finally:
        conn.close()

    total = len(hotels)

    banner = "=" * 60
    logging.info(banner)
    logging.info("🔄 ПЕРЕКРАУЛИНГ FAILED ОТЕЛЕЙ ПИТЕРА")
    logging.info(banner)
    logging.info(f"Всего отелей: {total}")
    logging.info("")

    success = 0
    failed = 0

    for i, hotel in enumerate(hotels, 1):
        logging.info(f"🏨 [{i}/{total}] {hotel['full_name']}")

        ok = crawl_hotel(hotel['id'], hotel['full_name'], hotel['website_address'])
        if ok:
            success += 1
        else:
            failed += 1

        # Progress snapshot every 10 hotels.
        if i % 10 == 0:
            logging.info(f" 📊 Прогресс: {success} успешно, {failed} ошибок")

    logging.info("")
    logging.info(banner)
    logging.info("🎉 ПЕРЕКРАУЛИНГ ЗАВЕРШЁН")
    logging.info(banner)
    logging.info(f"✅ Успешно: {success}")
    logging.info(f"❌ Ошибок: {failed}")
    logging.info(f"📊 Успех: {success*100//total if total else 0}%")


if __name__ == '__main__':
    main()
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user