Files
hotels/retry_spb_failed.py
Фёдор 684fada337 🚀 Full project sync: Hotels RAG & Audit System
 Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
2025-10-27 22:49:42 +03:00

218 lines
8.5 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Перекраулинг failed отелей Питера с более мягкими настройками
"""
import psycopg2
from psycopg2.extras import RealDictCursor
from urllib.parse import unquote
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import logging
import sys
from datetime import datetime
import re
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(f'retry_spb_failed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
logging.StreamHandler(sys.stdout)
]
)
DB_CONFIG = {
'host': '147.45.189.234',
'port': 5432,
'database': 'default_db',
'user': 'gen_user',
'password': unquote('2~~9_%5EkVsU%3F2%5CS')
}
def normalize_url(url):
"""Нормализовать URL"""
if not url:
return None
url = url.strip()
if not url.startswith(('http://', 'https://')):
# Попробуем сначала https
return f'https://{url}'
return url
def try_http_fallback(url):
"""Попробовать HTTP если HTTPS не работает"""
if url.startswith('https://'):
return url.replace('https://', 'http://')
return None
def crawl_hotel(hotel_id, hotel_name, website_address):
"""Краулинг одного отеля"""
url = normalize_url(website_address)
if not url:
logging.warning(f" ⚠️ Нет URL")
return False
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
cur = conn.cursor()
try:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
context = browser.new_context(
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
ignore_https_errors=True, # Игнорировать SSL ошибки
java_script_enabled=True
)
page = context.new_page()
# Пробуем HTTPS
try:
logging.info(f" 🌐 Пробуем: {url}")
page.goto(url, wait_until='domcontentloaded', timeout=60000) # 60 секунд
html = page.content()
if html and len(html) > 100:
# Успешно!
cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
cur.execute("""
INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at)
VALUES (%s, %s, %s, %s)
""", (hotel_id, url, html, datetime.now()))
cur.execute("""
INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at)
VALUES (%s, %s, %s, %s, %s, %s)
ON CONFLICT (hotel_id) DO UPDATE SET
crawl_status = EXCLUDED.crawl_status,
pages_crawled = EXCLUDED.pages_crawled,
total_size_bytes = EXCLUDED.total_size_bytes,
crawl_started_at = EXCLUDED.crawl_started_at,
crawl_finished_at = EXCLUDED.crawl_finished_at,
error_message = NULL
""", (hotel_id, 'completed', 1, len(html), datetime.now(), datetime.now()))
conn.commit()
logging.info(f" ✅ Успешно! {len(html):,} байт")
browser.close()
cur.close()
conn.close()
return True
except Exception as e:
# Пробуем HTTP
http_url = try_http_fallback(url)
if http_url:
try:
logging.info(f" 🔄 Пробуем HTTP: {http_url}")
page.goto(http_url, wait_until='domcontentloaded', timeout=60000)
html = page.content()
if html and len(html) > 100:
cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
cur.execute("""
INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at)
VALUES (%s, %s, %s, %s)
""", (hotel_id, http_url, html, datetime.now()))
cur.execute("""
INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at)
VALUES (%s, %s, %s, %s, %s, %s)
ON CONFLICT (hotel_id) DO UPDATE SET
crawl_status = EXCLUDED.crawl_status,
pages_crawled = EXCLUDED.pages_crawled,
total_size_bytes = EXCLUDED.total_size_bytes,
crawl_started_at = EXCLUDED.crawl_started_at,
crawl_finished_at = EXCLUDED.crawl_finished_at,
error_message = NULL
""", (hotel_id, 'completed', 1, len(html), datetime.now(), datetime.now()))
conn.commit()
logging.info(f" ✅ HTTP сработал! {len(html):,} байт")
browser.close()
cur.close()
conn.close()
return True
except Exception as e2:
logging.error(f" ❌ HTTP тоже не сработал: {str(e2)[:100]}")
raise e # Вернём оригинальную ошибку
else:
raise
browser.close()
except Exception as e:
error_msg = str(e)[:500]
logging.error(f" ❌ Ошибка: {error_msg}")
# Обновить статус как failed
cur.execute("""
INSERT INTO hotel_website_meta (hotel_id, crawl_status, error_message, crawl_started_at, crawl_finished_at)
VALUES (%s, %s, %s, %s, %s)
ON CONFLICT (hotel_id) DO UPDATE SET
crawl_status = EXCLUDED.crawl_status,
error_message = EXCLUDED.error_message,
crawl_started_at = EXCLUDED.crawl_started_at,
crawl_finished_at = EXCLUDED.crawl_finished_at
""", (hotel_id, 'failed', error_msg, datetime.now(), datetime.now()))
conn.commit()
finally:
cur.close()
conn.close()
return False
def main():
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
cur = conn.cursor()
# Получить failed отели
cur.execute("""
SELECT h.id, h.full_name, h.website_address
FROM hotel_main h
JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
WHERE h.region_name = 'г. Санкт-Петербург'
AND hwm.crawl_status = 'failed'
ORDER BY h.full_name
""")
hotels = cur.fetchall()
total = len(hotels)
cur.close()
conn.close()
logging.info("=" * 60)
logging.info("🔄 ПЕРЕКРАУЛИНГ FAILED ОТЕЛЕЙ ПИТЕРА")
logging.info("=" * 60)
logging.info(f"Всего отелей: {total}")
logging.info("")
success = 0
failed = 0
for i, hotel in enumerate(hotels, 1):
logging.info(f"🏨 [{i}/{total}] {hotel['full_name']}")
if crawl_hotel(hotel['id'], hotel['full_name'], hotel['website_address']):
success += 1
else:
failed += 1
if i % 10 == 0:
logging.info(f" 📊 Прогресс: {success} успешно, {failed} ошибок")
logging.info("")
logging.info("=" * 60)
logging.info("🎉 ПЕРЕКРАУЛИНГ ЗАВЕРШЁН")
logging.info("=" * 60)
logging.info(f"✅ Успешно: {success}")
logging.info(f"❌ Ошибок: {failed}")
logging.info(f"📊 Успех: {success*100//total if total else 0}%")
if __name__ == '__main__':
main()