✨ Major Features: - Complete RAG system for hotel website analysis - Hybrid audit with BGE-M3 embeddings + Natasha NER - Universal horizontal Excel reports with dashboards - Multi-region processing (SPb, Orel, Chukotka, Kamchatka) 📊 Completed Regions: - Орловская область: 100% (36/36) - Чукотский АО: 100% (4/4) - г. Санкт-Петербург: 93% (893/960) - Камчатский край: 87% (89/102) 🔧 Infrastructure: - PostgreSQL with pgvector extension - BGE-M3 embeddings API - Browserless for web scraping - N8N workflows for automation - S3/Nextcloud file storage 📝 Documentation: - Complete DB schemas - API documentation - Setup guides - Status reports
117 lines
3.6 KiB
Python
117 lines
3.6 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Перекраулинг отелей со статусом 'failed'
|
||
Более мягкие настройки: HTTP fallback, игнорирование SSL ошибок
|
||
"""
|
||
|
||
import logging
import os
from collections import Counter
from datetime import datetime
from urllib.parse import unquote

import psycopg2
from psycopg2.extras import RealDictCursor
|
||
|
||
# Logging setup: every run writes to its own timestamped log file and echoes
# the same records to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # The messages contain Cyrillic text and emoji, so the file encoding
        # is pinned to UTF-8 instead of relying on the platform default.
        logging.FileHandler(
            f'retry_failed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            encoding='utf-8',
        ),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
||
|
||
# Database connection settings, passed as keyword arguments to
# psycopg2.connect().
#
# Every value can be overridden through an environment variable so that
# credentials do not have to live in the source tree. The literals below are
# only fallbacks that keep existing invocations working.
# NOTE(review): the fallback password is committed to version control; it
# should be rotated and this default removed once HOTEL_DB_PASSWORD is
# configured wherever the script runs.
DB_CONFIG = {
    'host': os.environ.get('HOTEL_DB_HOST', '147.45.189.234'),
    'port': int(os.environ.get('HOTEL_DB_PORT', '5432')),
    'database': os.environ.get('HOTEL_DB_NAME', 'default_db'),
    'user': os.environ.get('HOTEL_DB_USER', 'gen_user'),
    # The fallback literal is stored percent-encoded, hence unquote(); a
    # value taken from the environment is used verbatim.
    'password': os.environ.get(
        'HOTEL_DB_PASSWORD',
        unquote('2~~9_%5EkVsU%3F2%5CS'),
    ),
}
|
||
|
||
|
||
def get_failed_hotels(region_name=None):
    """Return the hotels whose website crawl is marked as failed.

    Args:
        region_name: Optional region to restrict the result to; it is
            compared for equality with ``hotel_main.region_name``. When it
            is omitted (or empty), failed hotels of every region are
            returned.

    Returns:
        The list of rows produced by ``fetchall()`` on a ``RealDictCursor``;
        each row exposes the selected columns ``id``, ``full_name``,
        ``website_address`` and ``error_message`` by name.

    Raises:
        psycopg2.Error: If connecting or running the query fails. The
            connection is closed before the error propagates.
    """
    query = """
        SELECT h.id, h.full_name, h.website_address, hwm.error_message
        FROM hotel_main h
        INNER JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
        WHERE hwm.crawl_status = 'failed'
    """
    params = None
    if region_name:
        # The region is sent as a bound parameter, never interpolated into
        # the SQL text.
        query += " AND h.region_name = %s"
        params = (region_name,)

    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    try:
        # The cursor context manager closes the cursor on exit. The
        # connection is closed explicitly in ``finally`` so that a failing
        # query does not leak it.
        with conn.cursor() as cur:
            cur.execute(query, params)
            return cur.fetchall()
    finally:
        conn.close()
|
||
|
||
|
||
def _count_error_types(hotels):
    """Count hotels per error type (the message text before the first ':')."""
    return Counter(
        (hotel['error_message'] or 'Unknown').partition(':')[0]
        for hotel in hotels
    )


def main():
    """Report failed hotels and write them to a list file for the crawler.

    The optional first command-line argument is a region name; without it
    the failed hotels of all regions are processed. The function logs how
    many hotels failed, a breakdown by error type (most frequent first),
    writes one tab-separated ``id``/``full_name``/``website_address`` line
    per hotel to a timestamped ``failed_hotels_*.txt`` file in the current
    directory, and finally logs the first ten hotels.
    """
    import sys

    region = sys.argv[1] if len(sys.argv) > 1 else None

    logger.info("=" * 70)
    logger.info("🔄 ПЕРЕКРАУЛИНГ FAILED ОТЕЛЕЙ")
    if region:
        logger.info(f"📍 Регион: {region}")
    else:
        logger.info("📍 Регион: ВСЕ")
    logger.info("=" * 70)

    # Fetch the hotels whose crawl failed.
    hotels = get_failed_hotels(region)
    logger.info(f"\n📊 Найдено {len(hotels)} failed отелей")

    if not hotels:
        logger.info("✅ Нет failed отелей!")
        return

    # Error statistics, most frequent error type first.
    logger.info("\n📊 СТАТИСТИКА ОШИБОК:")
    for error_type, count in _count_error_types(hotels).most_common():
        logger.info(f" {error_type}: {count}")

    # Save the list to a file for the crawler.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # The region comes from the command line; path separators are replaced
    # so that the value cannot turn the file name into a path.
    region_tag = (region or 'all').replace('/', '_').replace('\\', '_')
    filename = f"failed_hotels_{region_tag}_{timestamp}.txt"

    # Hotel names are Cyrillic, so the encoding is pinned to UTF-8 instead
    # of depending on the platform default.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{hotel['id']}\t{hotel['full_name']}\t{hotel['website_address']}\n"
            for hotel in hotels
        )

    logger.info(f"\n💾 Список сохранён в: {filename}")
    logger.info("\n📋 ЗАПУСК КРАУЛЕРА:")
    logger.info(" Можно запустить smart_crawler.py с этим списком")
    logger.info(" Или использовать single_hotel_crawler.py для каждого отеля")

    # Show the first 10 hotels.
    logger.info("\n📋 ПЕРВЫЕ 10 ОТЕЛЕЙ:")
    for i, hotel in enumerate(hotels[:10], 1):
        logger.info(f" {i}. {hotel['full_name']}")
        logger.info(f" Сайт: {hotel['website_address']}")
        logger.info(f" ID: {hotel['id']}")
|
||
|
||
|
||
# Script entry point: run the report only when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||
|
||
|
||
|