Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py
- Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py
- РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py
- Отчёты: create_orel_horizontal_report.py
- Обработка: process_all_hotels_embeddings.py
- Документация: README.md, DB_SCHEMA_REFERENCE.md
This commit is contained in:
196
scraper_missing.py
Normal file
196
scraper_missing.py
Normal file
@@ -0,0 +1,196 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Парсер для ОСТАВШИХСЯ необработанных отелей
|
||||
С автоматическим переподключением к БД
|
||||
"""
|
||||
|
||||
import json
import logging
import os
import time
from datetime import datetime
from typing import Optional, Dict, List
from urllib.parse import unquote

import psycopg2
import requests
from psycopg2.extras import execute_batch, Json
|
||||
|
||||
# Every record at INFO and above goes both to a per-run, timestamped log file
# and to the console stream.
_log_handlers = [
    logging.FileHandler(f'scraper_missing_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)

# Module-level logger used by everything in this script.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Database connection settings passed verbatim to psycopg2.connect().
# Each value can be overridden through the environment so that credentials do
# not have to live in the source tree; the literals below are only the
# historical fallbacks, kept so existing invocations keep working.
# NOTE(security): the fallback password is committed to version control --
# rotate it and supply HOTELS_DB_PASSWORD instead.
DB_CONFIG = {
    'host': os.environ.get('HOTELS_DB_HOST', "147.45.189.234"),
    'port': int(os.environ.get('HOTELS_DB_PORT', 5432)),
    'database': os.environ.get('HOTELS_DB_NAME', "default_db"),
    'user': os.environ.get('HOTELS_DB_USER', "gen_user"),
    # The fallback is stored percent-encoded; an environment override is
    # used exactly as given.
    'password': os.environ.get('HOTELS_DB_PASSWORD', unquote("2~~9_%5EkVsU%3F2%5CS")),
}

# Root URL of the REST API queried for hotel details.
API_BASE_URL = "https://tourism.fsa.gov.ru/api/v1"

# Pause, in seconds, taken before every API request.
RATE_LIMIT_DELAY = 0.1

# Number of hotels collected before they are written to the database.
BATCH_SIZE = 50
|
||||
|
||||
|
||||
class MissingScraper:
    """Download API details for hotels that have no ``hotel_raw_json`` row.

    The scraper selects ids from ``hotel_main`` that lack a matching row in
    ``hotel_raw_json``, fetches four JSON documents per hotel from the API
    and upserts them in batches.  A fresh database connection is opened
    before every read and every write.
    """

    # Keys of the per-hotel result that hold API payloads.
    _PAYLOAD_KEYS = ('main', 'additional_info', 'sanatorium', 'drawer')

    def __init__(self, limit=None, offset=0):
        """Create a scraper.

        Args:
            limit: Maximum number of hotel ids to process; ``None`` or ``0``
                means no limit.
            offset: Number of leading ids (in ``ORDER BY m.id`` order) to skip.
        """
        self.limit = limit
        self.offset = offset
        self.conn = None
        self.session = requests.Session()
        self.processed = 0
        self.errors = 0

    def _close_db(self):
        """Close the current connection, tolerating an already-dead link."""
        conn, self.conn = self.conn, None
        if conn is None:
            return
        try:
            conn.close()
        except Exception as e:
            # Best effort: the connection is being thrown away anyway.
            logger.debug(f"Не удалось закрыть соединение с БД: {e}")

    def close(self):
        """Release the HTTP session and the database connection."""
        self.session.close()
        self._close_db()

    def reconnect_db(self):
        """Drop the current connection (if any) and open a new one.

        Raises:
            psycopg2.Error: if the new connection cannot be established.
        """
        self._close_db()
        self.conn = psycopg2.connect(**DB_CONFIG)

    @staticmethod
    def _missing_ids_query(limit, offset):
        """Build the SELECT for unprocessed hotel ids.

        Args:
            limit: Row cap; falsy means no ``LIMIT`` clause.
            offset: Rows to skip; falsy means no ``OFFSET`` clause.

        Returns:
            A ``(sql, params)`` pair where ``params`` is a tuple of bind
            values matching the ``%s`` placeholders in ``sql``.
        """
        sql = """
            SELECT m.id
            FROM hotel_main m
            LEFT JOIN hotel_raw_json r ON m.id = r.hotel_id
            WHERE r.hotel_id IS NULL
            ORDER BY m.id
        """
        params = []
        if limit:
            sql += " LIMIT %s"
            params.append(int(limit))
        # OFFSET is applied on its own too, so an offset given without a
        # limit is no longer silently ignored.
        if offset:
            sql += " OFFSET %s"
            params.append(int(offset))
        return sql, tuple(params)

    def get_missing_hotel_ids(self):
        """Return ids of hotels that have no ``hotel_raw_json`` row.

        Returns:
            List of ids ordered by id, restricted by ``self.limit`` and
            ``self.offset``.

        Raises:
            psycopg2.Error: if connecting or querying fails.
        """
        self.reconnect_db()
        sql, params = self._missing_ids_query(self.limit, self.offset)
        with self.conn.cursor() as cur:
            cur.execute(sql, params or None)
            return [row[0] for row in cur.fetchall()]

    def api_request(self, url: str) -> Optional[Dict]:
        """GET ``url`` and return the decoded JSON body.

        Sleeps ``RATE_LIMIT_DELAY`` seconds before each request.

        Args:
            url: Absolute URL to fetch.

        Returns:
            The parsed JSON, or ``None`` when the request fails, the server
            answers with an error status, or the body is not valid JSON.
        """
        time.sleep(RATE_LIMIT_DELAY)
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # Only request/decoding failures are treated as "no data";
            # KeyboardInterrupt and programming errors still propagate.
            logger.debug(f"Ошибка запроса {url}: {e}")
            return None

    def get_hotel_details(self, hotel_id: str) -> Dict:
        """Fetch the four API documents for one hotel.

        Args:
            hotel_id: Identifier substituted into the API paths.

        Returns:
            Dict with ``hotel_id`` plus ``main``, ``additional_info``,
            ``sanatorium`` and ``drawer``; each of the latter is the parsed
            JSON or ``None`` if that request failed.
        """
        return {
            'hotel_id': hotel_id,
            'main': self.api_request(f"{API_BASE_URL}/resorts/hotels/{hotel_id}/main"),
            'additional_info': self.api_request(f"{API_BASE_URL}/resorts/common/{hotel_id}/additional-info"),
            'sanatorium': self.api_request(f"{API_BASE_URL}/resorts/hotels/{hotel_id}/sanatoriumDrawer"),
            'drawer': self.api_request(f"{API_BASE_URL}/resorts/hotels/{hotel_id}/drawer')"[:-2]),
        }

    @staticmethod
    def _has_payload(details):
        """Return True if at least one API document was actually received."""
        return any(details.get(key) is not None for key in MissingScraper._PAYLOAD_KEYS)

    def save_batch(self, batch: List[Dict]):
        """Upsert a batch of hotel details into ``hotel_raw_json``.

        A new connection is opened for every batch.  Any failure (including
        a failed reconnect) is logged and added to ``self.errors``; it is not
        re-raised, so the caller can continue with the next batch.

        Args:
            batch: Items as returned by ``get_hotel_details``.
        """
        if not batch:
            return

        sql = """
            INSERT INTO hotel_raw_json
            (hotel_id, main_data, additional_info, sanatorium_data, drawer_data)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                main_data = EXCLUDED.main_data,
                additional_info = EXCLUDED.additional_info,
                sanatorium_data = EXCLUDED.sanatorium_data,
                drawer_data = EXCLUDED.drawer_data
        """

        try:
            records = [
                (
                    item['hotel_id'],
                    Json(item['main']),
                    Json(item['additional_info']),
                    Json(item['sanatorium']),
                    Json(item['drawer']),
                )
                for item in batch
            ]
            # Reconnect before every save so a stale connection cannot break it.
            self.reconnect_db()
            with self.conn.cursor() as cur:
                execute_batch(cur, sql, records, page_size=BATCH_SIZE)
            self.conn.commit()
            logger.info(f"✓ Сохранено {len(batch)} отелей")
        except Exception as e:
            logger.error(f"Ошибка сохранения: {e}")
            self.errors += len(batch)
            if self.conn is not None:
                try:
                    self.conn.rollback()
                except psycopg2.Error:
                    # The connection itself is broken; discard it.
                    self._close_db()

    def _log_progress(self, idx, total, start):
        """Log throughput and a time estimate after ``idx`` of ``total`` hotels."""
        elapsed = (datetime.now() - start).total_seconds()
        speed = idx / elapsed if elapsed > 0 else 0.0
        # Guard the ETA against a zero speed instead of dividing by it.
        eta = f"{(total - idx) / speed / 60:.0f} min" if speed > 0 else "n/a"
        logger.info(f"Progress: {idx}/{total} ({idx/total*100:.1f}%) | "
                    f"Speed: {speed:.1f}/sec | ETA: {eta}")

    def run(self):
        """Fetch and store details for every unprocessed hotel.

        Hotels for which no API document at all could be fetched are counted
        as errors and NOT written, so they stay unprocessed and are picked up
        again by a later run.  The database connection and HTTP session are
        released when the run ends.
        """
        start = datetime.now()
        logger.info("🚀 Запуск парсинга НЕОБРАБОТАННЫХ отелей")

        try:
            hotel_ids = self.get_missing_hotel_ids()
            total = len(hotel_ids)
            logger.info(f"📊 Необработанных отелей: {total}")

            if total == 0:
                logger.info("✅ Все отели уже обработаны!")
                return

            batch = []

            for idx, hotel_id in enumerate(hotel_ids, 1):
                try:
                    details = self.get_hotel_details(hotel_id)
                    if self._has_payload(details):
                        batch.append(details)
                        self.processed += 1
                    else:
                        logger.warning(f"Нет данных API для отеля {hotel_id} — пропущен")
                        self.errors += 1

                    if len(batch) >= BATCH_SIZE:
                        # Detach the batch first so it is never re-sent.
                        pending, batch = batch, []
                        self.save_batch(pending)
                except Exception as e:
                    logger.error(f"Ошибка обработки {hotel_id}: {e}")
                    self.errors += 1

                if idx % 100 == 0:
                    self._log_progress(idx, total, start)

            # Flush whatever is left after the loop.
            if batch:
                self.save_batch(batch)

            elapsed = (datetime.now() - start).total_seconds()
            logger.info(f"\n{'='*70}")
            logger.info(f"Завершено: {self.processed}/{total}")
            logger.info(f"Ошибок: {self.errors}")
            logger.info(f"Время: {elapsed/60:.1f} минут")
            logger.info(f"{'='*70}")
        finally:
            self.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Optional positional arguments: [limit] [offset].
    cli_args = sys.argv[1:]
    limit = int(cli_args[0]) if len(cli_args) >= 1 else None
    offset = int(cli_args[1]) if len(cli_args) >= 2 else 0

    scraper = MissingScraper(limit=limit, offset=offset)
    scraper.run()
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user