197 lines
6.5 KiB
Python
197 lines
6.5 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Парсер для ОСТАВШИХСЯ необработанных отелей
|
|||
|
|
С автоматическим переподключением к БД
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import requests
|
|||
|
|
import psycopg2
|
|||
|
|
from psycopg2.extras import execute_batch, Json
|
|||
|
|
import time
|
|||
|
|
import logging
|
|||
|
|
from datetime import datetime
|
|||
|
|
from urllib.parse import unquote
|
|||
|
|
from typing import Optional, Dict, List
|
|||
|
|
import json
|
|||
|
|
|
|||
|
|
logging.basicConfig(
|
|||
|
|
level=logging.INFO,
|
|||
|
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
|||
|
|
handlers=[
|
|||
|
|
logging.FileHandler(f'scraper_missing_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
|
|||
|
|
logging.StreamHandler()
|
|||
|
|
]
|
|||
|
|
)
|
|||
|
|
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
DB_CONFIG = {
|
|||
|
|
'host': "147.45.189.234",
|
|||
|
|
'port': 5432,
|
|||
|
|
'database': "default_db",
|
|||
|
|
'user': "gen_user",
|
|||
|
|
'password': unquote("2~~9_%5EkVsU%3F2%5CS")
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
API_BASE_URL = "https://tourism.fsa.gov.ru/api/v1"
|
|||
|
|
RATE_LIMIT_DELAY = 0.1
|
|||
|
|
BATCH_SIZE = 50
|
|||
|
|
|
|||
|
|
|
|||
|
|
class MissingScraper:
|
|||
|
|
def __init__(self, limit=None, offset=0):
|
|||
|
|
self.limit = limit
|
|||
|
|
self.offset = offset
|
|||
|
|
self.conn = None
|
|||
|
|
self.session = requests.Session()
|
|||
|
|
self.processed = 0
|
|||
|
|
self.errors = 0
|
|||
|
|
|
|||
|
|
def reconnect_db(self):
|
|||
|
|
"""Переподключение к БД"""
|
|||
|
|
if self.conn:
|
|||
|
|
try:
|
|||
|
|
self.conn.close()
|
|||
|
|
except:
|
|||
|
|
pass
|
|||
|
|
self.conn = psycopg2.connect(**DB_CONFIG)
|
|||
|
|
|
|||
|
|
def get_missing_hotel_ids(self):
|
|||
|
|
"""Получить ID необработанных отелей"""
|
|||
|
|
self.reconnect_db()
|
|||
|
|
cur = self.conn.cursor()
|
|||
|
|
|
|||
|
|
sql = """
|
|||
|
|
SELECT m.id
|
|||
|
|
FROM hotel_main m
|
|||
|
|
LEFT JOIN hotel_raw_json r ON m.id = r.hotel_id
|
|||
|
|
WHERE r.hotel_id IS NULL
|
|||
|
|
ORDER BY m.id
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
if self.limit:
|
|||
|
|
sql += f" LIMIT {self.limit} OFFSET {self.offset}"
|
|||
|
|
|
|||
|
|
cur.execute(sql)
|
|||
|
|
ids = [row[0] for row in cur.fetchall()]
|
|||
|
|
cur.close()
|
|||
|
|
return ids
|
|||
|
|
|
|||
|
|
def api_request(self, url: str) -> Optional[Dict]:
|
|||
|
|
"""API запрос"""
|
|||
|
|
time.sleep(RATE_LIMIT_DELAY)
|
|||
|
|
try:
|
|||
|
|
response = self.session.get(url, timeout=30)
|
|||
|
|
response.raise_for_status()
|
|||
|
|
return response.json()
|
|||
|
|
except:
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
def get_hotel_details(self, hotel_id: str) -> Dict:
|
|||
|
|
"""Получить детали отеля"""
|
|||
|
|
return {
|
|||
|
|
'hotel_id': hotel_id,
|
|||
|
|
'main': self.api_request(f"{API_BASE_URL}/resorts/hotels/{hotel_id}/main"),
|
|||
|
|
'additional_info': self.api_request(f"{API_BASE_URL}/resorts/common/{hotel_id}/additional-info"),
|
|||
|
|
'sanatorium': self.api_request(f"{API_BASE_URL}/resorts/hotels/{hotel_id}/sanatoriumDrawer"),
|
|||
|
|
'drawer': self.api_request(f"{API_BASE_URL}/resorts/hotels/{hotel_id}/drawer")
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
def save_batch(self, batch: List[Dict]):
|
|||
|
|
"""Сохранить батч с переподключением"""
|
|||
|
|
if not batch:
|
|||
|
|
return
|
|||
|
|
|
|||
|
|
# Переподключаемся перед каждым сохранением
|
|||
|
|
self.reconnect_db()
|
|||
|
|
cur = self.conn.cursor()
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
# Сохраняем в hotel_raw_json
|
|||
|
|
records = [(item['hotel_id'], Json(item['main']), Json(item['additional_info']),
|
|||
|
|
Json(item['sanatorium']), Json(item['drawer'])) for item in batch]
|
|||
|
|
|
|||
|
|
sql = """
|
|||
|
|
INSERT INTO hotel_raw_json
|
|||
|
|
(hotel_id, main_data, additional_info, sanatorium_data, drawer_data)
|
|||
|
|
VALUES (%s, %s, %s, %s, %s)
|
|||
|
|
ON CONFLICT (hotel_id) DO UPDATE SET
|
|||
|
|
main_data = EXCLUDED.main_data,
|
|||
|
|
additional_info = EXCLUDED.additional_info,
|
|||
|
|
sanatorium_data = EXCLUDED.sanatorium_data,
|
|||
|
|
drawer_data = EXCLUDED.drawer_data
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
execute_batch(cur, sql, records, page_size=BATCH_SIZE)
|
|||
|
|
self.conn.commit()
|
|||
|
|
logger.info(f"✓ Сохранено {len(batch)} отелей")
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.error(f"Ошибка сохранения: {e}")
|
|||
|
|
self.conn.rollback()
|
|||
|
|
self.errors += len(batch)
|
|||
|
|
finally:
|
|||
|
|
cur.close()
|
|||
|
|
|
|||
|
|
def run(self):
|
|||
|
|
"""Запуск"""
|
|||
|
|
start = datetime.now()
|
|||
|
|
logger.info(f"🚀 Запуск парсинга НЕОБРАБОТАННЫХ отелей")
|
|||
|
|
|
|||
|
|
# Получаем список необработанных
|
|||
|
|
hotel_ids = self.get_missing_hotel_ids()
|
|||
|
|
total = len(hotel_ids)
|
|||
|
|
logger.info(f"📊 Необработанных отелей: {total}")
|
|||
|
|
|
|||
|
|
if total == 0:
|
|||
|
|
logger.info("✅ Все отели уже обработаны!")
|
|||
|
|
return
|
|||
|
|
|
|||
|
|
batch = []
|
|||
|
|
|
|||
|
|
for idx, hotel_id in enumerate(hotel_ids, 1):
|
|||
|
|
try:
|
|||
|
|
details = self.get_hotel_details(hotel_id)
|
|||
|
|
batch.append(details)
|
|||
|
|
self.processed += 1
|
|||
|
|
|
|||
|
|
# Сохраняем батч
|
|||
|
|
if len(batch) >= BATCH_SIZE:
|
|||
|
|
self.save_batch(batch)
|
|||
|
|
batch = []
|
|||
|
|
|
|||
|
|
# Прогресс
|
|||
|
|
if idx % 100 == 0:
|
|||
|
|
elapsed = (datetime.now() - start).total_seconds()
|
|||
|
|
speed = self.processed / elapsed
|
|||
|
|
eta_min = (total - idx) / speed / 60
|
|||
|
|
logger.info(f"Progress: {idx}/{total} ({idx/total*100:.1f}%) | "
|
|||
|
|
f"Speed: {speed:.1f}/sec | ETA: {eta_min:.0f} min")
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.error(f"Ошибка обработки {hotel_id}: {e}")
|
|||
|
|
self.errors += 1
|
|||
|
|
|
|||
|
|
# Остаток
|
|||
|
|
if batch:
|
|||
|
|
self.save_batch(batch)
|
|||
|
|
|
|||
|
|
elapsed = (datetime.now() - start).total_seconds()
|
|||
|
|
logger.info(f"\n{'='*70}")
|
|||
|
|
logger.info(f"Завершено: {self.processed}/{total}")
|
|||
|
|
logger.info(f"Ошибок: {self.errors}")
|
|||
|
|
logger.info(f"Время: {elapsed/60:.1f} минут")
|
|||
|
|
logger.info(f"{'='*70}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
|
|||
|
|
import sys
|
|||
|
|
limit = int(sys.argv[1]) if len(sys.argv) > 1 else None
|
|||
|
|
offset = int(sys.argv[2]) if len(sys.argv) > 2 else 0
|
|||
|
|
|
|||
|
|
scraper = MissingScraper(limit=limit, offset=offset)
|
|||
|
|
scraper.run()
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|