Files
hotels/scraper_detailed.py

482 lines
18 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Сбор детальной информации по всем отелям
Собирает данные из 4 endpoint'ов для каждого отеля
"""
import json
import logging
import os
import time
from datetime import datetime
from typing import Optional, Dict, List
from urllib.parse import unquote

import psycopg2
import requests
from psycopg2.extras import execute_batch, Json
# Logging setup: each run writes to its own timestamped file and to the console.
# The file is opened as UTF-8 explicitly because log messages contain Cyrillic
# text and symbols such as "✓" that a legacy locale encoding cannot represent.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(
            f'scraper_detailed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            encoding='utf-8',
        ),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Database connection parameters.
# Every value can be overridden through an environment variable
# (HOTELS_DB_HOST, HOTELS_DB_PORT, HOTELS_DB_NAME, HOTELS_DB_USER,
# HOTELS_DB_PASSWORD); the literals below are only the historical defaults.
# NOTE(review): the default password is committed to the repository — rotate it
# and supply it via HOTELS_DB_PASSWORD, then drop the literal.
DB_CONFIG = {
    'host': os.environ.get('HOTELS_DB_HOST', "147.45.189.234"),
    'port': int(os.environ.get('HOTELS_DB_PORT', 5432)),
    'database': os.environ.get('HOTELS_DB_NAME', "default_db"),
    'user': os.environ.get('HOTELS_DB_USER', "gen_user"),
    # The default is stored percent-encoded, hence unquote(); the environment
    # variable, when set, is taken verbatim.
    'password': os.environ.get('HOTELS_DB_PASSWORD') or unquote("2~~9_%5EkVsU%3F2%5CS")
}

API_BASE_URL = "https://tourism.fsa.gov.ru/api/v1"
RATE_LIMIT_DELAY = 0.1  # pause before every request: at most 10 requests/second (be gentle)
BATCH_SIZE = 100
CHECKPOINT_INTERVAL = 1000  # log progress every N hotels; the full run is long
class DetailedScraper:
    """Collect per-hotel details from the tourism API and store them in PostgreSQL.

    For each id read from ``hotel_main`` the scraper queries four endpoints
    (``main``, ``additional-info``, ``sanatoriumDrawer``, ``drawer``), buffers
    the answers and writes them out in batches of ``BATCH_SIZE`` hotels.
    """

    # (result key, URL path template) pairs, requested in this order per hotel.
    _ENDPOINTS = (
        ('main', 'resorts/hotels/{hotel_id}/main'),
        ('additional_info', 'resorts/common/{hotel_id}/additional-info'),
        ('sanatorium', 'resorts/hotels/{hotel_id}/sanatoriumDrawer'),
        ('drawer', 'resorts/hotels/{hotel_id}/drawer'),  # services and rooms
    )

    # Keys of the additional-info payload, in hotel_additional_info column
    # order (after hotel_id).
    _ADDITIONAL_INFO_KEYS = (
        'ownerOgrn', 'ownerInn', 'ownerKpp', 'ownerShortName',
        'ownerPhone', 'ownerEmail', 'resortFullName', 'ownerAddressName',
        'ownerLegalTypeId', 'phone', 'email',
    )

    # Keys of the nested ``sanatoriumInfo`` object, in hotel_sanatorium column
    # order (after hotel_id).
    _SANATORIUM_INFO_KEYS = (
        'oid', 'fullName', 'shortName', 'ogrn', 'inn', 'legalAddress',
        'actualAddress', 'phone', 'email', 'webSite', 'medicalLicense',
        'farmLicense', 'terrenkur', 'resortName', 'hasWaterSupply',
        'hasHeating', 'hasSewage', 'hasAirConditioning', 'hasElevator',
        'hasTelephone', 'hasInternet', 'hasMobilityLift', 'hasGym',
        'hasConferenceRoom',
    )

    # Top-level keys of the sanatorium payload that are stored as JSON columns.
    _SANATORIUM_JSON_KEYS = (
        'swimmingPoolInfo', 'plageInfo', 'landDocumentInfo', 'roomsInfo',
    )

    def __init__(self):
        """Create the HTTP session and reset the counters; no network or DB I/O."""
        self.conn = None
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; HotelDataCollector/1.0)'
        })
        self.processed_count = 0
        self.error_count = 0
        self.start_time = None

    def connect_db(self):
        """Open the PostgreSQL connection described by ``DB_CONFIG``."""
        self.conn = psycopg2.connect(**DB_CONFIG)
        logger.info("✓ Подключено к базе данных")

    def api_request(self, url: str) -> Optional[Dict]:
        """GET *url* and return the decoded JSON body, or ``None`` on failure.

        Sleeps ``RATE_LIMIT_DELAY`` seconds before every call as a simple rate
        limit. Transport errors, non-2xx statuses and undecodable bodies are
        logged at DEBUG level and reported as ``None`` rather than raised.
        """
        time.sleep(RATE_LIMIT_DELAY)
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding errors on requests versions where
            # they are not RequestException subclasses.
            logger.debug(f"API request failed: {url} - {e}")
            return None

    def get_hotel_ids(self, limit=None, offset=0):
        """Return hotel ids from ``hotel_main`` ordered by id.

        Args:
            limit: maximum number of ids; a falsy value (``None`` or ``0``)
                means "no limit".
            offset: number of leading ids to skip. It is applied even when no
                limit is given.
        """
        sql = "SELECT id FROM hotel_main ORDER BY id"
        params = []
        # Values go through query parameters instead of string formatting.
        if limit:
            sql += " LIMIT %s"
            params.append(limit)
        if offset:
            sql += " OFFSET %s"
            params.append(offset)

        cur = self.conn.cursor()
        try:
            cur.execute(sql, params or None)
            return [row[0] for row in cur.fetchall()]
        finally:
            cur.close()

    def get_detailed_info(self, hotel_id: str) -> Dict:
        """Query all four endpoints for *hotel_id*.

        Returns a dict with ``hotel_id`` plus the keys ``main``,
        ``additional_info``, ``sanatorium`` and ``drawer``; each holds the
        decoded payload or ``None`` when that request failed.
        """
        result = {'hotel_id': hotel_id}
        for key, path in self._ENDPOINTS:
            url = f"{API_BASE_URL}/{path.format(hotel_id=hotel_id)}"
            result[key] = self.api_request(url)
        return result

    def _write_rows(self, sql, rows, page_size=None):
        """Execute *sql* for every tuple in *rows* and commit.

        Does nothing when *rows* is empty. The cursor is closed even if the
        statement fails; rolling back is left to the caller (``save_batch``).
        """
        if not rows:
            return
        cur = self.conn.cursor()
        try:
            execute_batch(cur, sql, rows, page_size=page_size or BATCH_SIZE)
            self.conn.commit()
        finally:
            cur.close()

    def save_main_updates(self, data_list: List[Dict]):
        """Update contact columns of ``hotel_main`` from the ``main`` payloads.

        Items whose ``main`` payload is missing or not a dict are skipped.
        """
        if not data_list:
            return

        updates = []
        for item in data_list:
            main = item.get('main')
            if not main or not isinstance(main, dict):
                continue
            updates.append((
                main.get('shortName'),
                main.get('phone'),
                main.get('email'),
                main.get('websiteAddress'),
                main.get('ownerFullName'),
                item['hotel_id'],
            ))

        sql = """
            UPDATE hotel_main SET
                short_name = %s,
                phone = %s,
                email = %s,
                website_address = %s,
                owner_full_name = %s,
                updated_at = CURRENT_TIMESTAMP
            WHERE id = %s
        """
        self._write_rows(sql, updates)

    def save_additional_info(self, data_list: List[Dict]):
        """Upsert ``hotel_additional_info`` rows from the additional-info payloads.

        Items whose payload is missing or not a dict are skipped.
        """
        if not data_list:
            return

        records = []
        for item in data_list:
            info = item.get('additional_info')
            if not info or not isinstance(info, dict):
                continue
            records.append(
                (item['hotel_id'],)
                + tuple(info.get(key) for key in self._ADDITIONAL_INFO_KEYS)
            )

        sql = """
            INSERT INTO hotel_additional_info
                (hotel_id, owner_ogrn, owner_inn, owner_kpp, owner_short_name,
                 owner_phone, owner_email, resort_full_name, owner_address_name,
                 owner_legal_type_id, phone, email)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                owner_ogrn = EXCLUDED.owner_ogrn,
                owner_inn = EXCLUDED.owner_inn,
                owner_kpp = EXCLUDED.owner_kpp,
                owner_short_name = EXCLUDED.owner_short_name,
                owner_phone = EXCLUDED.owner_phone,
                owner_email = EXCLUDED.owner_email,
                resort_full_name = EXCLUDED.resort_full_name,
                owner_address_name = EXCLUDED.owner_address_name,
                owner_legal_type_id = EXCLUDED.owner_legal_type_id,
                phone = EXCLUDED.phone,
                email = EXCLUDED.email
        """
        self._write_rows(sql, records)

    def save_sanatorium_info(self, data_list: List[Dict]):
        """Upsert ``hotel_sanatorium`` rows from the sanatorium payloads.

        Only payloads that are dicts with a non-empty dict under
        ``sanatoriumInfo`` produce a row. Scalar columns come from that nested
        object; the four JSON columns come from the payload's top level.
        """
        if not data_list:
            return

        records = []
        for item in data_list:
            san = item.get('sanatorium')
            if not san or not isinstance(san, dict):
                continue
            info = san.get('sanatoriumInfo')
            if not info or not isinstance(info, dict):
                continue
            records.append(
                (item['hotel_id'],)
                + tuple(info.get(key) for key in self._SANATORIUM_INFO_KEYS)
                + tuple(Json(san.get(key)) for key in self._SANATORIUM_JSON_KEYS)
            )

        sql = """
            INSERT INTO hotel_sanatorium
                (hotel_id, oid, full_name, short_name, ogrn, inn, legal_address,
                 actual_address, phone, email, web_site, medical_license, farm_license,
                 terrenkur, resort_name, has_water_supply, has_heating, has_sewage,
                 has_air_conditioning, has_elevator, has_telephone, has_internet,
                 has_mobility_lift, has_gym, has_conference_room,
                 swimming_pool_info, plage_info, land_document_info, rooms_info)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                oid = EXCLUDED.oid,
                full_name = EXCLUDED.full_name,
                short_name = EXCLUDED.short_name,
                ogrn = EXCLUDED.ogrn,
                inn = EXCLUDED.inn,
                legal_address = EXCLUDED.legal_address,
                actual_address = EXCLUDED.actual_address,
                phone = EXCLUDED.phone,
                email = EXCLUDED.email,
                web_site = EXCLUDED.web_site,
                medical_license = EXCLUDED.medical_license,
                farm_license = EXCLUDED.farm_license,
                terrenkur = EXCLUDED.terrenkur,
                resort_name = EXCLUDED.resort_name,
                has_water_supply = EXCLUDED.has_water_supply,
                has_heating = EXCLUDED.has_heating,
                has_sewage = EXCLUDED.has_sewage,
                has_air_conditioning = EXCLUDED.has_air_conditioning,
                has_elevator = EXCLUDED.has_elevator,
                has_telephone = EXCLUDED.has_telephone,
                has_internet = EXCLUDED.has_internet,
                has_mobility_lift = EXCLUDED.has_mobility_lift,
                has_gym = EXCLUDED.has_gym,
                has_conference_room = EXCLUDED.has_conference_room,
                swimming_pool_info = EXCLUDED.swimming_pool_info,
                plage_info = EXCLUDED.plage_info,
                land_document_info = EXCLUDED.land_document_info,
                rooms_info = EXCLUDED.rooms_info
        """
        self._write_rows(sql, records)

    def save_services_and_rooms(self, data_list: List[Dict]):
        """Store services and room categories found in the ``drawer`` payloads.

        Services are inserted with ``ON CONFLICT (hotel_id, service_id) DO
        NOTHING``. Lists or objects that the API returns as JSON ``null`` are
        treated as empty instead of raising. Everything is committed once,
        after all hotels of the batch were processed.
        """
        if not data_list:
            return

        services_sql = """
            INSERT INTO hotel_services
                (hotel_id, service_category_id, service_category_name, service_id, service_name)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id, service_id) DO NOTHING
        """
        # NOTE(review): this insert has no conflict handling, so re-running the
        # scraper for the same hotel presumably duplicates its rows — confirm
        # against the hotel_rooms schema and add a key or a cleanup step.
        rooms_sql = """
            INSERT INTO hotel_rooms
                (hotel_id, room_category_id, room_category_name, apartment_count,
                 number_seats, equipment_list, family_room_count, disability_room_count)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """

        cur = self.conn.cursor()
        try:
            for item in data_list:
                drawer = item.get('drawer')
                if not drawer or not isinstance(drawer, dict):
                    continue
                hotel_id = item['hotel_id']

                # Services: flatten category -> service into one row each.
                services = []
                for group in drawer.get('hotelServiceInfoList') or []:
                    category_id = group.get('id')
                    category_name = group.get('name')
                    for service in group.get('servicesList') or []:
                        services.append((
                            hotel_id,
                            category_id,
                            category_name,
                            service.get('id'),
                            service.get('name'),
                        ))
                if services:
                    execute_batch(cur, services_sql, services, page_size=200)

                # Rooms: one row per entry of roomInfoList.
                rooms = []
                for room in drawer.get('roomInfoList') or []:
                    category = room.get('roomCategory') or {}
                    rooms.append((
                        hotel_id,
                        category.get('id'),
                        category.get('name'),
                        room.get('apartmentCount'),
                        room.get('numberSeats'),
                        Json(room.get('equipmentList', [])),
                        room.get('familyRoomCount'),
                        room.get('disabilityRoomCount'),
                    ))
                if rooms:
                    execute_batch(cur, rooms_sql, rooms, page_size=100)

            self.conn.commit()
        finally:
            cur.close()

    def save_raw_json(self, data_list: List[Dict]):
        """Upsert the four raw payloads of every item into ``hotel_raw_json`` as a backup."""
        if not data_list:
            return

        records = [
            (
                item['hotel_id'],
                Json(item.get('main')),
                Json(item.get('additional_info')),
                Json(item.get('sanatorium')),
                Json(item.get('drawer')),
            )
            for item in data_list
        ]

        sql = """
            INSERT INTO hotel_raw_json
                (hotel_id, main_data, additional_info, sanatorium_data, drawer_data)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                main_data = EXCLUDED.main_data,
                additional_info = EXCLUDED.additional_info,
                sanatorium_data = EXCLUDED.sanatorium_data,
                drawer_data = EXCLUDED.drawer_data
        """
        self._write_rows(sql, records)

    def _log_progress(self, total):
        """Log throughput, ETA and error count for the current run."""
        elapsed = (datetime.now() - self.start_time).total_seconds()
        rate = self.processed_count / elapsed if elapsed > 0 else 0.0
        remaining = (total - self.processed_count) / rate / 60 if rate > 0 else 0.0
        percent = self.processed_count / total * 100 if total else 0.0
        logger.info(
            f"Progress: {self.processed_count}/{total} ({percent:.1f}%) | "
            f"Speed: {rate:.1f} hotels/sec | "
            f"ETA: {remaining:.1f} min | "
            f"Errors: {self.error_count}"
        )

    def run(self, limit=None, offset=0):
        """Fetch and store details for the selected hotels.

        Args:
            limit: maximum number of hotels to process; falsy means all.
            offset: number of hotels (ordered by id) to skip first.

        Connects to the database, walks the hotel ids, saves every
        ``BATCH_SIZE`` hotels and logs progress every ``CHECKPOINT_INTERVAL``
        hotels. On Ctrl+C the pending batch is saved before exiting. The
        connection is always closed and a summary is always logged.
        """
        self.start_time = datetime.now()
        self.connect_db()

        total = 0
        batch = []
        try:
            # Inside the try block so the connection is closed even when the
            # id query itself fails.
            hotel_ids = self.get_hotel_ids(limit=limit, offset=offset)
            total = len(hotel_ids)

            logger.info("=" * 70)
            logger.info("Запуск сбора детальной информации")
            logger.info(f"Отелей к обработке: {total}")
            logger.info(f"Начало: {self.start_time}")
            logger.info("=" * 70)

            for hotel_id in hotel_ids:
                try:
                    batch.append(self.get_detailed_info(hotel_id))
                    self.processed_count += 1

                    # Flush a full batch.
                    if len(batch) >= BATCH_SIZE:
                        self.save_batch(batch)
                        batch = []

                    # Periodic progress report.
                    if self.processed_count % CHECKPOINT_INTERVAL == 0:
                        self._log_progress(total)
                except Exception as e:
                    logger.error(f"Error processing hotel {hotel_id}: {e}")
                    self.error_count += 1

            # Flush the final, partially filled batch.
            if batch:
                self.save_batch(batch)
        except KeyboardInterrupt:
            logger.info("\n⚠ Прервано пользователем")
            if batch:
                self.save_batch(batch)
        finally:
            if self.conn:
                self.conn.close()

            elapsed = (datetime.now() - self.start_time).total_seconds()
            speed = self.processed_count / elapsed if elapsed > 0 else 0.0
            logger.info("=" * 70)
            logger.info("Сбор детальной информации завершен")
            logger.info(f"Обработано: {self.processed_count}/{total} отелей")
            logger.info(f"Ошибок: {self.error_count}")
            logger.info(f"Время: {elapsed/60:.1f} минут")
            logger.info(f"Скорость: {speed:.1f} отелей/сек")
            logger.info("=" * 70)

    def save_batch(self, batch):
        """Persist one batch through all ``save_*`` steps, never raising on DB errors.

        Each step commits on its own, so steps that finished before a failure
        stay stored. On failure the whole batch is counted in ``error_count``
        and the open transaction is rolled back: without the rollback
        PostgreSQL keeps the connection in an aborted transaction and rejects
        every statement of all following batches.
        """
        logger.debug(f"Сохраняю батч из {len(batch)} отелей...")
        try:
            self.save_main_updates(batch)
            self.save_additional_info(batch)
            self.save_sanatorium_info(batch)
            self.save_services_and_rooms(batch)
            self.save_raw_json(batch)
        except Exception as e:
            logger.error(f"Ошибка сохранения батча: {e}")
            self.error_count += len(batch)
            try:
                self.conn.rollback()
            except Exception as rollback_error:
                # The connection itself is probably gone; nothing more to do here.
                logger.error(f"Rollback failed: {rollback_error}")
if __name__ == "__main__":
    import sys

    # Optional positional CLI arguments: [limit] [offset].
    cli_args = sys.argv[1:]
    limit = int(cli_args[0]) if len(cli_args) >= 1 else None
    offset = int(cli_args[1]) if len(cli_args) >= 2 else 0

    logger.info(f"Параметры: limit={limit or 'all'}, offset={offset}")

    scraper = DetailedScraper()
    scraper.run(limit=limit, offset=offset)