Files
hotels/scraper_detailed.py
Фёдор 0cf3297290 Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py
- Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py
- РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py
- Отчёты: create_orel_horizontal_report.py
- Обработка: process_all_hotels_embeddings.py
- Документация: README.md, DB_SCHEMA_REFERENCE.md
2025-10-16 10:52:09 +03:00

482 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Сбор детальной информации по всем отелям
Собирает данные из 4 endpoint'ов для каждого отеля
"""
import json
import logging
import os
import time
from datetime import datetime
from typing import Optional, Dict, List
from urllib.parse import unquote

import psycopg2
import requests
from psycopg2.extras import execute_batch, Json
# Logging setup: each run writes to its own timestamped log file and also to
# the default StreamHandler stream.
_log_handlers = [
    logging.FileHandler(f'scraper_detailed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=_log_handlers,
)
# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)
# Database connection parameters.
# Each value can be overridden through a HOTEL_DB_* environment variable; the
# literals below are kept only as defaults so existing runs behave the same.
# NOTE(review): a real password is committed here (URL-encoded). It should be
# rotated and supplied via HOTEL_DB_PASSWORD instead of living in source.
DB_CONFIG = {
    'host': os.environ.get('HOTEL_DB_HOST') or "147.45.189.234",
    'port': int(os.environ.get('HOTEL_DB_PORT') or 5432),
    'database': os.environ.get('HOTEL_DB_NAME') or "default_db",
    'user': os.environ.get('HOTEL_DB_USER') or "gen_user",
    # The environment value is used verbatim; only the built-in default is
    # stored URL-encoded and therefore needs unquote().
    'password': os.environ.get('HOTEL_DB_PASSWORD') or unquote("2~~9_%5EkVsU%3F2%5CS"),
}

# Base URL that all per-hotel endpoint paths are appended to.
API_BASE_URL = "https://tourism.fsa.gov.ru/api/v1"
# Pause before every API request, in seconds (0.1 s => at most ~10 requests/s).
RATE_LIMIT_DELAY = 0.1
# Number of hotels accumulated before a write to the database; also used as the
# execute_batch page size for most tables.
BATCH_SIZE = 100
# Progress statistics are logged every CHECKPOINT_INTERVAL processed hotels.
CHECKPOINT_INTERVAL = 1000
class DetailedScraper:
    """Fetch detailed per-hotel data from the API and store it in PostgreSQL.

    For every hotel id found in ``hotel_main`` four endpoints are queried
    (main, additional-info, sanatoriumDrawer, drawer). Results are collected
    into batches and written to ``hotel_main``, ``hotel_additional_info``,
    ``hotel_sanatorium``, ``hotel_services``, ``hotel_rooms`` and
    ``hotel_raw_json``.
    """

    def __init__(self):
        """Create the HTTP session and reset counters; no DB connection yet."""
        self.conn = None  # opened by connect_db()
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; HotelDataCollector/1.0)'
        })
        self.processed_count = 0  # hotels whose details were fetched
        self.error_count = 0      # hotels that failed to process or to save
        self.start_time = None    # set at the beginning of run()

    def connect_db(self):
        """Open the database connection described by DB_CONFIG."""
        self.conn = psycopg2.connect(**DB_CONFIG)
        logger.info("✓ Подключено к базе данных")

    def api_request(self, url: str) -> Optional[Dict]:
        """GET ``url`` and return the decoded JSON body, or None on failure.

        Sleeps RATE_LIMIT_DELAY seconds before each request. Network errors,
        non-2xx responses and undecodable bodies are logged at DEBUG level
        and reported as None; any other exception propagates to the caller.
        """
        time.sleep(RATE_LIMIT_DELAY)
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding errors raised by response.json().
            logger.debug("API request failed: %s - %s", url, e)
            return None

    def get_hotel_ids(self, limit=None, offset=0):
        """Return hotel ids from ``hotel_main`` ordered by id.

        Args:
            limit: maximum number of ids to return; None or 0 means no limit.
            offset: number of leading ids to skip. It is applied even when no
                limit is given (it used to be silently ignored in that case).

        Returns:
            List of ids (first column of each fetched row).
        """
        sql = "SELECT id FROM hotel_main ORDER BY id"
        params = []
        # Values are bound as query parameters rather than formatted into the SQL.
        if limit:
            sql += " LIMIT %s"
            params.append(limit)
        if offset:
            sql += " OFFSET %s"
            params.append(offset)
        with self.conn.cursor() as cur:
            cur.execute(sql, tuple(params) or None)
            return [row[0] for row in cur.fetchall()]

    def get_detailed_info(self, hotel_id: str) -> Dict:
        """Query the four detail endpoints for one hotel.

        Returns:
            Dict with keys 'hotel_id', 'main', 'additional_info', 'sanatorium'
            and 'drawer'; each payload is whatever api_request() returned
            (None when that request failed).
        """
        endpoints = {
            'main': f"{API_BASE_URL}/resorts/hotels/{hotel_id}/main",
            'additional_info': f"{API_BASE_URL}/resorts/common/{hotel_id}/additional-info",
            'sanatorium': f"{API_BASE_URL}/resorts/hotels/{hotel_id}/sanatoriumDrawer",
            'drawer': f"{API_BASE_URL}/resorts/hotels/{hotel_id}/drawer",
        }
        result = {'hotel_id': hotel_id}
        # Requests are issued sequentially, in the order listed above.
        for key, url in endpoints.items():
            result[key] = self.api_request(url)
        return result

    def save_main_updates(self, data_list: List[Dict]):
        """Update contact/owner columns of ``hotel_main`` from the 'main' payloads.

        Items whose 'main' payload is missing or is not a dict are skipped.
        """
        if not data_list:
            return
        updates = []
        for item in data_list:
            main = item.get('main')
            if not main or not isinstance(main, dict):
                continue
            updates.append((
                main.get('shortName'),
                main.get('phone'),
                main.get('email'),
                main.get('websiteAddress'),
                main.get('ownerFullName'),
                item['hotel_id'],
            ))
        # The context manager closes the cursor even if the batch fails.
        with self.conn.cursor() as cur:
            if updates:
                sql = """
                    UPDATE hotel_main SET
                        short_name = %s,
                        phone = %s,
                        email = %s,
                        website_address = %s,
                        owner_full_name = %s,
                        updated_at = CURRENT_TIMESTAMP
                    WHERE id = %s
                """
                execute_batch(cur, sql, updates, page_size=BATCH_SIZE)
            self.conn.commit()

    def save_additional_info(self, data_list: List[Dict]):
        """Upsert 'additional_info' payloads into ``hotel_additional_info``.

        Items whose payload is missing or is not a dict are skipped.
        """
        if not data_list:
            return
        records = []
        for item in data_list:
            info = item.get('additional_info')
            if not info or not isinstance(info, dict):
                continue
            records.append((
                item['hotel_id'],
                info.get('ownerOgrn'),
                info.get('ownerInn'),
                info.get('ownerKpp'),
                info.get('ownerShortName'),
                info.get('ownerPhone'),
                info.get('ownerEmail'),
                info.get('resortFullName'),
                info.get('ownerAddressName'),
                info.get('ownerLegalTypeId'),
                info.get('phone'),
                info.get('email'),
            ))
        with self.conn.cursor() as cur:
            if records:
                sql = """
                    INSERT INTO hotel_additional_info
                        (hotel_id, owner_ogrn, owner_inn, owner_kpp, owner_short_name,
                         owner_phone, owner_email, resort_full_name, owner_address_name,
                         owner_legal_type_id, phone, email)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        owner_ogrn = EXCLUDED.owner_ogrn,
                        owner_inn = EXCLUDED.owner_inn,
                        owner_kpp = EXCLUDED.owner_kpp,
                        owner_short_name = EXCLUDED.owner_short_name,
                        owner_phone = EXCLUDED.owner_phone,
                        owner_email = EXCLUDED.owner_email,
                        resort_full_name = EXCLUDED.resort_full_name,
                        owner_address_name = EXCLUDED.owner_address_name,
                        owner_legal_type_id = EXCLUDED.owner_legal_type_id,
                        phone = EXCLUDED.phone,
                        email = EXCLUDED.email
                """
                execute_batch(cur, sql, records, page_size=BATCH_SIZE)
            self.conn.commit()

    def save_sanatorium_info(self, data_list: List[Dict]):
        """Upsert 'sanatorium' payloads into ``hotel_sanatorium``.

        Only payloads that are dicts with a non-empty dict under
        'sanatoriumInfo' are stored. Scalar columns come from that nested
        dict; the four *_info JSON columns come from the top-level payload.
        """
        if not data_list:
            return
        records = []
        for item in data_list:
            san = item.get('sanatorium')
            if not isinstance(san, dict):
                continue
            info = san.get('sanatoriumInfo')
            if not info or not isinstance(info, dict):
                continue
            records.append((
                item['hotel_id'],
                info.get('oid'),
                info.get('fullName'),
                info.get('shortName'),
                info.get('ogrn'),
                info.get('inn'),
                info.get('legalAddress'),
                info.get('actualAddress'),
                info.get('phone'),
                info.get('email'),
                info.get('webSite'),
                info.get('medicalLicense'),
                info.get('farmLicense'),
                info.get('terrenkur'),
                info.get('resortName'),
                info.get('hasWaterSupply'),
                info.get('hasHeating'),
                info.get('hasSewage'),
                info.get('hasAirConditioning'),
                info.get('hasElevator'),
                info.get('hasTelephone'),
                info.get('hasInternet'),
                info.get('hasMobilityLift'),
                info.get('hasGym'),
                info.get('hasConferenceRoom'),
                Json(san.get('swimmingPoolInfo')),
                Json(san.get('plageInfo')),
                Json(san.get('landDocumentInfo')),
                Json(san.get('roomsInfo')),
            ))
        with self.conn.cursor() as cur:
            if records:
                sql = """
                    INSERT INTO hotel_sanatorium
                        (hotel_id, oid, full_name, short_name, ogrn, inn, legal_address,
                         actual_address, phone, email, web_site, medical_license, farm_license,
                         terrenkur, resort_name, has_water_supply, has_heating, has_sewage,
                         has_air_conditioning, has_elevator, has_telephone, has_internet,
                         has_mobility_lift, has_gym, has_conference_room,
                         swimming_pool_info, plage_info, land_document_info, rooms_info)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                            %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        oid = EXCLUDED.oid,
                        full_name = EXCLUDED.full_name,
                        short_name = EXCLUDED.short_name,
                        ogrn = EXCLUDED.ogrn,
                        inn = EXCLUDED.inn,
                        legal_address = EXCLUDED.legal_address,
                        actual_address = EXCLUDED.actual_address,
                        phone = EXCLUDED.phone,
                        email = EXCLUDED.email,
                        web_site = EXCLUDED.web_site,
                        medical_license = EXCLUDED.medical_license,
                        farm_license = EXCLUDED.farm_license,
                        terrenkur = EXCLUDED.terrenkur,
                        resort_name = EXCLUDED.resort_name,
                        has_water_supply = EXCLUDED.has_water_supply,
                        has_heating = EXCLUDED.has_heating,
                        has_sewage = EXCLUDED.has_sewage,
                        has_air_conditioning = EXCLUDED.has_air_conditioning,
                        has_elevator = EXCLUDED.has_elevator,
                        has_telephone = EXCLUDED.has_telephone,
                        has_internet = EXCLUDED.has_internet,
                        has_mobility_lift = EXCLUDED.has_mobility_lift,
                        has_gym = EXCLUDED.has_gym,
                        has_conference_room = EXCLUDED.has_conference_room,
                        swimming_pool_info = EXCLUDED.swimming_pool_info,
                        plage_info = EXCLUDED.plage_info,
                        land_document_info = EXCLUDED.land_document_info,
                        rooms_info = EXCLUDED.rooms_info
                """
                execute_batch(cur, sql, records, page_size=BATCH_SIZE)
            self.conn.commit()

    def save_services_and_rooms(self, data_list: List[Dict]):
        """Store services and room categories taken from the 'drawer' payloads.

        Services go to ``hotel_services`` (existing (hotel_id, service_id)
        pairs are left untouched); rooms go to ``hotel_rooms``. Keys that are
        absent or explicitly null in the payload are treated as empty.
        """
        if not data_list:
            return
        with self.conn.cursor() as cur:
            for item in data_list:
                drawer = item.get('drawer')
                if not drawer or not isinstance(drawer, dict):
                    continue
                hotel_id = item['hotel_id']

                # Services: flatten category -> services into one row per service.
                # `or []` guards against keys present with a JSON null value.
                services = []
                for service_group in drawer.get('hotelServiceInfoList') or []:
                    cat_id = service_group.get('id')
                    cat_name = service_group.get('name')
                    for service in service_group.get('servicesList') or []:
                        services.append((
                            hotel_id,
                            cat_id,
                            cat_name,
                            service.get('id'),
                            service.get('name'),
                        ))
                if services:
                    sql = """
                        INSERT INTO hotel_services
                            (hotel_id, service_category_id, service_category_name, service_id, service_name)
                        VALUES (%s, %s, %s, %s, %s)
                        ON CONFLICT (hotel_id, service_id) DO NOTHING
                    """
                    execute_batch(cur, sql, services, page_size=200)

                # Rooms: one row per entry of roomInfoList.
                rooms = []
                for room in drawer.get('roomInfoList') or []:
                    category = room.get('roomCategory') or {}
                    rooms.append((
                        hotel_id,
                        category.get('id'),
                        category.get('name'),
                        room.get('apartmentCount'),
                        room.get('numberSeats'),
                        Json(room.get('equipmentList', [])),
                        room.get('familyRoomCount'),
                        room.get('disabilityRoomCount'),
                    ))
                if rooms:
                    # NOTE(review): unlike the other tables this INSERT has no
                    # ON CONFLICT clause, so re-running over the same hotels
                    # will likely append duplicate rows unless the table has a
                    # constraint not visible here - confirm against the schema.
                    sql = """
                        INSERT INTO hotel_rooms
                            (hotel_id, room_category_id, room_category_name, apartment_count,
                             number_seats, equipment_list, family_room_count, disability_room_count)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                    """
                    execute_batch(cur, sql, rooms, page_size=100)
            self.conn.commit()

    def save_raw_json(self, data_list: List[Dict]):
        """Upsert the four raw payloads of every item into ``hotel_raw_json``."""
        if not data_list:
            return
        # NOTE(review): a failed request yields None, so this upsert replaces a
        # previously stored payload with an empty value - confirm that is intended.
        records = [
            (
                item['hotel_id'],
                Json(item.get('main')),
                Json(item.get('additional_info')),
                Json(item.get('sanatorium')),
                Json(item.get('drawer')),
            )
            for item in data_list
        ]
        sql = """
            INSERT INTO hotel_raw_json
                (hotel_id, main_data, additional_info, sanatorium_data, drawer_data)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                main_data = EXCLUDED.main_data,
                additional_info = EXCLUDED.additional_info,
                sanatorium_data = EXCLUDED.sanatorium_data,
                drawer_data = EXCLUDED.drawer_data
        """
        with self.conn.cursor() as cur:
            execute_batch(cur, sql, records, page_size=BATCH_SIZE)
            self.conn.commit()

    def _log_progress(self, total):
        """Log throughput, ETA and error count for the current run."""
        elapsed = (datetime.now() - self.start_time).total_seconds()
        # Guard both divisions so a zero elapsed time cannot raise.
        rate = self.processed_count / elapsed if elapsed > 0 else 0.0
        remaining = (total - self.processed_count) / rate / 60 if rate > 0 else 0.0
        logger.info(
            f"Progress: {self.processed_count}/{total} ({self.processed_count/total*100:.1f}%) | "
            f"Speed: {rate:.1f} hotels/sec | "
            f"ETA: {remaining:.1f} min | "
            f"Errors: {self.error_count}"
        )

    def run(self, limit=None, offset=0):
        """Fetch and store details for the selected hotels.

        Args:
            limit: maximum number of hotels to process (None/0 = all).
            offset: number of hotels, in id order, to skip first.

        A failure on a single hotel is logged and counted, and the loop
        continues. On Ctrl+C the pending batch is saved before exiting. The
        database connection and HTTP session are always closed and a summary
        is logged.
        """
        self.start_time = datetime.now()
        self.connect_db()
        total = 0
        batch = []
        try:
            # Fetched inside the try so the connection is closed if this fails.
            hotel_ids = self.get_hotel_ids(limit=limit, offset=offset)
            total = len(hotel_ids)
            logger.info("=" * 70)
            logger.info("Запуск сбора детальной информации")
            logger.info(f"Отелей к обработке: {total}")
            logger.info(f"Начало: {self.start_time}")
            logger.info("=" * 70)

            for hotel_id in hotel_ids:
                try:
                    batch.append(self.get_detailed_info(hotel_id))
                    self.processed_count += 1
                    # Flush a full batch to the database.
                    if len(batch) >= BATCH_SIZE:
                        self.save_batch(batch)
                        batch = []
                    # Periodic progress statistics.
                    if self.processed_count % CHECKPOINT_INTERVAL == 0:
                        self._log_progress(total)
                except Exception as e:
                    logger.error(f"Error processing hotel {hotel_id}: {e}")
                    self.error_count += 1

            # Save whatever is left after the last full batch.
            if batch:
                self.save_batch(batch)
        except KeyboardInterrupt:
            logger.info("\n⚠ Прервано пользователем")
            if batch:
                self.save_batch(batch)
        finally:
            if self.conn:
                self.conn.close()
            self.session.close()
            elapsed = (datetime.now() - self.start_time).total_seconds()
            speed = self.processed_count / elapsed if elapsed > 0 else 0.0
            logger.info("=" * 70)
            logger.info("Сбор детальной информации завершен")
            logger.info(f"Обработано: {self.processed_count}/{total} отелей")
            logger.info(f"Ошибок: {self.error_count}")
            logger.info(f"Время: {elapsed/60:.1f} минут")
            logger.info(f"Скорость: {speed:.1f} отелей/сек")
            logger.info("=" * 70)

    def save_batch(self, batch):
        """Persist one batch through every save_* step.

        Each step commits on its own, so steps completed before a failure
        stay committed. On failure the error is logged, the whole batch is
        counted as errors and the open transaction is rolled back so that
        later batches can still be written on the same connection.
        """
        logger.debug(f"Сохраняю батч из {len(batch)} отелей...")
        try:
            self.save_main_updates(batch)
            self.save_additional_info(batch)
            self.save_sanatorium_info(batch)
            self.save_services_and_rooms(batch)
            self.save_raw_json(batch)
        except Exception as e:
            logger.error(f"Ошибка сохранения батча: {e}")
            self.error_count += len(batch)
            # Without a rollback the connection stays in an aborted
            # transaction and every following statement is rejected.
            try:
                self.conn.rollback()
            except psycopg2.Error as rollback_error:
                logger.error("Rollback failed: %s", rollback_error)
if __name__ == "__main__":
    import sys

    # Positional command-line arguments: [limit] [offset].
    cli_args = sys.argv[1:]
    limit = int(cli_args[0]) if len(cli_args) >= 1 else None
    offset = int(cli_args[1]) if len(cli_args) >= 2 else 0
    logger.info(f"Параметры: limit={limit or 'all'}, offset={offset}")

    scraper = DetailedScraper()
    scraper.run(limit=limit, offset=offset)