- Краулеры: smart_crawler.py, regional_crawler.py - Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py - РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py - Отчёты: create_orel_horizontal_report.py - Обработка: process_all_hotels_embeddings.py - Документация: README.md, DB_SCHEMA_REFERENCE.md
482 lines
18 KiB
Python
482 lines
18 KiB
Python
#!/usr/bin/env python3
"""
Collect detailed information for every hotel.

Gathers data from 4 API endpoints for each hotel.
"""
import json
import logging
import os
import time
from datetime import datetime
from typing import Optional, Dict, List
from urllib.parse import unquote

import psycopg2
import requests
from psycopg2.extras import execute_batch, Json
# Logging setup: every run writes to its own timestamped log file and to the
# console. The file handler is opened as UTF-8 explicitly because the log
# messages contain Cyrillic text and symbols such as "✓"/"⚠" that the platform
# default encoding may not be able to represent.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(
            f'scraper_detailed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            encoding='utf-8',
        ),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
# Database connection parameters.
# Each value can be overridden through an environment variable so credentials
# do not have to live in the source tree.
# NOTE(review): the literal fallbacks (including the password) are kept only
# for backward compatibility — they should be removed from the repository and
# the password rotated.
DB_CONFIG = {
    'host': os.environ.get('HOTEL_DB_HOST', "147.45.189.234"),
    'port': int(os.environ.get('HOTEL_DB_PORT', 5432)),
    'database': os.environ.get('HOTEL_DB_NAME', "default_db"),
    'user': os.environ.get('HOTEL_DB_USER', "gen_user"),
    # The fallback is stored URL-encoded and decoded here; an override taken
    # from the environment is used verbatim.
    'password': os.environ.get('HOTEL_DB_PASSWORD', unquote("2~~9_%5EkVsU%3F2%5CS")),
}

API_BASE_URL = "https://tourism.fsa.gov.ru/api/v1"
RATE_LIMIT_DELAY = 0.1  # pause before every request: at most 10 requests per second
BATCH_SIZE = 100  # hotels accumulated before a database write; also the execute_batch page size
CHECKPOINT_INTERVAL = 1000  # a progress line is logged every this many processed hotels
class DetailedScraper:
    """Fetch per-hotel details from the API and persist them to PostgreSQL.

    For every hotel id found in ``hotel_main`` four endpoints are queried
    (main, additional-info, sanatoriumDrawer, drawer). Results are buffered
    and written in batches of ``BATCH_SIZE`` hotels.

    Attributes:
        conn: psycopg2 connection, ``None`` until ``connect_db`` is called.
        session: shared ``requests.Session`` used for all API calls.
        processed_count: number of hotels fetched so far.
        error_count: number of hotels that failed to fetch or to save.
        start_time: ``datetime`` at which ``run`` started.
    """

    # API keys of the additional-info payload, in hotel_additional_info column order.
    _ADDITIONAL_INFO_KEYS = (
        'ownerOgrn', 'ownerInn', 'ownerKpp', 'ownerShortName', 'ownerPhone',
        'ownerEmail', 'resortFullName', 'ownerAddressName', 'ownerLegalTypeId',
        'phone', 'email',
    )

    # Scalar keys of ``sanatoriumInfo``, in hotel_sanatorium column order.
    _SANATORIUM_INFO_KEYS = (
        'oid', 'fullName', 'shortName', 'ogrn', 'inn', 'legalAddress',
        'actualAddress', 'phone', 'email', 'webSite', 'medicalLicense',
        'farmLicense', 'terrenkur', 'resortName', 'hasWaterSupply',
        'hasHeating', 'hasSewage', 'hasAirConditioning', 'hasElevator',
        'hasTelephone', 'hasInternet', 'hasMobilityLift', 'hasGym',
        'hasConferenceRoom',
    )

    # Top-level keys of the sanatorium payload that are stored as JSON columns.
    _SANATORIUM_JSON_KEYS = (
        'swimmingPoolInfo', 'plageInfo', 'landDocumentInfo', 'roomsInfo',
    )

    def __init__(self):
        self.conn = None
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; HotelDataCollector/1.0)'
        })
        self.processed_count = 0
        self.error_count = 0
        self.start_time = None

    def connect_db(self):
        """Open the database connection described by ``DB_CONFIG``."""
        self.conn = psycopg2.connect(**DB_CONFIG)
        logger.info("✓ Подключено к базе данных")

    def api_request(self, url: str) -> Optional[Dict]:
        """GET ``url`` and return the decoded JSON body, or ``None`` on failure.

        Sleeps ``RATE_LIMIT_DELAY`` seconds before every call to throttle the
        request rate. Transport errors, HTTP error statuses and undecodable
        bodies are logged at debug level and reported as ``None``.
        """
        time.sleep(RATE_LIMIT_DELAY)

        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            logger.debug(f"API request failed: {url} - {e}")
            return None

    def get_hotel_ids(self, limit=None, offset=0):
        """Return hotel ids from ``hotel_main`` ordered by id.

        Args:
            limit: maximum number of ids to return; a falsy value means all.
            offset: number of leading ids to skip (applied with or without
                ``limit``).
        """
        sql = "SELECT id FROM hotel_main ORDER BY id"
        params = []
        # LIMIT/OFFSET are bound as parameters instead of being formatted
        # into the statement text.
        if limit:
            sql += " LIMIT %s"
            params.append(int(limit))
        if offset:
            sql += " OFFSET %s"
            params.append(int(offset))

        with self.conn.cursor() as cur:
            cur.execute(sql, params or None)
            return [row[0] for row in cur.fetchall()]

    def get_detailed_info(self, hotel_id: str) -> Dict:
        """Query the four detail endpoints for one hotel.

        Returns a dict with keys ``hotel_id``, ``main``, ``additional_info``,
        ``sanatorium`` and ``drawer``; an endpoint that failed maps to ``None``.
        """
        endpoints = (
            ('main', f"{API_BASE_URL}/resorts/hotels/{hotel_id}/main"),
            ('additional_info', f"{API_BASE_URL}/resorts/common/{hotel_id}/additional-info"),
            ('sanatorium', f"{API_BASE_URL}/resorts/hotels/{hotel_id}/sanatoriumDrawer"),
            ('drawer', f"{API_BASE_URL}/resorts/hotels/{hotel_id}/drawer"),  # services and rooms
        )

        result = {'hotel_id': hotel_id}
        for key, url in endpoints:
            result[key] = self.api_request(url)
        return result

    def save_main_updates(self, data_list: List[Dict]):
        """Update contact/owner columns of ``hotel_main`` from the ``main`` payloads.

        Items whose ``main`` payload is missing or not a dict are skipped.
        Commits on success; exceptions propagate to the caller.
        """
        if not data_list:
            return

        updates = []
        for item in data_list:
            main = item.get('main')
            if not main or not isinstance(main, dict):
                continue
            updates.append((
                main.get('shortName'),
                main.get('phone'),
                main.get('email'),
                main.get('websiteAddress'),
                main.get('ownerFullName'),
                item['hotel_id'],
            ))

        if not updates:
            return

        sql = """
            UPDATE hotel_main SET
                short_name = %s,
                phone = %s,
                email = %s,
                website_address = %s,
                owner_full_name = %s,
                updated_at = CURRENT_TIMESTAMP
            WHERE id = %s
        """
        with self.conn.cursor() as cur:
            execute_batch(cur, sql, updates, page_size=BATCH_SIZE)
        self.conn.commit()

    def save_additional_info(self, data_list: List[Dict]):
        """Upsert ``hotel_additional_info`` rows from the ``additional_info`` payloads.

        Items whose payload is missing or not a dict are skipped.
        Commits on success; exceptions propagate to the caller.
        """
        if not data_list:
            return

        records = []
        for item in data_list:
            info = item.get('additional_info')
            if not info or not isinstance(info, dict):
                continue
            records.append(
                (item['hotel_id'],)
                + tuple(info.get(key) for key in self._ADDITIONAL_INFO_KEYS)
            )

        if not records:
            return

        sql = """
            INSERT INTO hotel_additional_info
            (hotel_id, owner_ogrn, owner_inn, owner_kpp, owner_short_name,
             owner_phone, owner_email, resort_full_name, owner_address_name,
             owner_legal_type_id, phone, email)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                owner_ogrn = EXCLUDED.owner_ogrn,
                owner_inn = EXCLUDED.owner_inn,
                owner_kpp = EXCLUDED.owner_kpp,
                owner_short_name = EXCLUDED.owner_short_name,
                owner_phone = EXCLUDED.owner_phone,
                owner_email = EXCLUDED.owner_email,
                resort_full_name = EXCLUDED.resort_full_name,
                owner_address_name = EXCLUDED.owner_address_name,
                owner_legal_type_id = EXCLUDED.owner_legal_type_id,
                phone = EXCLUDED.phone,
                email = EXCLUDED.email
        """
        with self.conn.cursor() as cur:
            execute_batch(cur, sql, records, page_size=BATCH_SIZE)
        self.conn.commit()

    def save_sanatorium_info(self, data_list: List[Dict]):
        """Upsert ``hotel_sanatorium`` rows from the ``sanatorium`` payloads.

        Only payloads that are dicts with a non-empty ``sanatoriumInfo`` dict
        are stored. Commits on success; exceptions propagate to the caller.
        """
        if not data_list:
            return

        records = []
        for item in data_list:
            san = item.get('sanatorium')
            if not isinstance(san, dict):
                continue
            info = san.get('sanatoriumInfo')
            if not info or not isinstance(info, dict):
                continue

            records.append(
                (item['hotel_id'],)
                + tuple(info.get(key) for key in self._SANATORIUM_INFO_KEYS)
                + tuple(Json(san.get(key)) for key in self._SANATORIUM_JSON_KEYS)
            )

        if not records:
            return

        sql = """
            INSERT INTO hotel_sanatorium
            (hotel_id, oid, full_name, short_name, ogrn, inn, legal_address,
             actual_address, phone, email, web_site, medical_license, farm_license,
             terrenkur, resort_name, has_water_supply, has_heating, has_sewage,
             has_air_conditioning, has_elevator, has_telephone, has_internet,
             has_mobility_lift, has_gym, has_conference_room,
             swimming_pool_info, plage_info, land_document_info, rooms_info)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                oid = EXCLUDED.oid,
                full_name = EXCLUDED.full_name,
                short_name = EXCLUDED.short_name,
                ogrn = EXCLUDED.ogrn,
                inn = EXCLUDED.inn,
                legal_address = EXCLUDED.legal_address,
                actual_address = EXCLUDED.actual_address,
                phone = EXCLUDED.phone,
                email = EXCLUDED.email,
                web_site = EXCLUDED.web_site,
                medical_license = EXCLUDED.medical_license,
                farm_license = EXCLUDED.farm_license,
                terrenkur = EXCLUDED.terrenkur,
                resort_name = EXCLUDED.resort_name,
                has_water_supply = EXCLUDED.has_water_supply,
                has_heating = EXCLUDED.has_heating,
                has_sewage = EXCLUDED.has_sewage,
                has_air_conditioning = EXCLUDED.has_air_conditioning,
                has_elevator = EXCLUDED.has_elevator,
                has_telephone = EXCLUDED.has_telephone,
                has_internet = EXCLUDED.has_internet,
                has_mobility_lift = EXCLUDED.has_mobility_lift,
                has_gym = EXCLUDED.has_gym,
                has_conference_room = EXCLUDED.has_conference_room,
                swimming_pool_info = EXCLUDED.swimming_pool_info,
                plage_info = EXCLUDED.plage_info,
                land_document_info = EXCLUDED.land_document_info,
                rooms_info = EXCLUDED.rooms_info
        """
        with self.conn.cursor() as cur:
            execute_batch(cur, sql, records, page_size=BATCH_SIZE)
        self.conn.commit()

    def save_services_and_rooms(self, data_list: List[Dict]):
        """Store services and room categories found in the ``drawer`` payloads.

        Services go to ``hotel_services`` (duplicates are ignored through
        ``ON CONFLICT DO NOTHING``) and rooms go to ``hotel_rooms``. Lists or
        nested objects that the API returns as JSON ``null`` are treated as
        empty. Commits once after all items; exceptions propagate.
        """
        if not data_list:
            return

        services_sql = """
            INSERT INTO hotel_services
            (hotel_id, service_category_id, service_category_name, service_id, service_name)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id, service_id) DO NOTHING
        """
        # NOTE(review): this insert has no conflict clause, so running the
        # scraper again for the same hotel appears to append the same rooms
        # again — confirm against the hotel_rooms schema.
        rooms_sql = """
            INSERT INTO hotel_rooms
            (hotel_id, room_category_id, room_category_name, apartment_count,
             number_seats, equipment_list, family_room_count, disability_room_count)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """

        with self.conn.cursor() as cur:
            for item in data_list:
                drawer = item.get('drawer')
                if not drawer or not isinstance(drawer, dict):
                    continue

                hotel_id = item['hotel_id']

                # Services: flatten category -> service into one row per service.
                # ``or []`` guards against keys that are present but null.
                services = []
                for service_group in drawer.get('hotelServiceInfoList') or []:
                    cat_id = service_group.get('id')
                    cat_name = service_group.get('name')
                    for service in service_group.get('servicesList') or []:
                        services.append((
                            hotel_id,
                            cat_id,
                            cat_name,
                            service.get('id'),
                            service.get('name'),
                        ))
                if services:
                    execute_batch(cur, services_sql, services, page_size=200)

                # Rooms: one row per room category entry.
                rooms = []
                for room in drawer.get('roomInfoList') or []:
                    category = room.get('roomCategory') or {}
                    rooms.append((
                        hotel_id,
                        category.get('id'),
                        category.get('name'),
                        room.get('apartmentCount'),
                        room.get('numberSeats'),
                        Json(room.get('equipmentList', [])),
                        room.get('familyRoomCount'),
                        room.get('disabilityRoomCount'),
                    ))
                if rooms:
                    execute_batch(cur, rooms_sql, rooms, page_size=100)

        self.conn.commit()

    def save_raw_json(self, data_list: List[Dict]):
        """Upsert the unprocessed API payloads into ``hotel_raw_json`` as a backup.

        Commits on success; exceptions propagate to the caller.
        """
        if not data_list:
            return

        # NOTE(review): an endpoint that failed is None here, so the upsert
        # replaces any previously stored payload with JSON null — confirm
        # this is intended.
        records = [
            (
                item['hotel_id'],
                Json(item.get('main')),
                Json(item.get('additional_info')),
                Json(item.get('sanatorium')),
                Json(item.get('drawer')),
            )
            for item in data_list
        ]

        sql = """
            INSERT INTO hotel_raw_json
            (hotel_id, main_data, additional_info, sanatorium_data, drawer_data)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                main_data = EXCLUDED.main_data,
                additional_info = EXCLUDED.additional_info,
                sanatorium_data = EXCLUDED.sanatorium_data,
                drawer_data = EXCLUDED.drawer_data
        """
        with self.conn.cursor() as cur:
            execute_batch(cur, sql, records, page_size=BATCH_SIZE)
        self.conn.commit()

    def _log_progress(self, total):
        """Log throughput, ETA and error count for the current position."""
        elapsed = (datetime.now() - self.start_time).total_seconds()
        rate = self.processed_count / elapsed if elapsed > 0 else 0.0
        remaining = (total - self.processed_count) / rate / 60 if rate > 0 else 0.0
        logger.info(
            f"Progress: {self.processed_count}/{total} ({self.processed_count/total*100:.1f}%) | "
            f"Speed: {rate:.1f} hotels/sec | "
            f"ETA: {remaining:.1f} min | "
            f"Errors: {self.error_count}"
        )

    def _log_summary(self, total):
        """Log the final statistics of a run."""
        elapsed = (datetime.now() - self.start_time).total_seconds()
        rate = self.processed_count / elapsed if elapsed > 0 else 0.0

        logger.info("=" * 70)
        logger.info("Сбор детальной информации завершен")
        logger.info(f"Обработано: {self.processed_count}/{total} отелей")
        logger.info(f"Ошибок: {self.error_count}")
        logger.info(f"Время: {elapsed/60:.1f} минут")
        logger.info(f"Скорость: {rate:.1f} отелей/сек")
        logger.info("=" * 70)

    def run(self, limit=None, offset=0):
        """Fetch and store details for the selected hotels.

        Args:
            limit: maximum number of hotels to process; falsy means all.
            offset: number of hotels (ordered by id) to skip.

        A failure on a single hotel is logged and counted, and processing
        continues. On Ctrl-C the hotels already buffered are saved before
        exiting. The connection is always closed at the end.
        """
        self.start_time = datetime.now()
        self.connect_db()

        batch = []
        total = 0

        try:
            # Fetched inside the try so the connection is closed even if
            # the id query itself fails.
            hotel_ids = self.get_hotel_ids(limit=limit, offset=offset)
            total = len(hotel_ids)

            logger.info("=" * 70)
            logger.info("Запуск сбора детальной информации")
            logger.info(f"Отелей к обработке: {total}")
            logger.info(f"Начало: {self.start_time}")
            logger.info("=" * 70)

            for hotel_id in hotel_ids:
                try:
                    batch.append(self.get_detailed_info(hotel_id))
                    self.processed_count += 1

                    # Flush a full batch to the database.
                    if len(batch) >= BATCH_SIZE:
                        self.save_batch(batch)
                        batch = []

                    # Periodic progress report.
                    if self.processed_count % CHECKPOINT_INTERVAL == 0:
                        self._log_progress(total)

                except Exception as e:
                    logger.error(f"Error processing hotel {hotel_id}: {e}")
                    self.error_count += 1

            # Flush whatever is left after the loop.
            if batch:
                self.save_batch(batch)

        except KeyboardInterrupt:
            logger.info("\n⚠ Прервано пользователем")
            if batch:
                self.save_batch(batch)

        finally:
            if self.conn:
                self.conn.close()

        self._log_summary(total)

    def save_batch(self, batch):
        """Persist one batch through all ``save_*`` steps.

        Any failure is logged, the whole batch is counted as errors and the
        open transaction is rolled back. Without the rollback the connection
        would stay in an aborted-transaction state and every later batch
        would fail as well. Steps that already committed stay committed.
        """
        logger.debug(f"Сохраняю батч из {len(batch)} отелей...")
        try:
            self.save_main_updates(batch)
            self.save_additional_info(batch)
            self.save_sanatorium_info(batch)
            self.save_services_and_rooms(batch)
            self.save_raw_json(batch)
        except Exception as e:
            logger.error(f"Ошибка сохранения батча: {e}")
            self.error_count += len(batch)
            try:
                self.conn.rollback()
            except psycopg2.Error as rollback_error:
                logger.error(f"Rollback after failed batch did not succeed: {rollback_error}")
if __name__ == "__main__":
    import sys

    # Optional positional arguments: [limit] [offset].
    # Bad input ends with a usage message instead of a traceback or an SQL
    # error further down.
    try:
        limit = int(sys.argv[1]) if len(sys.argv) > 1 else None
        offset = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    except ValueError:
        sys.exit(f"Usage: {sys.argv[0]} [limit] [offset] (both must be integers)")
    if (limit is not None and limit < 0) or offset < 0:
        sys.exit(f"Usage: {sys.argv[0]} [limit] [offset] (both must be non-negative)")

    logger.info(f"Параметры: limit={limit or 'all'}, offset={offset}")

    scraper = DetailedScraper()
    scraper.run(limit=limit, offset=offset)