Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py - Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py - РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py - Отчёты: create_orel_horizontal_report.py - Обработка: process_all_hotels_embeddings.py - Документация: README.md, DB_SCHEMA_REFERENCE.md
This commit is contained in:
481
scraper_detailed.py
Normal file
481
scraper_detailed.py
Normal file
@@ -0,0 +1,481 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Сбор детальной информации по всем отелям
|
||||
Собирает данные из 4 endpoint'ов для каждого отеля
|
||||
"""
|
||||
|
||||
import json
import logging
import os
import time
from datetime import datetime
from typing import Optional, Dict, List
from urllib.parse import unquote

import psycopg2
import requests
from psycopg2.extras import execute_batch, Json
|
||||
|
||||
# Logging setup: each run writes to its own timestamped log file and to the
# console (StreamHandler).
LOG_FILE = f'scraper_detailed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: the script logs Cyrillic text and symbols such as
        # "✓" / "⚠", which cannot be encoded when the platform's default
        # file encoding is not UTF-8.
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
||||
|
||||
# Database connection parameters.
#
# SECURITY NOTE(review): the fallback values below are the credentials that
# were hard-coded in this file. They are kept only so existing invocations
# keep working; set the HOTEL_DB_* environment variables instead and rotate
# the committed password.
DB_CONFIG = {
    'host': os.environ.get('HOTEL_DB_HOST', "147.45.189.234"),
    'port': int(os.environ.get('HOTEL_DB_PORT', 5432)),
    'database': os.environ.get('HOTEL_DB_NAME', "default_db"),
    'user': os.environ.get('HOTEL_DB_USER', "gen_user"),
    # The built-in default is stored URL-encoded, hence unquote(); a value
    # taken from the environment is used verbatim.
    'password': os.environ.get('HOTEL_DB_PASSWORD', unquote("2~~9_%5EkVsU%3F2%5CS")),
}

API_BASE_URL = "https://tourism.fsa.gov.ru/api/v1"
# Seconds slept before every API request, i.e. at most 10 requests per second.
RATE_LIMIT_DELAY = 0.1
# Number of hotels accumulated before a save; also used as execute_batch page size.
BATCH_SIZE = 100
# Number of processed hotels between progress log lines.
CHECKPOINT_INTERVAL = 1000
|
||||
|
||||
|
||||
class DetailedScraper:
    """Collect per-hotel details from the API and store them in PostgreSQL.

    For every hotel id read from ``hotel_main`` four endpoints are queried
    (main, additional-info, sanatoriumDrawer, drawer). The responses are
    accumulated and written in batches of ``BATCH_SIZE`` hotels to
    ``hotel_main``, ``hotel_additional_info``, ``hotel_sanatorium``,
    ``hotel_services``, ``hotel_rooms`` and ``hotel_raw_json``.
    """

    # JSON keys of the "main" payload, in the column order of the UPDATE below.
    _MAIN_KEYS = (
        'shortName', 'phone', 'email', 'websiteAddress', 'ownerFullName',
    )

    # JSON keys of the "additional-info" payload, in INSERT column order.
    _ADDITIONAL_INFO_KEYS = (
        'ownerOgrn', 'ownerInn', 'ownerKpp', 'ownerShortName', 'ownerPhone',
        'ownerEmail', 'resortFullName', 'ownerAddressName', 'ownerLegalTypeId',
        'phone', 'email',
    )

    # Scalar keys read from the nested "sanatoriumInfo" object, in INSERT order.
    _SANATORIUM_INFO_KEYS = (
        'oid', 'fullName', 'shortName', 'ogrn', 'inn', 'legalAddress',
        'actualAddress', 'phone', 'email', 'webSite', 'medicalLicense',
        'farmLicense', 'terrenkur', 'resortName', 'hasWaterSupply',
        'hasHeating', 'hasSewage', 'hasAirConditioning', 'hasElevator',
        'hasTelephone', 'hasInternet', 'hasMobilityLift', 'hasGym',
        'hasConferenceRoom',
    )

    # Top-level keys of the sanatorium payload that are stored as JSON values.
    _SANATORIUM_JSON_KEYS = (
        'swimmingPoolInfo', 'plageInfo', 'landDocumentInfo', 'roomsInfo',
    )

    def __init__(self):
        """Create the HTTP session and reset counters; the DB is not opened yet."""
        self.conn = None
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; HotelDataCollector/1.0)'
        })
        self.processed_count = 0
        self.error_count = 0
        self.start_time = None

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _as_dict(value):
        """Return *value* if it is a non-empty dict, otherwise ``None``."""
        return value if isinstance(value, dict) and value else None

    @staticmethod
    def _pick(payload, keys):
        """Return ``payload.get(key)`` for every key, as a tuple in key order."""
        return tuple(payload.get(key) for key in keys)

    @staticmethod
    def _service_rows(hotel_id, drawer):
        """Flatten ``hotelServiceInfoList`` into one row per service."""
        rows = []
        # ``or []`` tolerates a key that is present but null in the JSON.
        for group in drawer.get('hotelServiceInfoList') or []:
            group_id = group.get('id')
            group_name = group.get('name')
            for service in group.get('servicesList') or []:
                rows.append((
                    hotel_id,
                    group_id,
                    group_name,
                    service.get('id'),
                    service.get('name'),
                ))
        return rows

    @staticmethod
    def _room_rows(hotel_id, drawer):
        """Build one ``hotel_rooms`` row per entry of ``roomInfoList``."""
        rows = []
        for room in drawer.get('roomInfoList') or []:
            # roomCategory may be missing or null; treat both as "no category".
            category = room.get('roomCategory') or {}
            rows.append((
                hotel_id,
                category.get('id'),
                category.get('name'),
                room.get('apartmentCount'),
                room.get('numberSeats'),
                Json(room.get('equipmentList', [])),
                room.get('familyRoomCount'),
                room.get('disabilityRoomCount'),
            ))
        return rows

    def _rollback_quietly(self):
        """Roll back the open transaction, logging (not raising) any failure."""
        if not self.conn:
            return
        try:
            self.conn.rollback()
        except Exception as rollback_error:
            logger.error(f"Rollback failed: {rollback_error}")

    def _log_progress(self, total):
        """Log processed count, speed, ETA and error count."""
        elapsed = (datetime.now() - self.start_time).total_seconds()
        done = self.processed_count
        # Guards keep a zero elapsed time / zero total from raising.
        rate = done / elapsed if elapsed > 0 else 0.0
        remaining = (total - done) / rate / 60 if rate > 0 else 0.0
        percent = done / total * 100 if total else 0.0
        logger.info(
            f"Progress: {done}/{total} ({percent:.1f}%) | "
            f"Speed: {rate:.1f} hotels/sec | "
            f"ETA: {remaining:.1f} min | "
            f"Errors: {self.error_count}"
        )

    # ------------------------------------------------------------------
    # I/O
    # ------------------------------------------------------------------

    def connect_db(self):
        """Open the database connection described by ``DB_CONFIG``."""
        self.conn = psycopg2.connect(**DB_CONFIG)
        logger.info("✓ Подключено к базе данных")

    def api_request(self, url: str) -> Optional[Dict]:
        """GET *url* and return the decoded JSON body, or ``None`` on any failure.

        Sleeps ``RATE_LIMIT_DELAY`` seconds before every request. Failures
        (network errors, non-2xx status, invalid JSON) are logged at DEBUG
        level only.
        """
        time.sleep(RATE_LIMIT_DELAY)

        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.debug(f"API request failed: {url} - {e}")
            return None

    def get_hotel_ids(self, limit=None, offset=0):
        """Return hotel ids from ``hotel_main`` ordered by id.

        Args:
            limit: maximum number of ids; a falsy value (``None`` or 0) means
                "all hotels".
            offset: number of ids to skip; only applied together with *limit*.
        """
        sql = "SELECT id FROM hotel_main ORDER BY id"
        params = None
        if limit:
            # Bound parameters instead of string formatting, so the values can
            # never be interpreted as SQL.
            sql += " LIMIT %s OFFSET %s"
            params = (int(limit), int(offset))

        cur = self.conn.cursor()
        try:
            cur.execute(sql, params)
            return [row[0] for row in cur.fetchall()]
        finally:
            cur.close()

    def get_detailed_info(self, hotel_id: str) -> Dict:
        """Query the four detail endpoints for one hotel.

        Returns a dict with the keys ``hotel_id``, ``main``,
        ``additional_info``, ``sanatorium`` and ``drawer``; an endpoint whose
        request failed is represented by ``None``.
        """
        # Result key -> endpoint path relative to API_BASE_URL, in request order.
        endpoints = (
            ('main', f"resorts/hotels/{hotel_id}/main"),
            ('additional_info', f"resorts/common/{hotel_id}/additional-info"),
            ('sanatorium', f"resorts/hotels/{hotel_id}/sanatoriumDrawer"),
            ('drawer', f"resorts/hotels/{hotel_id}/drawer"),
        )

        result = {'hotel_id': hotel_id}
        for key, path in endpoints:
            result[key] = self.api_request(f"{API_BASE_URL}/{path}")
        return result

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save_main_updates(self, data_list: List[Dict]):
        """Update contact columns of ``hotel_main`` from the "main" payloads.

        Items whose ``main`` payload is missing, empty or not a dict are skipped.
        """
        if not data_list:
            return

        updates = []
        for item in data_list:
            main = self._as_dict(item.get('main'))
            if main is None:
                continue
            updates.append(self._pick(main, self._MAIN_KEYS) + (item['hotel_id'],))

        if not updates:
            return

        sql = """
            UPDATE hotel_main SET
                short_name = %s,
                phone = %s,
                email = %s,
                website_address = %s,
                owner_full_name = %s,
                updated_at = CURRENT_TIMESTAMP
            WHERE id = %s
        """
        cur = self.conn.cursor()
        try:
            execute_batch(cur, sql, updates, page_size=BATCH_SIZE)
            self.conn.commit()
        finally:
            cur.close()

    def save_additional_info(self, data_list: List[Dict]):
        """Upsert ``hotel_additional_info`` rows from the "additional-info" payloads."""
        if not data_list:
            return

        records = []
        for item in data_list:
            info = self._as_dict(item.get('additional_info'))
            if info is None:
                continue
            records.append(
                (item['hotel_id'],) + self._pick(info, self._ADDITIONAL_INFO_KEYS)
            )

        if not records:
            return

        sql = """
            INSERT INTO hotel_additional_info
            (hotel_id, owner_ogrn, owner_inn, owner_kpp, owner_short_name,
             owner_phone, owner_email, resort_full_name, owner_address_name,
             owner_legal_type_id, phone, email)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                owner_ogrn = EXCLUDED.owner_ogrn,
                owner_inn = EXCLUDED.owner_inn,
                owner_kpp = EXCLUDED.owner_kpp,
                owner_short_name = EXCLUDED.owner_short_name,
                owner_phone = EXCLUDED.owner_phone,
                owner_email = EXCLUDED.owner_email,
                resort_full_name = EXCLUDED.resort_full_name,
                owner_address_name = EXCLUDED.owner_address_name,
                owner_legal_type_id = EXCLUDED.owner_legal_type_id,
                phone = EXCLUDED.phone,
                email = EXCLUDED.email
        """
        cur = self.conn.cursor()
        try:
            execute_batch(cur, sql, records, page_size=BATCH_SIZE)
            self.conn.commit()
        finally:
            cur.close()

    def save_sanatorium_info(self, data_list: List[Dict]):
        """Upsert ``hotel_sanatorium`` rows from the "sanatoriumDrawer" payloads.

        Only payloads that carry a non-empty ``sanatoriumInfo`` object are stored.
        """
        if not data_list:
            return

        records = []
        for item in data_list:
            san = self._as_dict(item.get('sanatorium'))
            if san is None:
                continue
            info = self._as_dict(san.get('sanatoriumInfo'))
            if info is None:
                continue

            json_columns = tuple(
                Json(san.get(key)) for key in self._SANATORIUM_JSON_KEYS
            )
            records.append(
                (item['hotel_id'],)
                + self._pick(info, self._SANATORIUM_INFO_KEYS)
                + json_columns
            )

        if not records:
            return

        sql = """
            INSERT INTO hotel_sanatorium
            (hotel_id, oid, full_name, short_name, ogrn, inn, legal_address,
             actual_address, phone, email, web_site, medical_license, farm_license,
             terrenkur, resort_name, has_water_supply, has_heating, has_sewage,
             has_air_conditioning, has_elevator, has_telephone, has_internet,
             has_mobility_lift, has_gym, has_conference_room,
             swimming_pool_info, plage_info, land_document_info, rooms_info)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                oid = EXCLUDED.oid,
                full_name = EXCLUDED.full_name,
                short_name = EXCLUDED.short_name,
                ogrn = EXCLUDED.ogrn,
                inn = EXCLUDED.inn,
                legal_address = EXCLUDED.legal_address,
                actual_address = EXCLUDED.actual_address,
                phone = EXCLUDED.phone,
                email = EXCLUDED.email,
                web_site = EXCLUDED.web_site,
                medical_license = EXCLUDED.medical_license,
                farm_license = EXCLUDED.farm_license,
                terrenkur = EXCLUDED.terrenkur,
                resort_name = EXCLUDED.resort_name,
                has_water_supply = EXCLUDED.has_water_supply,
                has_heating = EXCLUDED.has_heating,
                has_sewage = EXCLUDED.has_sewage,
                has_air_conditioning = EXCLUDED.has_air_conditioning,
                has_elevator = EXCLUDED.has_elevator,
                has_telephone = EXCLUDED.has_telephone,
                has_internet = EXCLUDED.has_internet,
                has_mobility_lift = EXCLUDED.has_mobility_lift,
                has_gym = EXCLUDED.has_gym,
                has_conference_room = EXCLUDED.has_conference_room,
                swimming_pool_info = EXCLUDED.swimming_pool_info,
                plage_info = EXCLUDED.plage_info,
                land_document_info = EXCLUDED.land_document_info,
                rooms_info = EXCLUDED.rooms_info
        """
        cur = self.conn.cursor()
        try:
            execute_batch(cur, sql, records, page_size=BATCH_SIZE)
            self.conn.commit()
        finally:
            cur.close()

    def save_services_and_rooms(self, data_list: List[Dict]):
        """Store services and room categories found in the "drawer" payloads.

        All hotels of *data_list* are written in a single transaction.
        """
        if not data_list:
            return

        services_sql = """
            INSERT INTO hotel_services
            (hotel_id, service_category_id, service_category_name, service_id, service_name)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id, service_id) DO NOTHING
        """
        # NOTE(review): unlike the other inserts this one has no ON CONFLICT
        # clause, so re-running the scraper for the same hotels presumably
        # duplicates their room rows — confirm against the table constraints.
        rooms_sql = """
            INSERT INTO hotel_rooms
            (hotel_id, room_category_id, room_category_name, apartment_count,
             number_seats, equipment_list, family_room_count, disability_room_count)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """

        cur = self.conn.cursor()
        try:
            for item in data_list:
                drawer = self._as_dict(item.get('drawer'))
                if drawer is None:
                    continue

                hotel_id = item['hotel_id']

                services = self._service_rows(hotel_id, drawer)
                if services:
                    execute_batch(cur, services_sql, services, page_size=200)

                rooms = self._room_rows(hotel_id, drawer)
                if rooms:
                    execute_batch(cur, rooms_sql, rooms, page_size=100)

            self.conn.commit()
        finally:
            cur.close()

    def save_raw_json(self, data_list: List[Dict]):
        """Upsert the unprocessed responses of all four endpoints as a backup."""
        if not data_list:
            return

        records = [
            (
                item['hotel_id'],
                Json(item.get('main')),
                Json(item.get('additional_info')),
                Json(item.get('sanatorium')),
                Json(item.get('drawer')),
            )
            for item in data_list
        ]

        sql = """
            INSERT INTO hotel_raw_json
            (hotel_id, main_data, additional_info, sanatorium_data, drawer_data)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                main_data = EXCLUDED.main_data,
                additional_info = EXCLUDED.additional_info,
                sanatorium_data = EXCLUDED.sanatorium_data,
                drawer_data = EXCLUDED.drawer_data
        """
        cur = self.conn.cursor()
        try:
            execute_batch(cur, sql, records, page_size=BATCH_SIZE)
            self.conn.commit()
        finally:
            cur.close()

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def run(self, limit=None, offset=0):
        """Collect and store details for the selected hotels.

        Args:
            limit: maximum number of hotels to process (falsy means all).
            offset: number of hotels to skip; only applied together with *limit*.

        A per-hotel failure is logged and counted, then processing continues.
        On Ctrl+C the hotels collected so far are saved before the connection
        is closed.
        """
        self.start_time = datetime.now()
        self.connect_db()

        batch = []
        total = 0

        try:
            # Inside the try block so the connection is closed even when the
            # id query itself fails.
            hotel_ids = self.get_hotel_ids(limit=limit, offset=offset)
            total = len(hotel_ids)

            logger.info("=" * 70)
            logger.info("Запуск сбора детальной информации")
            logger.info(f"Отелей к обработке: {total}")
            logger.info(f"Начало: {self.start_time}")
            logger.info("=" * 70)

            for hotel_id in hotel_ids:
                try:
                    batch.append(self.get_detailed_info(hotel_id))
                    self.processed_count += 1

                    # Flush a full batch.
                    if len(batch) >= BATCH_SIZE:
                        self.save_batch(batch)
                        batch = []

                    # Periodic progress report.
                    if self.processed_count % CHECKPOINT_INTERVAL == 0:
                        self._log_progress(total)

                except Exception as e:
                    logger.error(f"Error processing hotel {hotel_id}: {e}")
                    self.error_count += 1

            # Flush the remainder.
            if batch:
                self.save_batch(batch)

        except KeyboardInterrupt:
            logger.info("\n⚠ Прервано пользователем")
            # The interrupt may have hit in the middle of a save; discard that
            # half-written transaction before storing the pending batch.
            self._rollback_quietly()
            if batch:
                self.save_batch(batch)

        finally:
            if self.conn:
                self.conn.close()

        elapsed = (datetime.now() - self.start_time).total_seconds()
        speed = self.processed_count / elapsed if elapsed > 0 else 0.0

        logger.info("=" * 70)
        logger.info("Сбор детальной информации завершен")
        logger.info(f"Обработано: {self.processed_count}/{total} отелей")
        logger.info(f"Ошибок: {self.error_count}")
        logger.info(f"Время: {elapsed/60:.1f} минут")
        logger.info(f"Скорость: {speed:.1f} отелей/сек")
        logger.info("=" * 70)

    def save_batch(self, batch):
        """Persist one batch through every ``save_*`` step.

        Each step commits on its own. If a step fails, the error is logged,
        the whole batch is counted as failed, and the open transaction is
        rolled back; without the rollback the connection would stay in an
        aborted-transaction state and reject every later statement.
        """
        logger.debug(f"Сохраняю батч из {len(batch)} отелей...")
        try:
            self.save_main_updates(batch)
            self.save_additional_info(batch)
            self.save_sanatorium_info(batch)
            self.save_services_and_rooms(batch)
            self.save_raw_json(batch)
        except Exception as e:
            logger.error(f"Ошибка сохранения батча: {e}")
            self.error_count += len(batch)
            self._rollback_quietly()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Optional positional arguments: [limit [offset]].
    cli_args = sys.argv[1:]
    limit = int(cli_args[0]) if len(cli_args) >= 1 else None
    offset = int(cli_args[1]) if len(cli_args) >= 2 else 0

    logger.info(f"Параметры: limit={limit or 'all'}, offset={offset}")

    scraper = DetailedScraper()
    scraper.run(limit=limit, offset=offset)
|
||||
|
||||
Reference in New Issue
Block a user