#!/usr/bin/env python3
"""
Universal crawler for parsing hotel websites, with an RKN registry check.

- Crawls a hotel website (main page + depth 1)
- Immediately checks the owner's INN against the Roskomnadzor (RKN) operators registry
- Saves all data to PostgreSQL
"""
|
import asyncio
import logging
import re
import sys
from datetime import datetime
from email.utils import parsedate_to_datetime
from typing import List, Dict, Set
from urllib.parse import urlparse, unquote

import psycopg2
from playwright.async_api import async_playwright, Page
from bs4 import BeautifulSoup
|
# Database configuration
DB_CONFIG = {
    'host': "147.45.189.234",
    'port': 5432,
    'database': "default_db",
    'user': "gen_user",
    'password': unquote("2~~9_%5EkVsU%3F2%5CS")  # stored percent-encoded; decoded here
}
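
# Tables this script touches (inferred from the SQL in this file; the base
# schema itself is assumed to already exist):
#   hotel_main         - one row per hotel; RKN columns are added on startup
#   hotel_website_raw  - one row per crawled page (raw HTML plus metadata)
#   hotel_website_meta - per-hotel crawl statistics, upserted page by page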
|
# Crawling configuration
MAX_PAGES_PER_SITE = 20
PAGE_TIMEOUT = 45000  # per-page navigation timeout, ms
RKN_CHECK_DELAY = 2   # delay before each RKN registry request, seconds

# Typical URL paths to probe (important pages hotels commonly have)
TYPICAL_URLS = [
    '/pravila', '/rules', '/terms', '/conditions',
    '/services', '/uslugi', '/price', '/prices', '/ceny',
    '/booking', '/book', '/bronirovanie', '/reserve',
    '/faq', '/contacts', '/kontakty', '/about', '/o-nas',
    '/policy', '/politika', '/privacy', '/oferta', '/offer',
    '/dogovor', '/contract', '/agreement', '/soglashenie',
    '/reviews', '/otzyvy', '/gallery', '/galereya',
    '/rooms', '/nomera', '/accommodation', '/razmeshenie'
]
|

class TextCleaner:
    """Minimal HTML-to-text cleanup."""

    @classmethod
    def clean_html(cls, html: str) -> str:
        """Strip scripts/styles and collapse the page into clean text lines."""
        if not html:
            return ""

        soup = BeautifulSoup(html, 'html.parser')

        # Drop scripts and styles
        for tag in soup.find_all(['script', 'style']):
            tag.decompose()

        # Extract the plain text
        text = soup.get_text()

        # Collapse runs of spaces/tabs but keep newlines, so the
        # line-oriented cleanup below still has line breaks to work with
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'\n\s*\n', '\n', text)
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        return '\n'.join(lines)
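
# Example (with the whitespace handling above):
#   TextCleaner.clean_html('<p>Rooms</p>\n<script>x()</script>\n<p>Prices</p>')
#   returns 'Rooms\nPrices'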
|

class UniversalCrawler:
    """Universal crawler with an RKN registry check."""

    def __init__(self, region_name: str):
        self.region_name = region_name
        self.visited_urls: Set[str] = set()
        self.db_conn = None
        self.rkn_page = None

        # Logging setup: one log file per run, named after the region
        log_filename = f'crawler_{region_name.replace(" ", "_")}_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
        self.logger = logging.getLogger(f'crawler_{region_name}')
        self.logger.setLevel(logging.INFO)

        # Handlers: file and console, same format
        fh = logging.FileHandler(log_filename)
        ch = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)
        self.logger.addHandler(fh)
        self.logger.addHandler(ch)
|
    async def connect_db(self):
        """Connect to PostgreSQL and make sure the RKN columns exist."""
        try:
            self.db_conn = psycopg2.connect(**DB_CONFIG)
            self.logger.info("✓ Connected to PostgreSQL")

            # Add the RKN columns if they are missing
            cur = self.db_conn.cursor()
            cur.execute('ALTER TABLE hotel_main ADD COLUMN IF NOT EXISTS rkn_registry_status VARCHAR(50);')
            cur.execute('ALTER TABLE hotel_main ADD COLUMN IF NOT EXISTS rkn_registry_number VARCHAR(50);')
            cur.execute('ALTER TABLE hotel_main ADD COLUMN IF NOT EXISTS rkn_registry_date VARCHAR(20);')
            cur.execute('ALTER TABLE hotel_main ADD COLUMN IF NOT EXISTS rkn_checked_at TIMESTAMP;')
            self.db_conn.commit()
            cur.close()

        except Exception as e:
            self.logger.error(f"✗ Database connection failed: {e}")
            raise

    def close_db(self):
        """Close the database connection."""
        if self.db_conn:
            self.db_conn.close()
|
    async def check_rkn_registry(self, inn: str, browser) -> Dict:
        """Check an INN against the RKN personal-data operators registry."""
        if not inn or inn == '-':
            return {'found': False, 'status': 'no_inn'}

        try:
            # Reuse a dedicated page for RKN lookups
            if not self.rkn_page:
                self.rkn_page = await browser.new_page()
                await self.rkn_page.set_viewport_size({"width": 1920, "height": 1080})
                await self.rkn_page.set_extra_http_headers({
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })

            url = f'https://pd.rkn.gov.ru/operators-registry/operators-list/?act=search&inn={inn}'

            self.logger.info(f"  🔍 RKN: checking INN {inn}")

            # Throttle before hitting the registry
            await asyncio.sleep(RKN_CHECK_DELAY)

            # Load the search results page
            response = await self.rkn_page.goto(url, timeout=30000, wait_until='networkidle')

            if not response or response.status != 200:
                return {'found': False, 'status': 'error'}

            await asyncio.sleep(1)

            # Grab the rendered text
            text = await self.rkn_page.evaluate('() => document.body.innerText')

            # The registry reports misses with the Russian phrase 'Не найдено'
            if 'не найдено' in text.lower():
                self.logger.info("  ❌ RKN: not found")
                return {'found': False, 'status': 'not_found'}

            # Extract the registry number; entries come in two formats
            # (e.g. 41-14-000746 or 10-0107355), so try both patterns
            reg_number_match = re.search(r'(\d{2}-\d{2,4}-\d{6,7}|\d{2}-\d{7})', text)
            reg_number = reg_number_match.group(1) if reg_number_match else None

            # The registration date follows the word 'Приказ' (order) on the page
            date_match = re.search(r'Приказ.*?(\d{2}\.\d{2}\.\d{4})', text)
            reg_date = date_match.group(1) if date_match else None

            if reg_number:
                self.logger.info(f"  ✅ RKN: found {reg_number} ({reg_date})")
                return {
                    'found': True,
                    'status': 'found',
                    'reg_number': reg_number,
                    'reg_date': reg_date
                }
            else:
                self.logger.info("  ⚠️ RKN: result unclear")
                return {'found': None, 'status': 'unclear'}

        except Exception as e:
            self.logger.error(f"  ✗ RKN: error {e}")
            return {'found': False, 'status': 'error'}
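
    # check_rkn_registry returns {'found': ..., 'status': ...} where 'status'
    # is one of: no_inn, error, not_found, found, unclear. An illustrative hit
    # (number format as above; this date is made up):
    #   {'found': True, 'status': 'found', 'reg_number': '41-14-000746', 'reg_date': '14.02.2014'}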
|
    def save_rkn_result(self, hotel_id: str, result: Dict):
        """Persist the result of an RKN registry check."""
        try:
            cur = self.db_conn.cursor()

            cur.execute('''
                UPDATE hotel_main
                SET
                    rkn_registry_status = %s,
                    rkn_registry_number = %s,
                    rkn_registry_date = %s,
                    rkn_checked_at = %s
                WHERE id = %s
            ''', (
                result['status'],
                result.get('reg_number'),
                result.get('reg_date'),
                datetime.now(),
                hotel_id
            ))

            self.db_conn.commit()
            cur.close()

        except Exception as e:
            self.logger.error(f"  ✗ Failed to save RKN result: {e}")
            self.db_conn.rollback()
|
    async def crawl_page(self, page: Page, url: str, hotel_id: str, depth: int = 0) -> Dict:
        """Crawl a single page and store it."""
        try:
            self.logger.info(f"  Crawling (depth={depth}): {url[:60]}...")

            response = await page.goto(url, timeout=PAGE_TIMEOUT, wait_until='networkidle')

            if not response or response.status >= 400:
                self.logger.warning(f"  ✗ Load failed: {response.status if response else 'No response'}")
                return {'success': False, 'status_code': response.status if response else 0}

            # Grab the rendered HTML
            html = await page.content()

            # Reduce it to plain text
            clean_text = TextCleaner.clean_html(html)

            # Page title
            title = await page.title()

            # Last-Modified response header, if the server sent one
            last_modified = response.headers.get('last-modified', None)

            # Persist the page
            await self.save_to_db(hotel_id, url, title, html, clean_text, response.status, depth, last_modified)

            self.logger.info(f"  ✓ Saved {len(clean_text)} characters")

            # Collect same-domain links for depth-1 crawling
            internal_links = await self.find_internal_links(page, url)

            return {
                'success': True,
                'status_code': response.status,
                'internal_links': internal_links
            }

        except Exception as e:
            self.logger.error(f"  ✗ Crawl failed: {e}")
            return {'success': False, 'error': str(e)}
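
    # crawl_page returns {'success': True, 'status_code': ..., 'internal_links': [...]}
    # on success; on failure, {'success': False} plus 'status_code' or 'error'.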
|
    async def check_typical_urls(self, page: Page, base_url: str) -> List[str]:
        """Probe the typical paths and return the ones that exist."""
        found_urls = []
        parsed_base = urlparse(base_url)
        base_domain = f"{parsed_base.scheme}://{parsed_base.netloc}"

        self.logger.info("  🔍 Probing typical URLs...")

        for typical_path in TYPICAL_URLS:
            typical_url = base_domain + typical_path

            # Skip URLs we have already visited
            if typical_url in self.visited_urls:
                continue

            try:
                # Quick probe with a short timeout (5 s)
                response = await page.goto(typical_url, timeout=5000, wait_until='domcontentloaded')

                if response and response.status == 200:
                    found_urls.append(typical_url)
                    self.logger.info(f"  ✓ Found: {typical_path}")

            except Exception:
                # Missing or unreachable page; expected for most paths
                pass

        self.logger.info(f"  Found {len(found_urls)} typical pages")
        return found_urls
|
    async def find_internal_links(self, page: Page, base_url: str) -> List[str]:
        """Collect same-domain links from the current page."""
        try:
            links = await page.evaluate('() => Array.from(document.querySelectorAll("a[href]")).map(link => link.href)')

            base_domain = urlparse(base_url).netloc
            internal_links = []

            for link in links:
                try:
                    parsed = urlparse(link)
                    if parsed.netloc == base_domain and link not in self.visited_urls:
                        internal_links.append(link)
                except Exception:
                    continue

            internal_links = internal_links[:MAX_PAGES_PER_SITE]
            self.logger.info(f"  Found {len(internal_links)} internal links")
            return internal_links

        except Exception as e:
            self.logger.error(f"  ✗ Link collection failed: {e}")
            return []
|
    async def save_to_db(self, hotel_id: str, url: str, title: str, html: str,
                         clean_text: str, status_code: int, depth: int, last_modified: str = None):
        """Save a crawled page to the database."""
        try:
            cur = self.db_conn.cursor()

            # Skip pages we already have
            cur.execute('''
                SELECT id FROM hotel_website_raw
                WHERE hotel_id = %s AND url = %s
            ''', (hotel_id, url))

            if cur.fetchone():
                cur.close()
                return

            # Parse the Last-Modified header (RFC 2822 date) into a datetime
            last_modified_dt = None
            if last_modified:
                try:
                    last_modified_dt = parsedate_to_datetime(last_modified)
                except Exception as e:
                    self.logger.warning(f"  ⚠️ Could not parse Last-Modified: {e}")

            # Save the raw page (response_time_ms is not measured, stored as 0)
            cur.execute('''
                INSERT INTO hotel_website_raw
                (hotel_id, url, page_title, html, status_code, response_time_ms, depth, crawled_at, last_modified)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            ''', (hotel_id, url, title, html, status_code, 0, depth, datetime.now(), last_modified_dt))

            # Upsert crawl metadata; size is counted over the cleaned text.
            # The first ten parameters feed the INSERT, the last two the DO UPDATE.
            cur.execute('''
                INSERT INTO hotel_website_meta
                (hotel_id, domain, main_url, pages_crawled, pages_failed, total_size_bytes,
                 internal_links_found, crawl_status, crawl_started_at, crawl_finished_at)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (hotel_id) DO UPDATE SET
                    pages_crawled = hotel_website_meta.pages_crawled + 1,
                    total_size_bytes = hotel_website_meta.total_size_bytes + %s,
                    crawl_finished_at = %s,
                    updated_at = CURRENT_TIMESTAMP
            ''', (
                hotel_id, urlparse(url).netloc, url, 1, 0, len(clean_text), 0,
                'completed', datetime.now(), datetime.now(),
                len(clean_text), datetime.now()
            ))

            self.db_conn.commit()
            cur.close()

        except Exception as e:
            self.logger.error(f"  ✗ Database save failed: {e}")
            self.db_conn.rollback()
|
    async def crawl_hotel(self, hotel_data: Dict, browser) -> Dict:
        """Crawl one hotel's website and run the RKN check."""
        hotel_id = hotel_data['id']
        hotel_name = hotel_data['full_name']
        website_url = hotel_data.get('website_address')
        owner_inn = hotel_data.get('owner_inn')

        self.logger.info(f"\n{'='*70}")
        self.logger.info(f"🏨 {hotel_name}")
        self.logger.info(f"🌐 {website_url or 'No website'}")
        if owner_inn:
            self.logger.info(f"🔢 INN: {owner_inn}")
        self.logger.info(f"{'='*70}")

        # 'Нет сайта' ("No website") is a literal placeholder in the source data
        if not website_url or website_url in ['-', 'Нет сайта', '']:
            self.logger.info("  ⏭️ Skipped - no website")
            return {'success': False, 'reason': 'no_website'}

        # Normalize the URL
        if not website_url.startswith(('http://', 'https://')):
            website_url = 'https://' + website_url

        try:
            page = await browser.new_page()
            await page.set_viewport_size({"width": 1920, "height": 1080})
            await page.set_extra_http_headers({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })

            # 1. Crawl the main page
            result = await self.crawl_page(page, website_url, hotel_id, depth=0)

            if not result['success']:
                await page.close()
                return result

            # 2. Probe typical URLs (rules, prices, contacts, etc.)
            typical_urls = await self.check_typical_urls(page, website_url)

            # 3. RKN registry check (when we have an INN and the site is reachable)
            if owner_inn:
                rkn_result = await self.check_rkn_registry(owner_inn, browser)
                self.save_rkn_result(hotel_id, rkn_result)

            # 4. Crawl the typical pages
            pages_crawled = 1
            for typical_url in typical_urls:
                if typical_url not in self.visited_urls:
                    self.visited_urls.add(typical_url)
                    await self.crawl_page(page, typical_url, hotel_id, depth=1)
                    pages_crawled += 1

            # 5. Crawl remaining internal pages within the per-site budget
            # (clamped at zero so a negative count cannot slice from the end)
            internal_links = result.get('internal_links', [])
            remaining_slots = max(0, MAX_PAGES_PER_SITE - pages_crawled)

            for link in internal_links[:remaining_slots]:
                if link not in self.visited_urls:
                    self.visited_urls.add(link)
                    await self.crawl_page(page, link, hotel_id, depth=1)
                    pages_crawled += 1

            await page.close()

            self.logger.info(f"✓ Crawled {pages_crawled} pages")
            return {'success': True, 'pages_crawled': pages_crawled}

        except Exception as e:
            self.logger.error(f"✗ Crawl failed: {e}")
            return {'success': False, 'error': str(e)}
|

async def main():
    """Entry point."""
    if len(sys.argv) < 2:
        print("Usage: python universal_crawler.py <region>")
        print("Example: python universal_crawler.py 'Камчатский край'")
        sys.exit(1)

    region_name = sys.argv[1]

    crawler = UniversalCrawler(region_name)

    try:
        # Connect to the database
        await crawler.connect_db()

        # Fetch the region's hotels that have websites
        cur = crawler.db_conn.cursor()
        cur.execute('''
            SELECT id, full_name, website_address, owner_inn
            FROM hotel_main
            WHERE region_name ILIKE %s
              AND website_address IS NOT NULL
              AND website_address != '-'
              AND website_address != ''
            ORDER BY full_name
        ''', (f'%{region_name}%',))

        hotels = [{'id': row[0], 'full_name': row[1], 'website_address': row[2], 'owner_inn': row[3]}
                  for row in cur.fetchall()]
        cur.close()

        crawler.logger.info(f"\n{'='*70}")
        crawler.logger.info(f"🚀 STARTING CRAWL: {region_name}")
        crawler.logger.info(f"📊 Hotels with websites: {len(hotels)}")
        crawler.logger.info(f"⏱️ Estimated time: {len(hotels) * (5 + RKN_CHECK_DELAY) / 60:.1f} minutes")
        crawler.logger.info(f"{'='*70}")

        # Launch the browser once for all hotels
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)

            # Crawl every hotel
            successful = 0
            failed = 0

            for i, hotel in enumerate(hotels, 1):
                crawler.logger.info(f"\n[{i}/{len(hotels)}] {'='*35}")

                result = await crawler.crawl_hotel(hotel, browser)

                if result['success']:
                    successful += 1
                else:
                    failed += 1

            await browser.close()

        # Tally RKN results for the region
        cur = crawler.db_conn.cursor()
        cur.execute('''
            SELECT
                COUNT(CASE WHEN rkn_registry_status = 'found' THEN 1 END) as found,
                COUNT(CASE WHEN rkn_registry_status = 'not_found' THEN 1 END) as not_found,
                COUNT(CASE WHEN rkn_registry_status = 'unclear' THEN 1 END) as unclear
            FROM hotel_main
            WHERE region_name ILIKE %s
        ''', (f'%{region_name}%',))

        rkn_stats = cur.fetchone()
        cur.close()

        # Summary
        crawler.logger.info(f"\n{'='*70}")
        crawler.logger.info("📊 CRAWL RESULTS:")
        crawler.logger.info(f"  ✅ Successful: {successful}/{len(hotels)}")
        crawler.logger.info(f"  ✗ Failed: {failed}/{len(hotels)}")
        crawler.logger.info("\n📋 RKN CHECK RESULTS:")
        crawler.logger.info(f"  ✅ Found in registry: {rkn_stats[0]}")
        crawler.logger.info(f"  ❌ Not found: {rkn_stats[1]}")
        crawler.logger.info(f"  ❓ Unclear: {rkn_stats[2]}")
        crawler.logger.info(f"{'='*70}")

    except Exception as e:
        crawler.logger.error(f"❌ Fatal error: {e}")
    finally:
        crawler.close_db()
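
# Example invocation (assumes Playwright's Chromium is installed,
# e.g. via `playwright install chromium`):
#   python universal_crawler.py 'Камчатский край'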
|

if __name__ == "__main__":
    asyncio.run(main())