#!/usr/bin/env python3
"""
Crawler for Kamchatka Krai hotel websites, with results stored in PostgreSQL.

- Stores the raw HTML (for future reprocessing)
- Stores the cleaned text
- Extracts structured data
"""

import asyncio
import logging
import re

import psycopg2
from datetime import datetime
from typing import Dict, List, Set
from urllib.parse import unquote, urlparse
from playwright.async_api import async_playwright, Browser, Page
from bs4 import BeautifulSoup, Comment

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'kamchatka_crawler_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Database configuration
DB_CONFIG = {
    'host': "147.45.189.234",
    'port': 5432,
    'database': "default_db",
    'user': "gen_user",
    'password': unquote("2~~9_%5EkVsU%3F2%5CS")  # stored URL-encoded, decoded here
}

# Crawl configuration
MAX_PAGES_PER_SITE = 20
PAGE_TIMEOUT = 45000          # per-page load timeout, ms
NAVIGATION_TIMEOUT = 40000    # navigation timeout, ms (currently unused)
GROUP_ID = "hotel_kamchatka"  # For Graphiti (not used in this script)
RKN_CHECK_DELAY = 2           # delay before each RKN registry check (seconds)


class TextCleaner:
    """HTML cleaning that keeps the data worth extracting."""

    # Tags to remove outright (markup noise only!).
    # NOTE: these three sets are not yet wired into clean_html() below;
    # they are kept for a stricter cleaner (see the sketch that follows it).
    REMOVE_TAGS = {
        'script', 'style', 'meta', 'link', 'noscript', 'iframe', 'embed', 'object',
        'form', 'input', 'button', 'select', 'textarea', 'label',
        'canvas', 'svg', 'img', 'video', 'audio', 'source', 'track',
        'map', 'area', 'base', 'head', 'title'
    }

    # Tags whose content is kept while the tag itself is dropped
    PRESERVE_CONTENT_TAGS = {
        'div', 'span', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'a', 'strong', 'b', 'em', 'i', 'u', 's', 'strike', 'del',
        'ul', 'ol', 'li', 'dl', 'dt', 'dd', 'table', 'tr', 'td', 'th',
        'blockquote', 'pre', 'code', 'br', 'hr'
    }

    # Keywords (Russian and English) that mark contact blocks worth preserving
    CONTACT_KEYWORDS = {
        'телефон', 'phone', 'тел', 'контакт', 'contact', 'адрес', 'address',
        'email', 'почта', 'mail', 'факс', 'fax', 'инн', 'огрн', 'inn', 'ogrn',
        'режим работы', 'часы работы', 'working hours', 'время работы'
    }

    @classmethod
    def clean_html(cls, html: str) -> str:
        """Basic HTML cleanup: strip scripts/styles and return plain text."""
        if not html:
            return ""

        soup = BeautifulSoup(html, 'html.parser')

        # Remove scripts and styles
        for tag in soup.find_all(['script', 'style']):
            tag.decompose()

        # Extract the plain text
        text = soup.get_text()

        # Normalize the text
        text = cls._clean_text(text)

        return text

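    # A minimal sketch of a stricter cleaner that actually applies REMOVE_TAGS
    # and drops HTML comments. Illustrative only — clean_html() above is what
    # the crawler calls; swap this in if script/style stripping proves too weak.
    @classmethod
    def clean_html_strict(cls, html: str) -> str:
        """Stricter cleanup: remove every tag in REMOVE_TAGS plus HTML comments."""
        if not html:
            return ""
        soup = BeautifulSoup(html, 'html.parser')
        # extract() rather than decompose(): safe even when one removed tag
        # (title) is nested inside another already-removed one (head)
        for tag in soup.find_all(list(cls.REMOVE_TAGS)):
            tag.extract()
        for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
            comment.extract()
        return cls._clean_text(soup.get_text(separator='\n'))
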
    @classmethod
    def _clean_text(cls, text: str) -> str:
        """Extra text cleanup."""
        # Collapse runs of spaces/tabs, but keep newlines so the
        # line filtering below still has lines to work on
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'\n\s*\n', '\n', text)

        # Drop empty lines
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        return '\n'.join(lines)

    @classmethod
    def extract_structured_data(cls, text: str) -> Dict[str, List[str]]:
        """Extract structured data (phones, emails, INN, OGRN) from text."""
        data = {
            'phones': [],
            'emails': [],
            'inns': [],
            'ogrn': []
        }

        # Phone numbers. Each separator between digit groups may be at most one
        # character, so "8 914 123 45 67" matches but "+7 (415) 222-33-44" does not.
        phone_patterns = [
            r'\+?[78][\s\-\(\)]?\d{3}[\s\-\(\)]?\d{3}[\s\-\(\)]?\d{2}[\s\-\(\)]?\d{2}',
            r'\+?7[\s\-\(\)]?\d{3}[\s\-\(\)]?\d{3}[\s\-\(\)]?\d{2}[\s\-\(\)]?\d{2}',
            r'\d{3}[\s\-\(\)]?\d{3}[\s\-\(\)]?\d{2}[\s\-\(\)]?\d{2}'
        ]

        for pattern in phone_patterns:
            matches = re.findall(pattern, text)
            data['phones'].extend(matches)

        # Email addresses
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        data['emails'] = re.findall(email_pattern, text)

        # INN (tax ID): 10 digits for legal entities, 12 for individuals.
        # Any 10/12-digit number qualifies here; see validate_inn() below
        # for a checksum-based post-filter.
        inn_pattern = r'\b\d{10,12}\b'
        inns = re.findall(inn_pattern, text)
        data['inns'] = [inn for inn in inns if len(inn) in (10, 12)]

        # OGRN: 13 digits for legal entities, 15 (OGRNIP) for sole proprietors
        ogrn_pattern = r'\b\d{13,15}\b'
        ogrns = re.findall(ogrn_pattern, text)
        data['ogrn'] = [ogrn for ogrn in ogrns if len(ogrn) in (13, 15)]

        # Deduplicate
        for key in data:
            data[key] = list(set(data[key]))

        return data


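# extract_structured_data() accepts any 10/12-digit number as an INN candidate,
# so prices and dates can slip through. A minimal checksum post-filter using the
# standard INN control-digit algorithm — a sketch, not called by the crawler:
def validate_inn(inn: str) -> bool:
    """Return True if `inn` passes the official INN control-digit check."""
    def control_digit(digits: str, weights: List[int]) -> int:
        return sum(int(d) * w for d, w in zip(digits, weights)) % 11 % 10

    if len(inn) == 10:
        return control_digit(inn[:9], [2, 4, 10, 3, 5, 9, 4, 6, 8]) == int(inn[9])
    if len(inn) == 12:
        weights_11 = [7, 2, 4, 10, 3, 5, 9, 4, 6, 8]
        weights_12 = [3, 7, 2, 4, 10, 3, 5, 9, 4, 6, 8]
        return (control_digit(inn[:10], weights_11) == int(inn[10])
                and control_digit(inn[:11], weights_12) == int(inn[11]))
    return False

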
class WebsiteCrawler:
    """Crawler for hotel websites."""

    def __init__(self):
        self.visited_urls: Set[str] = set()
        self.db_conn = None
        self.rkn_page = None  # dedicated page for RKN registry checks

    async def connect_db(self):
        """Connect to the database."""
        try:
            self.db_conn = psycopg2.connect(**DB_CONFIG)
            logger.info(" ✓ Connected to PostgreSQL")
        except Exception as e:
            logger.error(f" ✗ Database connection error: {e}")
            raise

    def close_db(self):
        """Close the database connection."""
        if self.db_conn:
            self.db_conn.close()

    async def check_rkn_registry(self, inn: str, browser: Browser) -> Dict:
        """Look up an INN in the RKN personal-data operators registry."""
        if not inn or inn == '-':
            return {
                'found': False,
                'status': 'no_inn',
                'message': 'No INN provided'
            }

        try:
            # Open a dedicated page for RKN checks
            if not self.rkn_page:
                self.rkn_page = await browser.new_page()
                await self.rkn_page.set_viewport_size({"width": 1920, "height": 1080})
                await self.rkn_page.set_extra_http_headers({
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })

            url = f'https://pd.rkn.gov.ru/operators-registry/operators-list/?act=search&inn={inn}'

            logger.info(f" 🔍 RKN: checking INN {inn}")

            # Pause before hitting the registry
            await asyncio.sleep(RKN_CHECK_DELAY)

            # Load the page
            response = await self.rkn_page.goto(url, timeout=30000, wait_until='networkidle')

            if response.status != 200:
                return {'found': False, 'status': 'error', 'message': f'HTTP {response.status}'}

            await asyncio.sleep(1)

            # Grab the rendered text
            text = await self.rkn_page.evaluate('() => document.body.innerText')

            # The registry renders "Не найдено" ("not found") when there is no match
            if 'не найдено' in text.lower():
                logger.info(" ❌ RKN: not found")
                return {'found': False, 'status': 'not_found', 'message': 'Not in the registry'}

            # Extract the registration number and order date
            # ("Приказ" is the registry's label for the registration order)
            reg_number_match = re.search(r'(\d{2}-\d{2,4}-\d{6,7})', text)
            reg_number = reg_number_match.group(1) if reg_number_match else None

            date_match = re.search(r'Приказ.*?(\d{2}\.\d{2}\.\d{4})', text)
            reg_date = date_match.group(1) if date_match else None

            if reg_number:
                logger.info(f" ✅ RKN: found {reg_number} ({reg_date})")
                return {
                    'found': True,
                    'status': 'found',
                    'reg_number': reg_number,
                    'reg_date': reg_date
                }
            else:
                logger.info(" ⚠️ RKN: inconclusive result")
                return {'found': None, 'status': 'unclear', 'message': 'Inconclusive result'}

        except Exception as e:
            logger.error(f" ✗ RKN: error {e}")
            return {'found': False, 'status': 'error', 'message': str(e)}

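    # Shape of a successful lookup as returned above (illustrative values that
    # merely match the regexes; real registry entries will differ):
    #     {'found': True, 'status': 'found',
    #      'reg_number': '41-21-001234', 'reg_date': '15.03.2021'}
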
    def save_rkn_result(self, hotel_id: str, result: Dict):
        """Persist the RKN check result to the database."""
        try:
            cur = self.db_conn.cursor()

            cur.execute('''
                UPDATE hotel_main
                SET
                    rkn_registry_status = %s,
                    rkn_registry_number = %s,
                    rkn_registry_date = %s,
                    rkn_checked_at = %s
                WHERE id = %s
            ''', (
                result['status'],
                result.get('reg_number'),
                result.get('reg_date'),
                datetime.now(),
                hotel_id
            ))

            self.db_conn.commit()
            cur.close()

        except Exception as e:
            logger.error(f" ✗ Error saving RKN result: {e}")
            self.db_conn.rollback()

    async def crawl_page(self, page: Page, url: str, hotel_id: str, depth: int = 0) -> Dict:
        """Crawl a single page."""
        try:
            logger.info(f" Parsing (depth={depth}): {url} ...")

            # Navigate to the page
            response = await page.goto(url, timeout=PAGE_TIMEOUT, wait_until='networkidle')

            if not response or response.status >= 400:
                logger.warning(f" ✗ Load error: {response.status if response else 'No response'}")
                return {'success': False, 'status_code': response.status if response else 0}

            # Grab the HTML
            html = await page.content()

            # Clean the HTML
            clean_text = TextCleaner.clean_html(html)

            # Extract structured data
            structured_data = TextCleaner.extract_structured_data(clean_text)

            # Page title
            title = await page.title()

            # Save to the database
            await self.save_to_db(
                hotel_id=hotel_id,
                url=url,
                title=title,
                html=html,
                clean_text=clean_text,
                structured_data=structured_data,
                status_code=response.status,
                depth=depth
            )

            logger.info(f" ✓ Saved {len(clean_text)} characters to the DB")

            # Collect internal links
            internal_links = await self.find_internal_links(page, url)

            return {
                'success': True,
                'status_code': response.status,
                'internal_links': internal_links,
                'text_length': len(clean_text)
            }

        except Exception as e:
            logger.error(f" ✗ Parse error for {url}: {e}")
            return {'success': False, 'error': str(e)}

    async def find_internal_links(self, page: Page, base_url: str) -> List[str]:
        """Collect same-domain links from the current page."""
        try:
            # Grab every link on the page
            links = await page.evaluate('''
                () => {
                    const links = Array.from(document.querySelectorAll('a[href]'));
                    return links.map(link => link.href);
                }
            ''')

            # Keep only same-domain links that have not been visited yet
            base_domain = urlparse(base_url).netloc
            internal_links = []

            for link in links:
                try:
                    parsed = urlparse(link)
                    if parsed.netloc == base_domain and link not in self.visited_urls:
                        internal_links.append(link)
                except ValueError:
                    continue

            # Cap the number of links
            internal_links = internal_links[:MAX_PAGES_PER_SITE]

            logger.info(f" Found {len(internal_links)} internal links")
            return internal_links

        except Exception as e:
            logger.error(f" ✗ Link discovery error: {e}")
            return []

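    # find_internal_links() compares raw hrefs, so "https://site.ru/rooms" and
    # "https://site.ru/rooms#photos" count as two different pages. A sketch of
    # a normalizer that could be applied before the visited_urls check
    # (illustrative; not wired into the crawl loop):
    @staticmethod
    def normalize_link(link: str) -> str:
        """Canonicalize a URL: drop the fragment and any trailing slash."""
        parsed = urlparse(link)
        path = parsed.path.rstrip('/') or '/'
        query = f'?{parsed.query}' if parsed.query else ''
        return f'{parsed.scheme}://{parsed.netloc}{path}{query}'
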
    async def save_to_db(self, hotel_id: str, url: str, title: str, html: str,
                         clean_text: str, structured_data: Dict, status_code: int, depth: int):
        """Save crawl results to the database."""
        try:
            cur = self.db_conn.cursor()

            # Save the raw page (response_time_ms is not measured yet, hence 0)
            cur.execute('''
                INSERT INTO hotel_website_raw
                (hotel_id, url, page_title, html, status_code, response_time_ms, depth, crawled_at)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            ''', (
                hotel_id, url, title, html, status_code, 0, depth, datetime.now()
            ))

            # Upsert crawl metadata (len(clean_text) is a character count,
            # used as the size proxy for total_size_bytes)
            cur.execute('''
                INSERT INTO hotel_website_meta
                (hotel_id, domain, main_url, pages_crawled, pages_failed, total_size_bytes,
                 internal_links_found, crawl_status, crawl_started_at, crawl_finished_at)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (hotel_id) DO UPDATE SET
                    pages_crawled = hotel_website_meta.pages_crawled + 1,
                    total_size_bytes = hotel_website_meta.total_size_bytes + %s,
                    crawl_finished_at = %s,
                    updated_at = CURRENT_TIMESTAMP
            ''', (
                hotel_id,
                urlparse(url).netloc,   # domain
                url,                    # main_url
                1,                      # pages_crawled
                0,                      # pages_failed
                len(clean_text),        # total_size_bytes
                0,                      # internal_links_found (updated later)
                'completed',            # crawl_status
                datetime.now(),         # crawl_started_at
                datetime.now(),         # crawl_finished_at
                len(clean_text),        # for the ON CONFLICT branch
                datetime.now()          # for the ON CONFLICT branch
            ))

            self.db_conn.commit()
            cur.close()

        except Exception as e:
            logger.error(f" ✗ DB save error: {e}")
            self.db_conn.rollback()

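    # NOTE: save_to_db() receives structured_data but never writes it anywhere.
    # If hotel_website_raw gained a JSONB column for it (hypothetical column
    # name structured_data), psycopg2's Json adapter would serialize the dict
    # directly — a sketch:
    #
    #     from psycopg2.extras import Json
    #     cur.execute(
    #         'UPDATE hotel_website_raw SET structured_data = %s '
    #         'WHERE hotel_id = %s AND url = %s',
    #         (Json(structured_data), hotel_id, url),
    #     )
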
    async def crawl_hotel(self, hotel_data: Dict) -> Dict:
        """Crawl a single hotel's website."""
        hotel_id = hotel_data['id']
        hotel_name = hotel_data['full_name']
        website_url = hotel_data.get('website_address')
        owner_inn = hotel_data.get('owner_inn')

        logger.info(f"\n{'='*70}")
        logger.info(f"🏨 «{hotel_name}»")
        logger.info(f"🌐 {website_url or 'no website'}")
        if owner_inn:
            logger.info(f"🔢 INN: {owner_inn}")
        logger.info(f"{'='*70}")

        # '-' and 'Нет сайта' ("no website") are placeholder values in hotel_main
        if not website_url or website_url in ['-', 'Нет сайта', '']:
            logger.info(" ⏭️ Skipping - no website")
            return {'success': False, 'reason': 'no_website'}

        # Normalize the URL
        if not website_url.startswith(('http://', 'https://')):
            website_url = 'https://' + website_url

        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                page = await browser.new_page()

                # Page settings
                await page.set_viewport_size({"width": 1920, "height": 1080})
                await page.set_extra_http_headers({
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })

                # Crawl the main page (mark it visited so link discovery
                # does not queue it again)
                self.visited_urls.add(website_url)
                result = await self.crawl_page(page, website_url, hotel_id, depth=0)

                # Check the RKN registry (if an INN is known)
                if owner_inn and result['success']:
                    rkn_result = await self.check_rkn_registry(owner_inn, browser)
                    self.save_rkn_result(hotel_id, rkn_result)

                if not result['success']:
                    await browser.close()
                    self.rkn_page = None  # the RKN page died with this browser
                    return result

                # Crawl internal pages
                internal_links = result.get('internal_links', [])
                pages_crawled = 1

                for link in internal_links[:MAX_PAGES_PER_SITE]:
                    if link not in self.visited_urls:
                        self.visited_urls.add(link)
                        await self.crawl_page(page, link, hotel_id, depth=1)
                        pages_crawled += 1

                await browser.close()
                # The RKN page belongs to this browser; drop the stale handle
                # so the next hotel's check opens a fresh one
                self.rkn_page = None

                logger.info(f"✓ Crawled {pages_crawled} pages")
                return {'success': True, 'pages_crawled': pages_crawled}

        except Exception as e:
            logger.error(f"✗ Crawl error: {e}")
            return {'success': False, 'error': str(e)}


async def main():
    """Entry point."""
    logger.info("")
    logger.info("="*70)
    logger.info("🚀 STARTING THE KAMCHATKA HOTELS CRAWL (RESULTS GO TO POSTGRESQL)")
    logger.info("="*70)

    crawler = WebsiteCrawler()

    try:
        # Connect to the database
        await crawler.connect_db()

        # Fetch the Kamchatka hotels that have a website
        cur = crawler.db_conn.cursor()
        cur.execute('''
            SELECT id, full_name, website_address, owner_inn
            FROM hotel_main
            WHERE region_name ILIKE '%камчат%'
              AND website_address IS NOT NULL
              AND website_address != '-'
              AND website_address != ''
            ORDER BY full_name
        ''')

        hotels = [{'id': row[0], 'full_name': row[1], 'website_address': row[2], 'owner_inn': row[3]}
                  for row in cur.fetchall()]
        cur.close()

        # Add the RKN columns (if they do not exist yet)
        cur = crawler.db_conn.cursor()
        cur.execute('ALTER TABLE hotel_main ADD COLUMN IF NOT EXISTS rkn_registry_status VARCHAR(50);')
        cur.execute('ALTER TABLE hotel_main ADD COLUMN IF NOT EXISTS rkn_registry_number VARCHAR(50);')
        cur.execute('ALTER TABLE hotel_main ADD COLUMN IF NOT EXISTS rkn_registry_date VARCHAR(20);')
        cur.execute('ALTER TABLE hotel_main ADD COLUMN IF NOT EXISTS rkn_checked_at TIMESTAMP;')
        crawler.db_conn.commit()
        cur.close()

        logger.info(f"📊 Hotels: {len(hotels)}")
        logger.info("💾 Tables: hotel_website_raw, hotel_website_meta")
        logger.info("="*70)

        # Crawl the hotels
        successful = 0
        failed = 0

        for i, hotel in enumerate(hotels, 1):
            logger.info(f"\n[{i}/{len(hotels)}] {'='*35}")

            result = await crawler.crawl_hotel(hotel)

            if result['success']:
                successful += 1
            else:
                failed += 1

        # Summary
        logger.info(f"\n{'='*70}")
        logger.info("📊 SUMMARY:")
        logger.info(f" ✅ Succeeded: {successful}/{len(hotels)}")
        logger.info(f" ✗ Failed: {failed}/{len(hotels)}")
        logger.info("="*70)

    except Exception as e:
        logger.error(f"❌ Fatal error: {e}")
    finally:
        crawler.close_db()


if __name__ == "__main__":
    asyncio.run(main())
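
# How to run — a sketch of the assumed environment (the hotel_main,
# hotel_website_raw and hotel_website_meta tables must already exist in the
# target PostgreSQL database):
#
#     pip install playwright psycopg2-binary beautifulsoup4
#     playwright install chromium
#     python kamchatka_crawler.py   # or whatever this file is saved as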