274 lines
9.6 KiB
Python
274 lines
9.6 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Региональный краулер для массового сбора сайтов отелей
|
|||
|
|
Параллельная версия с поддержкой указания региона
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import asyncio
import logging
import os
import sys
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import unquote

import psycopg2
from playwright.async_api import Page, async_playwright
from psycopg2.extras import RealDictCursor

# Database configuration
|
|||
|
|
|
|||
|
|
# Database connection settings.
# NOTE(security): credentials were hardcoded in source. They are kept here as
# defaults for backward compatibility, but each field can now be overridden
# via environment variables so the real password can be moved out of the repo.
DB_CONFIG = {
    'host': os.environ.get('HOTEL_DB_HOST', "147.45.189.234"),
    'port': int(os.environ.get('HOTEL_DB_PORT', 5432)),
    'database': os.environ.get('HOTEL_DB_NAME', "default_db"),
    'user': os.environ.get('HOTEL_DB_USER', "gen_user"),
    # Stored value is percent-encoded; unquote() yields the literal password.
    'password': os.environ.get('HOTEL_DB_PASSWORD', unquote('2~~9_%5EkVsU%3F2%5CS')),
}
|
|||
|
|
|
|||
|
|
# Настройка логирования
|
|||
|
|
def setup_logging(region_name: str):
    """Configure root logging to a per-region file plus stdout.

    The log file name is derived from the region (spaces become underscores,
    dots are dropped). Returns this module's logger.
    """
    safe_region = region_name.replace(' ', '_').replace('.', '')
    log_filename = f"crawler_{safe_region}.log"

    file_handler = logging.FileHandler(log_filename, encoding='utf-8')
    console_handler = logging.StreamHandler()

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[file_handler, console_handler],
    )
    return logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
def get_hotels_to_crawl(region_name: str, limit: Optional[int] = None) -> List[Dict]:
    """Fetch unprocessed hotels of the given region.

    A hotel qualifies when it has a non-empty website address, no raw pages
    stored yet (no hotel_website_raw row) and no recorded error in
    hotel_website_meta.

    Args:
        region_name: exact value of hotel_main.region_name to match.
        limit: optional cap on the number of rows returned (falsy = no cap,
            matching the original behavior).

    Returns:
        List of RealDictCursor rows (dict-like), ordered by hotel name.
    """
    query = """
        SELECT
            h.id,
            h.full_name,
            h.region_name,
            h.website_address,
            hwm.error_message
        FROM hotel_main h
        LEFT JOIN hotel_website_raw hwr ON hwr.hotel_id = h.id
        LEFT JOIN hotel_website_meta hwm ON hwm.hotel_id = h.id
        WHERE h.website_address IS NOT NULL
          AND h.website_address != ''
          AND h.region_name = %s
          AND hwr.hotel_id IS NULL
          AND (hwm.error_message IS NULL OR hwm.error_message = '')
        ORDER BY h.full_name
    """
    params = [region_name]

    if limit:
        # Parameterized LIMIT instead of f-string interpolation — avoids any
        # chance of SQL injection and lets the driver handle quoting.
        query += " LIMIT %s"
        params.append(limit)

    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    try:
        with conn.cursor() as cur:
            cur.execute(query, tuple(params))
            return cur.fetchall()
    finally:
        # Close even if execute() raises — the original leaked the connection
        # on error.
        conn.close()
|
|||
|
|
|
|||
|
|
def mark_hotel_failed(hotel_id: str, error_message: str):
    """Record a crawl failure for a hotel (upsert into hotel_website_meta).

    Best-effort: any database error is logged and swallowed so bookkeeping
    never aborts the crawl loop.
    """
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO hotel_website_meta (hotel_id, error_message, updated_at)
                    VALUES (%s, %s, NOW())
                    ON CONFLICT (hotel_id)
                    DO UPDATE SET
                        error_message = EXCLUDED.error_message,
                        updated_at = NOW()
                """, (hotel_id, error_message))
            conn.commit()
        finally:
            # Close even when execute/commit raises — the original leaked the
            # connection on error.
            conn.close()
    except Exception as e:
        logging.error(f"Ошибка пометки отеля как failed: {e}")
|
|||
|
|
|
|||
|
|
def save_to_db(hotel_id: str, website_url: str, pages_data: List[Dict]) -> bool:
    """Persist crawled pages into PostgreSQL.

    Inserts one hotel_website_raw row per page and upserts hotel_website_meta
    with the page count, clearing any previous error_message.

    Args:
        hotel_id: hotel identifier.
        website_url: site root URL (currently unused here; kept for interface
            compatibility with callers).
        pages_data: list of dicts with 'url' and 'html' keys.

    Returns:
        True on success, False if any database error occurred (logged).
    """
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        try:
            with conn.cursor() as cur:
                # Store every crawled page in hotel_website_raw.
                for page in pages_data:
                    cur.execute("""
                        INSERT INTO hotel_website_raw (hotel_id, url, html, created_at)
                        VALUES (%s, %s, %s, NOW())
                    """, (hotel_id, page['url'], page['html']))

                # Upsert metadata: new page count, reset any old error.
                cur.execute("""
                    INSERT INTO hotel_website_meta (hotel_id, pages_crawled, updated_at)
                    VALUES (%s, %s, NOW())
                    ON CONFLICT (hotel_id)
                    DO UPDATE SET
                        pages_crawled = EXCLUDED.pages_crawled,
                        error_message = NULL,
                        updated_at = NOW()
                """, (hotel_id, len(pages_data)))
            conn.commit()
            return True
        finally:
            # Close even on failure — the original leaked the connection when
            # an execute raised.
            conn.close()
    except Exception as e:
        logging.error(f"Ошибка сохранения в БД: {e}")
        return False
|
|||
|
|
|
|||
|
|
async def crawl_hotel(hotel: Dict, logger) -> bool:
    """Crawl one hotel's website with Playwright and persist the raw HTML.

    Loads the main page, collects up to 14 same-origin internal links, fetches
    each, then saves all pages via save_to_db(). Failures are recorded with
    mark_hotel_failed().

    Args:
        hotel: row dict with 'id', 'full_name', 'region_name',
            'website_address' keys (shape as returned by get_hotels_to_crawl).
        logger: logger used for progress/status output.

    Returns:
        True when pages were crawled and saved; False on any failure.
    """
    hotel_id = str(hotel['id'])
    website = hotel['website_address'].strip()
    hotel_name = hotel['full_name']
    region = hotel['region_name']

    logger.info(f"🏨 {hotel_name} ({region})")
    logger.info(f" URL: {website}")

    # URL normalization: prefix bare domains with https://.
    if not website.startswith('http'):
        website = f"https://{website}"

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            )

            page = await context.new_page()

            # Load the main page; a fixed 2s pause lets late JS render.
            try:
                await page.goto(website, wait_until='domcontentloaded', timeout=30000)
                await page.wait_for_timeout(2000)
            except Exception as e:
                error_msg = str(e)[:200]
                logger.warning(f" ❌ Ошибка загрузки: {error_msg}")
                mark_hotel_failed(hotel_id, error_msg)
                await browser.close()
                return False

            # Detect parked domains (reg.ru parking page or its marker text).
            if page.url.startswith('https://www.reg.ru/domain/') or 'Домен припаркован' in await page.content():
                logger.warning(f" ⚠️ Домен припаркован")
                mark_hotel_failed(hotel_id, "Domain parked")
                await browser.close()
                return False

            # Capture the main page.
            main_html = await page.content()
            main_text_length = len(await page.inner_text('body'))
            logger.info(f" ✅ Главная: {main_text_length} символов")

            pages_data = [{
                'url': page.url,
                'html': main_html
            }]

            # Collect unique same-origin links (anchors with '#' excluded),
            # capped at 14 inside the browser to limit crawl size.
            internal_links = await page.evaluate("""
                () => {
                    const links = Array.from(document.querySelectorAll('a[href]'));
                    const baseUrl = window.location.origin;
                    return [...new Set(
                        links
                            .map(a => a.href)
                            .filter(href => href.startsWith(baseUrl) && !href.includes('#'))
                    )].slice(0, 14);
                }
            """)

            logger.info(f" 📄 Найдено {len(internal_links)} внутренних ссылок")

            # Visit each internal page; individual failures are skipped so one
            # broken link doesn't abort the whole hotel.
            for link in internal_links[:14]:
                try:
                    await page.goto(link, wait_until='domcontentloaded', timeout=15000)
                    await page.wait_for_timeout(500)

                    link_html = await page.content()
                    pages_data.append({
                        'url': page.url,
                        'html': link_html
                    })
                except Exception:
                    continue

            await browser.close()

            # Persist everything to the database.
            if save_to_db(hotel_id, website, pages_data):
                logger.info(f" 💾 Сохранено {len(pages_data)} страниц")
                return True
            else:
                logger.error(f" ❌ Ошибка сохранения в БД")
                return False

    except Exception as e:
        # Catch-all boundary: record the failure and keep the crawl loop alive.
        logger.error(f" ❌ Критическая ошибка: {e}")
        mark_hotel_failed(hotel_id, str(e)[:200])
        return False
|
|||
|
|
|
|||
|
|
async def main(region_name: str):
    """Entry point: sequentially crawl every pending hotel of one region."""
    logger = setup_logging(region_name)

    logger.info(f"🚀 ЗАПУСК РЕГИОНАЛЬНОГО КРАУЛЕРА")
    logger.info(f"🌍 Регион: {region_name}")
    logger.info("="*60)

    # Fetch the work queue for this region.
    hotels = get_hotels_to_crawl(region_name)
    if not hotels:
        logger.info(f"✅ Все отели региона {region_name} уже обработаны!")
        return

    total = len(hotels)
    logger.info(f"📊 Найдено отелей для обработки: {total}\n")

    # Process hotels one by one, tallying outcomes.
    success_count = 0
    error_count = 0

    for position, hotel in enumerate(hotels, 1):
        logger.info(f"\n[{position}/{total}] " + "="*50)

        if await crawl_hotel(hotel, logger):
            success_count += 1
        else:
            error_count += 1

        # Short polite pause between hotels.
        await asyncio.sleep(1)

    # Final statistics.
    logger.info("\n" + "="*60)
    logger.info(f"✅ КРАУЛИНГ ЗАВЕРШЁН")
    logger.info(f"📊 Обработано: {success_count}/{total}")
    logger.info(f"❌ Ошибок: {error_count}")
    logger.info("="*60)
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Require exactly one CLI argument: the region name.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Использование: python3 regional_crawler.py 'Название региона'")
        print("Пример: python3 regional_crawler.py 'г. Москва'")
        sys.exit(1)

    asyncio.run(main(cli_args[0]))
|
|||
|
|
|