hotels/single_hotel_crawler.py
#!/usr/bin/env python3
"""
Crawl a single specific hotel.
"""
import asyncio
import psycopg2
from psycopg2.extras import Json, RealDictCursor
from urllib.parse import unquote, urlparse
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import re
import logging
from datetime import datetime
import sys
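# Third-party dependencies (assumed to be installed via pip): playwright,
# psycopg2-binary and beautifulsoup4. Playwright also needs its browser
# binaries, e.g. `playwright install chromium`, before Chromium can be launched.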
# Database configuration
DB_CONFIG = {
    'host': "147.45.189.234",
    'port': 5432,
    'database': "default_db",
    'user': "gen_user",
    'password': unquote("2~~9_%5EkVsU%3F2%5CS")
}
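# The queries below assume roughly the following schema (inferred from the SQL
# in this script; the actual DDL may differ):
#   hotel_main(id, full_name, website_address, region_name, website_status, ...)
#   hotel_website_raw(hotel_id, url, html, crawled_at)
#   hotel_website_meta(hotel_id, crawl_status, pages_crawled, total_size_bytes,
#                      crawl_started_at, crawl_finished_at, error_message)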
# Crawl settings
MAX_PAGES_PER_SITE = 10
PAGE_TIMEOUT = 30000
# Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class TextCleaner:
    """HTML cleanup"""

    @classmethod
    def clean_text(cls, html: str) -> str:
        """Strip HTML down to plain text"""
        if not html:
            return ""
        soup = BeautifulSoup(html, 'html.parser')
        # Remove scripts and styles
        for script in soup(["script", "style"]):
            script.decompose()
        # Extract the text
        text = soup.get_text()
        # Collapse extra whitespace and line breaks
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        return text
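
# Illustrative example:
#   TextCleaner.clean_text("<p>Hello  <b>world</b><script>x()</script></p>")
# returns "Hello world" -- script/style tags are dropped and whitespace is collapsed.
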
async def crawl_hotel(hotel_id: str):
    """Crawl a single hotel"""
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    cur = conn.cursor()
    try:
        # Fetch the hotel record
        cur.execute("""
            SELECT id, full_name, website_address, region_name
            FROM hotel_main
            WHERE id = %s
        """, (hotel_id,))
        hotel = cur.fetchone()
        if not hotel:
            print(f"❌ Hotel with ID {hotel_id} not found")
            return
        print(f"🏨 Crawling: {hotel['full_name']}")
        print(f"🔗 URL: {hotel['website_address']}")
        print(f"📍 Region: {hotel['region_name']}")
        url = hotel['website_address']
        if not url:
            print("The hotel has no URL")
            return
        # Add a protocol if it is missing
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        print(f"🌐 Full URL: {url}")
        # Launch the browser
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()
            try:
                # Navigate to the main page
                print("📄 Loading the main page...")
                await page.goto(url, timeout=PAGE_TIMEOUT)
                # Grab the rendered HTML
                html = await page.content()
                cleaned_text = TextCleaner.clean_text(html)
                print(f"✅ Received {len(html)} characters of HTML")
                print(f"📝 Cleaned down to {len(cleaned_text)} characters of text")
                # Delete the old record if it exists
                cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
                # Save to hotel_website_raw
                cur.execute("""
                    INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at)
                    VALUES (%s, %s, %s, %s)
                """, (hotel_id, url, html, datetime.now()))
                # Update crawl metadata
                cur.execute("""
                    INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        crawl_status = EXCLUDED.crawl_status,
                        pages_crawled = EXCLUDED.pages_crawled,
                        total_size_bytes = EXCLUDED.total_size_bytes,
                        crawl_started_at = EXCLUDED.crawl_started_at,
                        crawl_finished_at = EXCLUDED.crawl_finished_at,
                        error_message = NULL
                """, (hotel_id, 'completed', 1, len(html), datetime.now(), datetime.now()))
                # Update the hotel status
                cur.execute("""
                    UPDATE hotel_main
                    SET website_status = 'accessible'
                    WHERE id = %s
                """, (hotel_id,))
                conn.commit()
                print("✅ Crawl completed successfully!")
            except Exception as e:
                print(f"❌ Crawl error: {e}")
                # Roll back any partial work so the error can still be recorded
                # even if the failure happened inside a database statement
                conn.rollback()
                # Record the error
                cur.execute("""
                    INSERT INTO hotel_website_meta (hotel_id, crawl_status, error_message, crawl_started_at, crawl_finished_at)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        crawl_status = EXCLUDED.crawl_status,
                        error_message = EXCLUDED.error_message,
                        crawl_started_at = EXCLUDED.crawl_started_at,
                        crawl_finished_at = EXCLUDED.crawl_finished_at
                """, (hotel_id, 'failed', str(e), datetime.now(), datetime.now()))
                conn.commit()
            finally:
                await browser.close()
    finally:
        cur.close()
        conn.close()
def main():
    if len(sys.argv) != 2:
        print("Usage: python3 single_hotel_crawler.py <hotel_id>")
        sys.exit(1)
    hotel_id = sys.argv[1]
    print(f"🚀 Starting crawl for hotel: {hotel_id}")
    asyncio.run(crawl_hotel(hotel_id))


if __name__ == "__main__":
    main()
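
# Example invocation (the hotel id below is purely illustrative; pass whatever
# id exists in hotel_main):
#   python3 hotels/single_hotel_crawler.py 12345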