185 lines
6.5 KiB
Python
185 lines
6.5 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Crawl the website of one specific hotel.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import asyncio
import logging
import os
import re
import sys
from datetime import datetime
from urllib.parse import unquote, urlparse

import psycopg2
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from psycopg2.extras import Json, RealDictCursor
|
|||
|
|
|
|||
|
|
# Database configuration
#
# NOTE(review): the fallback values below embed a real host and credentials in
# the source file. They are kept only so existing deployments keep working;
# supply the HOTEL_DB_* environment variables instead and remove the literals
# (and rotate the password) as soon as possible.
def load_db_config(env=None):
    """Build the keyword arguments passed to ``psycopg2.connect``.

    Args:
        env: Mapping to read overrides from. Defaults to ``os.environ``.
            Recognised keys: ``HOTEL_DB_HOST``, ``HOTEL_DB_PORT``,
            ``HOTEL_DB_NAME``, ``HOTEL_DB_USER``, ``HOTEL_DB_PASSWORD``.
            Missing or empty values fall back to the built-in defaults.

    Returns:
        A dict with the keys ``host``, ``port``, ``database``, ``user`` and
        ``password``. ``port`` is always an ``int``.

    Raises:
        ValueError: If ``HOTEL_DB_PORT`` is set but is not an integer.
    """
    if env is None:
        env = os.environ

    return {
        'host': env.get("HOTEL_DB_HOST") or "147.45.189.234",
        'port': int(env.get("HOTEL_DB_PORT") or 5432),
        'database': env.get("HOTEL_DB_NAME") or "default_db",
        'user': env.get("HOTEL_DB_USER") or "gen_user",
        # The built-in default is stored percent-encoded and decoded here;
        # a password supplied through the environment is used verbatim.
        'password': env.get("HOTEL_DB_PASSWORD") or unquote("2~~9_%5EkVsU%3F2%5CS"),
    }


DB_CONFIG = load_db_config()
|
|||
|
|
|
|||
|
|
# Crawl settings
PAGE_TIMEOUT = 30_000  # passed as `timeout` to page.goto (Playwright timeouts are in milliseconds)
MAX_PAGES_PER_SITE = 10  # per-site page cap; not referenced by the code visible in this file
|
|||
|
|
|
|||
|
|
# Logging: configure the root logger for INFO-level, timestamped records and
# create this module's logger.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TextCleaner:
    """Convert raw HTML into whitespace-normalized plain text."""

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        """Collapse every run of whitespace in *text* into a single space.

        Leading and trailing whitespace is removed. ``str.split()`` with no
        argument splits on any Unicode whitespace, so tabs, line breaks and
        non-breaking spaces are all normalized, not only ASCII spaces.
        """
        return ' '.join(text.split())

    @classmethod
    def clean_text(cls, html: str) -> str:
        """Strip markup from *html* and return its visible text on one line.

        Args:
            html: HTML source of a page. ``None`` or an empty string is
                accepted.

        Returns:
            The text content with ``<script>`` and ``<style>`` elements
            removed and all whitespace runs collapsed to single spaces, or
            ``""`` when *html* is empty.
        """
        if not html:
            return ""

        soup = BeautifulSoup(html, 'html.parser')

        # Remove scripts and styles: their contents are not visible text
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Use an explicit separator so text from adjacent elements
        # (e.g. "<li>A</li><li>B</li>") is not glued together into "AB".
        text = soup.get_text(separator=' ')

        return cls._normalize_whitespace(text)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _record_success(cur, hotel_id, url, html, started_at):
    """Store the fetched page and mark the crawl as completed (no commit)."""
    finished_at = datetime.now()
    # The column is named total_size_bytes, so store the UTF-8 encoded size;
    # len(html) would count characters instead of bytes.
    size_bytes = len(html.encode('utf-8', errors='replace'))

    # Replace any previously stored page for this hotel
    cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
    cur.execute("""
        INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at)
        VALUES (%s, %s, %s, %s)
    """, (hotel_id, url, html, finished_at))

    # Upsert crawl metadata and clear any error left by an earlier attempt
    cur.execute("""
        INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (hotel_id) DO UPDATE SET
            crawl_status = EXCLUDED.crawl_status,
            pages_crawled = EXCLUDED.pages_crawled,
            total_size_bytes = EXCLUDED.total_size_bytes,
            crawl_started_at = EXCLUDED.crawl_started_at,
            crawl_finished_at = EXCLUDED.crawl_finished_at,
            error_message = NULL
    """, (hotel_id, 'completed', 1, size_bytes, started_at, finished_at))

    cur.execute("""
        UPDATE hotel_main
        SET website_status = 'accessible'
        WHERE id = %s
    """, (hotel_id,))


def _record_failure(cur, hotel_id, error, started_at):
    """Upsert a 'failed' crawl record carrying the error text (no commit)."""
    cur.execute("""
        INSERT INTO hotel_website_meta (hotel_id, crawl_status, error_message, crawl_started_at, crawl_finished_at)
        VALUES (%s, %s, %s, %s, %s)
        ON CONFLICT (hotel_id) DO UPDATE SET
            crawl_status = EXCLUDED.crawl_status,
            error_message = EXCLUDED.error_message,
            crawl_started_at = EXCLUDED.crawl_started_at,
            crawl_finished_at = EXCLUDED.crawl_finished_at
    """, (hotel_id, 'failed', str(error), started_at, datetime.now()))


async def crawl_hotel(hotel_id: str) -> None:
    """Crawl the main page of one hotel's website and persist the result.

    Looks the hotel up in ``hotel_main``, loads its ``website_address`` in
    headless Chromium and, on success, stores the HTML in
    ``hotel_website_raw``, upserts ``hotel_website_meta`` with status
    ``completed`` and sets ``hotel_main.website_status`` to ``accessible``.
    If loading or saving fails, the pending transaction is rolled back and a
    ``failed`` record with the error message is upserted instead.

    Progress is reported with ``print``. The function returns early, writing
    nothing, when the hotel does not exist or has no URL.

    Args:
        hotel_id: Value matched against ``hotel_main.id``.
    """
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    cur = conn.cursor()

    try:
        # Fetch the hotel record
        cur.execute("""
            SELECT id, full_name, website_address, region_name
            FROM hotel_main
            WHERE id = %s
        """, (hotel_id,))

        hotel = cur.fetchone()
        if not hotel:
            print(f"❌ Отель с ID {hotel_id} не найден")
            return

        print(f"🏨 Краулим: {hotel['full_name']}")
        print(f"🔗 URL: {hotel['website_address']}")
        print(f"📍 Регион: {hotel['region_name']}")

        url = hotel['website_address']
        if not url:
            print("❌ У отеля нет URL")
            return

        # Prepend a scheme when the stored address has none
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        print(f"🌐 Полный URL: {url}")

        # Launch the browser
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            # Captured before navigation so crawl_started_at reflects the
            # real start instead of duplicating the finish time.
            started_at = datetime.now()

            try:
                # Navigate to the main page
                print("📄 Загружаем главную страницу...")
                await page.goto(url, timeout=PAGE_TIMEOUT)

                html = await page.content()
                cleaned_text = TextCleaner.clean_text(html)

                print(f"✅ Получено {len(html)} символов HTML")
                print(f"📝 Очищено до {len(cleaned_text)} символов текста")

                _record_success(cur, hotel_id, url, html, started_at)
                conn.commit()
                print("✅ Краулинг завершен успешно!")

            except Exception as e:
                print(f"❌ Ошибка краулинга: {e}")

                # A failed SQL statement leaves the transaction aborted and
                # every further command would raise; roll back first. This
                # also discards a partially applied DELETE/INSERT.
                conn.rollback()

                _record_failure(cur, hotel_id, e, started_at)
                conn.commit()

            finally:
                await browser.close()

    finally:
        cur.close()
        conn.close()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Command-line entry point: crawl the hotel whose ID is the sole argument.

    Prints a usage message and exits with status 1 unless exactly one
    command-line argument is supplied.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Использование: python3 single_hotel_crawler.py <hotel_id>")
        sys.exit(1)

    (hotel_id,) = cli_args
    print(f"🚀 Запуск краулинга для отеля: {hotel_id}")

    asyncio.run(crawl_hotel(hotel_id))


if __name__ == "__main__":
    main()
|