Files
hotels/single_hotel_crawler.py
Фёдор 0cf3297290 Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py
- Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py
- РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py
- Отчёты: create_orel_horizontal_report.py
- Обработка: process_all_hotels_embeddings.py
- Документация: README.md, DB_SCHEMA_REFERENCE.md
2025-10-16 10:52:09 +03:00

185 lines
6.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Краулинг одного конкретного отеля
"""
import asyncio
import psycopg2
from psycopg2.extras import Json, RealDictCursor
from urllib.parse import unquote, urlparse
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import re
import logging
from datetime import datetime
import sys
# Database configuration.
# NOTE(review): credentials are hardcoded in source — move them to environment
# variables or a secrets store before sharing or deploying this script.
DB_CONFIG = {
    'host': "147.45.189.234",
    'port': 5432,
    'database': "default_db",
    'user': "gen_user",
    # The password is stored URL-encoded; unquote() decodes it to the raw value.
    'password': unquote("2~~9_%5EkVsU%3F2%5CS")
}

# Crawl settings
MAX_PAGES_PER_SITE = 10  # NOTE(review): declared but unused here — this script crawls a single page
PAGE_TIMEOUT = 30000     # Playwright navigation timeout, in milliseconds

# Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class TextCleaner:
    """Strip HTML down to plain, whitespace-normalized text."""

    @classmethod
    def clean_text(cls, html: str) -> str:
        """Convert an HTML document to clean visible text.

        Removes <script> and <style> elements, extracts the remaining text
        and collapses every run of whitespace (spaces, tabs, newlines) into
        a single space.

        Args:
            html: Raw HTML markup; falsy values yield "".

        Returns:
            The cleaned, single-spaced text content.
        """
        if not html:
            return ""
        soup = BeautifulSoup(html, 'html.parser')
        # Drop non-visible content before extracting text.
        for element in soup(["script", "style"]):
            element.decompose()
        text = soup.get_text()
        # str.split() with no separator splits on ANY whitespace run, so this
        # also collapses tabs/form feeds that the previous per-line
        # space-splitting left untouched, while producing the same result
        # for plain spaces and newlines.
        return ' '.join(text.split())
async def crawl_hotel(hotel_id: str) -> None:
    """Crawl a single hotel's website homepage and persist the result.

    Looks the hotel up in ``hotel_main``, fetches its homepage with a
    headless Chromium browser, stores the raw HTML in ``hotel_website_raw``
    and upserts crawl metadata into ``hotel_website_meta``. On failure the
    error is recorded in ``hotel_website_meta`` instead and the hotel row
    is left untouched.

    Args:
        hotel_id: Primary key of the hotel in ``hotel_main``.
    """
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    cur = conn.cursor()
    try:
        # Fetch the hotel record
        cur.execute("""
            SELECT id, full_name, website_address, region_name
            FROM hotel_main
            WHERE id = %s
        """, (hotel_id,))
        hotel = cur.fetchone()
        if not hotel:
            print(f"❌ Отель с ID {hotel_id} не найден")
            return
        print(f"🏨 Краулим: {hotel['full_name']}")
        print(f"🔗 URL: {hotel['website_address']}")
        print(f"📍 Регион: {hotel['region_name']}")
        url = hotel['website_address']
        if not url:
            print("У отеля нет URL")
            return
        # Prepend a scheme if the stored address lacks one
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        print(f"🌐 Полный URL: {url}")
        # Capture the real start time so crawl_started_at is accurate
        # (previously both timestamps were taken at write time).
        started_at = datetime.now()
        # Launch the browser and fetch the homepage
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()
            try:
                print("📄 Загружаем главную страницу...")
                await page.goto(url, timeout=PAGE_TIMEOUT)
                html = await page.content()
                cleaned_text = TextCleaner.clean_text(html)
                print(f"✅ Получено {len(html)} символов HTML")
                print(f"📝 Очищено до {len(cleaned_text)} символов текста")
                # Replace any previous snapshot of this hotel's page
                cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
                cur.execute("""
                    INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at)
                    VALUES (%s, %s, %s, %s)
                """, (hotel_id, url, html, datetime.now()))
                # Upsert crawl metadata (clears any previous error_message)
                cur.execute("""
                    INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        crawl_status = EXCLUDED.crawl_status,
                        pages_crawled = EXCLUDED.pages_crawled,
                        total_size_bytes = EXCLUDED.total_size_bytes,
                        crawl_started_at = EXCLUDED.crawl_started_at,
                        crawl_finished_at = EXCLUDED.crawl_finished_at,
                        error_message = NULL
                """, (hotel_id, 'completed', 1, len(html), started_at, datetime.now()))
                # Mark the hotel's website as reachable
                cur.execute("""
                    UPDATE hotel_main
                    SET website_status = 'accessible'
                    WHERE id = %s
                """, (hotel_id,))
                conn.commit()
                print("✅ Краулинг завершен успешно!")
            except Exception as e:
                print(f"❌ Ошибка краулинга: {e}")
                # Bug fix: roll back first. If the failure above was a DB
                # error the transaction is aborted, and any further
                # execute() on this connection would raise
                # InFailedSqlTransaction instead of recording the error.
                conn.rollback()
                # Record the failure in the metadata table
                cur.execute("""
                    INSERT INTO hotel_website_meta (hotel_id, crawl_status, error_message, crawl_started_at, crawl_finished_at)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        crawl_status = EXCLUDED.crawl_status,
                        error_message = EXCLUDED.error_message,
                        crawl_started_at = EXCLUDED.crawl_started_at,
                        crawl_finished_at = EXCLUDED.crawl_finished_at
                """, (hotel_id, 'failed', str(e), started_at, datetime.now()))
                conn.commit()
            finally:
                await browser.close()
    finally:
        cur.close()
        conn.close()
def main():
    """CLI entry point: expects exactly one argument, the hotel_id to crawl."""
    args = sys.argv[1:]
    if len(args) != 1:
        print("Использование: python3 single_hotel_crawler.py <hotel_id>")
        sys.exit(1)
    hotel_id = args[0]
    print(f"🚀 Запуск краулинга для отеля: {hotel_id}")
    asyncio.run(crawl_hotel(hotel_id))


if __name__ == "__main__":
    main()