# Project scripts overview:
#   Crawlers: smart_crawler.py, regional_crawler.py
#   Audits: audit_orel_to_excel.py, audit_chukotka_to_excel.py
#   RKN registry checks: check_rkn_registry.py, recheck_unclear_rkn.py
#   Reports: create_orel_horizontal_report.py
#   Processing: process_all_hotels_embeddings.py
#   Documentation: README.md, DB_SCHEMA_REFERENCE.md
#!/usr/bin/env python3
|
||
"""
|
||
Краулинг одного конкретного отеля
|
||
"""
|
||
|
||
import asyncio
import logging
import os
import re
import sys
from datetime import datetime
from urllib.parse import unquote, urlparse

import psycopg2
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from psycopg2.extras import Json, RealDictCursor
|
||
# Конфигурация БД
|
||
DB_CONFIG = {
|
||
'host': "147.45.189.234",
|
||
'port': 5432,
|
||
'database': "default_db",
|
||
'user': "gen_user",
|
||
'password': unquote("2~~9_%5EkVsU%3F2%5CS")
|
||
}
|
||
|
||
# Настройки краулинга
|
||
MAX_PAGES_PER_SITE = 10
|
||
PAGE_TIMEOUT = 30000
|
||
|
||
# Логирование
|
||
logging.basicConfig(
|
||
level=logging.INFO,
|
||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||
)
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class TextCleaner:
|
||
"""Очистка HTML"""
|
||
|
||
@classmethod
|
||
def clean_text(cls, html: str) -> str:
|
||
"""Очистка HTML до чистого текста"""
|
||
if not html:
|
||
return ""
|
||
|
||
soup = BeautifulSoup(html, 'html.parser')
|
||
|
||
# Удаляем скрипты и стили
|
||
for script in soup(["script", "style"]):
|
||
script.decompose()
|
||
|
||
# Получаем текст
|
||
text = soup.get_text()
|
||
|
||
# Очищаем от лишних пробелов и переносов
|
||
lines = (line.strip() for line in text.splitlines())
|
||
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
||
text = ' '.join(chunk for chunk in chunks if chunk)
|
||
|
||
return text
|
||
|
||
|
||
async def crawl_hotel(hotel_id: str):
|
||
"""Краулинг одного отеля"""
|
||
|
||
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
|
||
cur = conn.cursor()
|
||
|
||
try:
|
||
# Получаем данные отеля
|
||
cur.execute("""
|
||
SELECT id, full_name, website_address, region_name
|
||
FROM hotel_main
|
||
WHERE id = %s
|
||
""", (hotel_id,))
|
||
|
||
hotel = cur.fetchone()
|
||
if not hotel:
|
||
print(f"❌ Отель с ID {hotel_id} не найден")
|
||
return
|
||
|
||
print(f"🏨 Краулим: {hotel['full_name']}")
|
||
print(f"🔗 URL: {hotel['website_address']}")
|
||
print(f"📍 Регион: {hotel['region_name']}")
|
||
|
||
url = hotel['website_address']
|
||
if not url:
|
||
print("❌ У отеля нет URL")
|
||
return
|
||
|
||
# Добавляем протокол если нет
|
||
if not url.startswith(('http://', 'https://')):
|
||
url = 'https://' + url
|
||
|
||
print(f"🌐 Полный URL: {url}")
|
||
|
||
# Запускаем браузер
|
||
async with async_playwright() as p:
|
||
browser = await p.chromium.launch(headless=True)
|
||
context = await browser.new_context()
|
||
page = await context.new_page()
|
||
|
||
try:
|
||
# Переходим на главную страницу
|
||
print("📄 Загружаем главную страницу...")
|
||
await page.goto(url, timeout=PAGE_TIMEOUT)
|
||
|
||
# Получаем HTML
|
||
html = await page.content()
|
||
cleaned_text = TextCleaner.clean_text(html)
|
||
|
||
print(f"✅ Получено {len(html)} символов HTML")
|
||
print(f"📝 Очищено до {len(cleaned_text)} символов текста")
|
||
|
||
# Удаляем старую запись если есть
|
||
cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
|
||
|
||
# Сохраняем в hotel_website_raw
|
||
cur.execute("""
|
||
INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at)
|
||
VALUES (%s, %s, %s, %s)
|
||
""", (hotel_id, url, html, datetime.now()))
|
||
|
||
# Обновляем метаданные
|
||
cur.execute("""
|
||
INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at)
|
||
VALUES (%s, %s, %s, %s, %s, %s)
|
||
ON CONFLICT (hotel_id) DO UPDATE SET
|
||
crawl_status = EXCLUDED.crawl_status,
|
||
pages_crawled = EXCLUDED.pages_crawled,
|
||
total_size_bytes = EXCLUDED.total_size_bytes,
|
||
crawl_started_at = EXCLUDED.crawl_started_at,
|
||
crawl_finished_at = EXCLUDED.crawl_finished_at,
|
||
error_message = NULL
|
||
""", (hotel_id, 'completed', 1, len(html), datetime.now(), datetime.now()))
|
||
|
||
# Обновляем статус отеля
|
||
cur.execute("""
|
||
UPDATE hotel_main
|
||
SET website_status = 'accessible'
|
||
WHERE id = %s
|
||
""", (hotel_id,))
|
||
|
||
conn.commit()
|
||
print("✅ Краулинг завершен успешно!")
|
||
|
||
except Exception as e:
|
||
print(f"❌ Ошибка краулинга: {e}")
|
||
|
||
# Сохраняем ошибку
|
||
cur.execute("""
|
||
INSERT INTO hotel_website_meta (hotel_id, crawl_status, error_message, crawl_started_at, crawl_finished_at)
|
||
VALUES (%s, %s, %s, %s, %s)
|
||
ON CONFLICT (hotel_id) DO UPDATE SET
|
||
crawl_status = EXCLUDED.crawl_status,
|
||
error_message = EXCLUDED.error_message,
|
||
crawl_started_at = EXCLUDED.crawl_started_at,
|
||
crawl_finished_at = EXCLUDED.crawl_finished_at
|
||
""", (hotel_id, 'failed', str(e), datetime.now(), datetime.now()))
|
||
|
||
conn.commit()
|
||
|
||
finally:
|
||
await browser.close()
|
||
|
||
finally:
|
||
cur.close()
|
||
conn.close()
|
||
|
||
|
||
def main():
|
||
if len(sys.argv) != 2:
|
||
print("Использование: python3 single_hotel_crawler.py <hotel_id>")
|
||
sys.exit(1)
|
||
|
||
hotel_id = sys.argv[1]
|
||
print(f"🚀 Запуск краулинга для отеля: {hotel_id}")
|
||
|
||
asyncio.run(crawl_hotel(hotel_id))
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|