Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py
- Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py
- РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py
- Отчёты: create_orel_horizontal_report.py
- Обработка: process_all_hotels_embeddings.py
- Документация: README.md, DB_SCHEMA_REFERENCE.md
This commit is contained in:
184
single_hotel_crawler.py
Normal file
184
single_hotel_crawler.py
Normal file
@@ -0,0 +1,184 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Краулинг одного конкретного отеля
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import psycopg2
|
||||
from psycopg2.extras import Json, RealDictCursor
|
||||
from urllib.parse import unquote, urlparse
|
||||
from playwright.async_api import async_playwright
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import logging
|
||||
from datetime import datetime
|
||||
import sys
|
||||
|
||||
# Database configuration.
# NOTE(review): credentials were hard-coded in source. They can now be
# overridden via environment variables; the original literals are kept as
# defaults for backward compatibility. Consider removing the defaults and
# rotating the password, since it has been committed to version control.
import os

DB_CONFIG = {
    'host': os.environ.get("HOTEL_DB_HOST", "147.45.189.234"),
    'port': int(os.environ.get("HOTEL_DB_PORT", "5432")),
    'database': os.environ.get("HOTEL_DB_NAME", "default_db"),
    'user': os.environ.get("HOTEL_DB_USER", "gen_user"),
    # The password is stored URL-encoded; decode it before use.
    'password': unquote(os.environ.get("HOTEL_DB_PASSWORD", "2~~9_%5EkVsU%3F2%5CS"))
}

# Crawl settings
MAX_PAGES_PER_SITE = 10  # kept for parity with the multi-page crawlers; unused here
PAGE_TIMEOUT = 30000     # Playwright page.goto timeout, in milliseconds

# Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextCleaner:
    """HTML-to-plain-text cleanup helpers."""

    @classmethod
    def clean_text(cls, html: str) -> str:
        """Strip markup from *html* and collapse runs of spaces/newlines.

        Returns an empty string for empty/None input. Script and style
        elements are removed before text extraction.
        """
        if not html:
            return ""

        soup = BeautifulSoup(html, 'html.parser')

        # Drop non-visible content before extracting text.
        for node in soup(["script", "style"]):
            node.decompose()

        raw = soup.get_text()

        # Walk the text line by line, split each line on single spaces,
        # strip every token, drop the empties, and rejoin everything with
        # one space — same collapsing the original generator chain did.
        words = []
        for row in raw.splitlines():
            for token in row.strip().split(" "):
                token = token.strip()
                if token:
                    words.append(token)

        return " ".join(words)
|
||||
|
||||
|
||||
async def crawl_hotel(hotel_id: str):
    """Crawl a single hotel's homepage and persist the raw HTML.

    Looks up the hotel in ``hotel_main`` by *hotel_id*, loads its website
    with headless Chromium, stores the raw HTML in ``hotel_website_raw``,
    upserts crawl metadata into ``hotel_website_meta``, and marks the site
    'accessible'. On any crawl/DB error the failure is recorded in
    ``hotel_website_meta`` instead.

    Args:
        hotel_id: primary key of the hotel row in ``hotel_main``.
    """
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    cur = conn.cursor()

    try:
        # Fetch the hotel record.
        cur.execute("""
            SELECT id, full_name, website_address, region_name
            FROM hotel_main
            WHERE id = %s
        """, (hotel_id,))

        hotel = cur.fetchone()
        if not hotel:
            print(f"❌ Отель с ID {hotel_id} не найден")
            return

        print(f"🏨 Краулим: {hotel['full_name']}")
        print(f"🔗 URL: {hotel['website_address']}")
        print(f"📍 Регион: {hotel['region_name']}")

        url = hotel['website_address']
        if not url:
            print("❌ У отеля нет URL")
            return

        # Prepend a scheme if the stored address has none.
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        print(f"🌐 Полный URL: {url}")

        # Launch the browser.
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            try:
                # Load the homepage.
                print("📄 Загружаем главную страницу...")
                await page.goto(url, timeout=PAGE_TIMEOUT)

                # Grab the HTML and produce a cleaned-text size estimate.
                html = await page.content()
                cleaned_text = TextCleaner.clean_text(html)

                print(f"✅ Получено {len(html)} символов HTML")
                print(f"📝 Очищено до {len(cleaned_text)} символов текста")

                # Replace any previous raw snapshot for this hotel.
                cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))

                # Store the raw page.
                cur.execute("""
                    INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at)
                    VALUES (%s, %s, %s, %s)
                """, (hotel_id, url, html, datetime.now()))

                # Upsert crawl metadata; a success clears any old error.
                cur.execute("""
                    INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        crawl_status = EXCLUDED.crawl_status,
                        pages_crawled = EXCLUDED.pages_crawled,
                        total_size_bytes = EXCLUDED.total_size_bytes,
                        crawl_started_at = EXCLUDED.crawl_started_at,
                        crawl_finished_at = EXCLUDED.crawl_finished_at,
                        error_message = NULL
                """, (hotel_id, 'completed', 1, len(html), datetime.now(), datetime.now()))

                # Mark the hotel's site as reachable.
                cur.execute("""
                    UPDATE hotel_main
                    SET website_status = 'accessible'
                    WHERE id = %s
                """, (hotel_id,))

                conn.commit()
                print("✅ Краулинг завершен успешно!")

            except Exception as e:
                print(f"❌ Ошибка краулинга: {e}")

                # BUGFIX: if the exception came from a failed SQL statement,
                # the transaction is aborted and any further execute() would
                # raise InFailedSqlTransaction — roll back first so the
                # failure record below can actually be written.
                conn.rollback()

                # Record the failure.
                cur.execute("""
                    INSERT INTO hotel_website_meta (hotel_id, crawl_status, error_message, crawl_started_at, crawl_finished_at)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (hotel_id) DO UPDATE SET
                        crawl_status = EXCLUDED.crawl_status,
                        error_message = EXCLUDED.error_message,
                        crawl_started_at = EXCLUDED.crawl_started_at,
                        crawl_finished_at = EXCLUDED.crawl_finished_at
                """, (hotel_id, 'failed', str(e), datetime.now(), datetime.now()))

                conn.commit()

            finally:
                await browser.close()

    finally:
        cur.close()
        conn.close()
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: crawl the hotel whose id is given as argv[1]."""
    args = sys.argv
    if len(args) != 2:
        print("Использование: python3 single_hotel_crawler.py <hotel_id>")
        sys.exit(1)

    target_id = args[1]
    print(f"🚀 Запуск краулинга для отеля: {target_id}")
    asyncio.run(crawl_hotel(target_id))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user