Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py - Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py - РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py - Отчёты: create_orel_horizontal_report.py - Обработка: process_all_hotels_embeddings.py - Документация: README.md, DB_SCHEMA_REFERENCE.md
This commit is contained in:
319
mass_crawler.py
Executable file
319
mass_crawler.py
Executable file
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Массовый краулинг всех отелей в фоновом режиме
|
||||
Обрабатывает все отели с сайтами, которые ещё не скраулены
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import psycopg2
|
||||
from psycopg2.extras import Json
|
||||
from urllib.parse import unquote, urlparse
|
||||
from playwright.async_api import async_playwright
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Set, List, Dict
|
||||
import sys
|
||||
|
||||
# Database configuration.
# NOTE(review): credentials are hard-coded in source — move them to
# environment variables or a secrets store before sharing/deploying.
DB_CONFIG = {
    'host': "147.45.189.234",
    'port': 5432,
    'database': "default_db",
    'user': "gen_user",
    # The password is stored URL-encoded; unquote() decodes it at import time.
    'password': unquote("2~~9_%5EkVsU%3F2%5CS")
}

# Crawl settings
MAX_PAGES_PER_SITE = 20  # Maximum pages fetched from a single site
PAGE_TIMEOUT = 20000  # Per-page navigation timeout in ms (reduced from 30s to 20s)
BATCH_SIZE = 50  # Hotels are processed in batches of 50
MAX_CONCURRENT = 5  # Simultaneous browser contexts (raised from 3 to 5)

# Logging: one timestamped log file per run, mirrored to the console.
log_filename = f'mass_crawler_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextCleaner:
    """Strip an HTML document down to its readable text."""

    @classmethod
    def clean_html(cls, html: str) -> str:
        """Return the visible text of *html*, one trimmed line per text line.

        Removes <script>/<style>/<noscript> content, collapses runs of
        horizontal whitespace within lines, and drops blank lines.
        Returns "" for empty or falsy input.
        """
        if not html:
            return ""

        soup = BeautifulSoup(html, 'html.parser')

        # Drop non-visible content entirely.
        for tag in soup.find_all(['script', 'style', 'noscript']):
            tag.decompose()

        text = soup.get_text()

        # BUG FIX: the previous code ran re.sub(r'\s+', ' ', text) first,
        # which collapsed newlines too — flattening everything onto one
        # line and making the per-line cleanup below dead code.  Collapse
        # only spaces/tabs so line structure survives.
        text = re.sub(r'[ \t]+', ' ', text)
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        return '\n'.join(lines)
|
||||
|
||||
|
||||
async def crawl_hotel(hotel: Dict, semaphore: asyncio.Semaphore, browser) -> bool:
    """Crawl one hotel website and persist the result.

    Fetches the main page plus up to MAX_PAGES_PER_SITE - 1 same-domain
    pages in a fresh browser context, cleans each page's HTML, and saves
    everything via save_to_db().  Concurrency is bounded by *semaphore*.

    Args:
        hotel: row dict with 'id', 'full_name', 'website_address', 'region_name'.
        semaphore: caps the number of simultaneously open browser contexts.
        browser: a running Playwright Browser instance.

    Returns:
        True if at least one page was crawled and saved, False otherwise.
    """
    async with semaphore:
        hotel_id = hotel['id']
        hotel_name = hotel['full_name']
        website = hotel['website_address']
        region = hotel['region_name']

        logger.info(f"🏨 Начинаю краулинг: {hotel_name} ({region})")
        logger.info(f" URL: {website}")

        try:
            # Normalize the URL — the DB stores bare domains as well.
            if not website.startswith(('http://', 'https://')):
                website = 'https://' + website

            # Initialized before the context so the save step below never
            # sees an unbound name if context/page creation fails.
            visited_urls = set()
            pages_data = []

            context = await browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                viewport={'width': 1920, 'height': 1080}
            )
            # BUG FIX: close the context even when an unexpected error is
            # raised mid-crawl — previously it leaked on the error path
            # because the outer except skipped the unconditional close().
            try:
                page = await context.new_page()

                # Main page first; internal links are discovered from it.
                try:
                    response = await page.goto(website, wait_until='domcontentloaded', timeout=PAGE_TIMEOUT)

                    if response and response.ok:
                        await page.wait_for_timeout(2000)  # give client-side JS a chance to render

                        html = await page.content()
                        cleaned_text = TextCleaner.clean_html(html)

                        pages_data.append({
                            'url': page.url,
                            'html': html,
                            'text': cleaned_text,
                            'status': response.status
                        })
                        visited_urls.add(page.url)

                        logger.info(f" ✅ Главная: {len(cleaned_text)} символов")

                        # Collect candidate links from the rendered page.
                        links = await page.eval_on_selector_all(
                            'a[href]',
                            '''elements => elements.map(e => e.href).filter(h => h && h.startsWith('http'))'''
                        )

                        # Keep only same-domain links not already visited,
                        # capped to leave room for the main page.
                        base_domain = urlparse(website).netloc
                        internal_links = [
                            link for link in links
                            if urlparse(link).netloc == base_domain and link not in visited_urls
                        ][:MAX_PAGES_PER_SITE - 1]

                        logger.info(f" 📄 Найдено {len(internal_links)} внутренних ссылок")

                        # Walk the internal pages up to the per-site cap.
                        for link in internal_links:
                            if len(pages_data) >= MAX_PAGES_PER_SITE:
                                break

                            try:
                                response = await page.goto(link, wait_until='domcontentloaded', timeout=PAGE_TIMEOUT)

                                if response and response.ok:
                                    await page.wait_for_timeout(1000)

                                    html = await page.content()
                                    cleaned_text = TextCleaner.clean_html(html)

                                    pages_data.append({
                                        'url': page.url,
                                        'html': html,
                                        'text': cleaned_text,
                                        'status': response.status
                                    })
                                    visited_urls.add(page.url)

                                    logger.info(f" ✅ Страница {len(pages_data)}: {len(cleaned_text)} символов")

                            except Exception as e:
                                # One broken page must not abort the whole site.
                                logger.warning(f" ⚠️ Ошибка страницы {link}: {e}")
                                continue

                    else:
                        logger.warning(f" ⚠️ Главная страница недоступна: {response.status if response else 'No response'}")

                except Exception as e:
                    logger.error(f" ❌ Ошибка загрузки главной: {e}")
            finally:
                await context.close()

            # Persist whatever was fetched (best-effort — see save_to_db).
            if pages_data:
                save_to_db(hotel_id, hotel_name, region, website, pages_data)
                logger.info(f" 💾 Сохранено {len(pages_data)} страниц для {hotel_name}")
                return True
            else:
                logger.warning(f" ⚠️ Нет данных для {hotel_name}")
                return False

        except Exception as e:
            logger.error(f" ❌ Критическая ошибка для {hotel_name}: {e}")
            return False
|
||||
|
||||
|
||||
def save_to_db(hotel_id: str, hotel_name: str, region: str, website: str, pages_data: List[Dict]):
    """Persist crawl results for one hotel into PostgreSQL.

    Upserts one hotel_website_meta row, then for every crawled page upserts
    the raw HTML (hotel_website_raw) and the cleaned text
    (hotel_website_processed), all in a single transaction.  Errors are
    logged and swallowed — callers treat saving as best-effort.
    NOTE(review): hotel_name/region are accepted but not stored here —
    presumably kept for the callers' logging; confirm before removing.
    """
    conn = None
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        cur = conn.cursor()

        # urlparse comes from the module-level import; the redundant
        # function-local `from urllib.parse import urlparse` was removed.
        domain = urlparse(website).netloc

        # Site-level metadata (one row per hotel).
        cur.execute("""
            INSERT INTO hotel_website_meta (hotel_id, domain, main_url, pages_crawled, crawl_status, crawl_finished_at)
            VALUES (%s, %s, %s, %s, %s, NOW())
            ON CONFLICT (hotel_id) DO UPDATE SET
                pages_crawled = EXCLUDED.pages_crawled,
                crawl_status = EXCLUDED.crawl_status,
                crawl_finished_at = EXCLUDED.crawl_finished_at
        """, (hotel_id, domain, website, len(pages_data), 'completed'))

        # Raw HTML and cleaned text — single pass instead of two loops.
        for page in pages_data:
            cur.execute("""
                INSERT INTO hotel_website_raw (hotel_id, url, html, status_code, crawled_at)
                VALUES (%s, %s, %s, %s, NOW())
                ON CONFLICT ON CONSTRAINT hotel_website_raw_hotel_id_url_key DO UPDATE SET
                    html = EXCLUDED.html,
                    status_code = EXCLUDED.status_code,
                    crawled_at = EXCLUDED.crawled_at
            """, (hotel_id, page['url'], page['html'], page['status']))

            cur.execute("""
                INSERT INTO hotel_website_processed (hotel_id, url, cleaned_text, processed_at)
                VALUES (%s, %s, %s, NOW())
                ON CONFLICT (hotel_id, url) DO UPDATE SET
                    cleaned_text = EXCLUDED.cleaned_text,
                    processed_at = EXCLUDED.processed_at
            """, (hotel_id, page['url'], page['text']))

        conn.commit()
        cur.close()
    except Exception as e:
        logger.error(f"❌ Ошибка сохранения в БД: {e}")
    finally:
        # BUG FIX: the connection used to leak when an execute raised.
        if conn is not None:
            conn.close()
|
||||
|
||||
|
||||
def get_unprocessed_hotels(limit: int = None) -> List[Dict]:
    """Return hotels that have a website but no crawl metadata yet.

    Args:
        limit: optional cap on the number of rows returned (falsy = no cap).

    Returns:
        List of dicts with keys 'id', 'full_name', 'region_name',
        'website_address', ordered by region then hotel name.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        cur = conn.cursor()

        query = """
            SELECT id, full_name, region_name, website_address
            FROM hotel_main
            WHERE website_address IS NOT NULL
              AND website_address != ''
              AND id NOT IN (SELECT hotel_id FROM hotel_website_meta)
            ORDER BY region_name, full_name
        """
        params = None
        if limit:
            # BUG FIX: LIMIT is now a bound parameter instead of being
            # f-string-interpolated into the SQL text.
            query += " LIMIT %s"
            params = (limit,)

        cur.execute(query, params)

        hotels = [
            {
                'id': row[0],
                'full_name': row[1],
                'region_name': row[2],
                'website_address': row[3],
            }
            for row in cur.fetchall()
        ]

        cur.close()
        return hotels
    finally:
        # BUG FIX: connection is now closed even if execute/fetch raises.
        conn.close()
|
||||
|
||||
|
||||
async def main():
    """Entry point: crawl every hotel that has not been processed yet."""
    logger.info("🚀 Запуск массового краулинга")

    # Work queue: hotels with a website and no crawl metadata row.
    hotels = get_unprocessed_hotels()
    total = len(hotels)

    logger.info(f"📊 Найдено необработанных отелей: {total}")

    if not hotels:
        logger.info("✅ Все отели уже обработаны!")
        return

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)

        semaphore = asyncio.Semaphore(MAX_CONCURRENT)

        num_batches = (total + BATCH_SIZE - 1) // BATCH_SIZE
        processed = 0
        success = 0

        # Crawl in fixed-size batches so progress is visible in the log.
        for batch_idx in range(num_batches):
            start = batch_idx * BATCH_SIZE
            batch = hotels[start:start + BATCH_SIZE]

            logger.info(f"\n📦 Обработка пачки {batch_idx + 1}/{num_batches}")
            logger.info(f" Отели {start+1}-{min(start+BATCH_SIZE, total)} из {total}")

            outcomes = await asyncio.gather(
                *(crawl_hotel(hotel, semaphore, browser) for hotel in batch),
                return_exceptions=True
            )

            ok_count = sum(1 for outcome in outcomes if outcome is True)
            success += ok_count
            processed += len(batch)

            logger.info(f"✅ Пачка завершена: {ok_count}/{len(batch)} успешно")
            logger.info(f"📊 Общий прогресс: {processed}/{total} ({processed*100//total}%)")

        await browser.close()

    logger.info(f"\n🎉 КРАУЛИНГ ЗАВЕРШЁН!")
    logger.info(f" Всего обработано: {processed}")
    logger.info(f" Успешно: {success} ({success*100//processed}%)")
    logger.info(f" Ошибок: {processed - success}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the async entry point; Ctrl-C exits cleanly with status 0.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("\n⚠️ Прервано пользователем")
        sys.exit(0)
|
||||
|
||||
Reference in New Issue
Block a user