# Related project scripts:
#   Crawlers: smart_crawler.py, regional_crawler.py
#   Audits: audit_orel_to_excel.py, audit_chukotka_to_excel.py
#   RKN registry checks: check_rkn_registry.py, recheck_unclear_rkn.py
#   Reports: create_orel_horizontal_report.py
#   Processing: process_all_hotels_embeddings.py
#   Documentation: README.md, DB_SCHEMA_REFERENCE.md
#!/usr/bin/env python3
"""
Re-scan hotels for which exactly 10 pages were collected (the old per-site limit).

Now we collect up to 20 pages from each site.
"""

import asyncio
import logging
import re
import sys
from datetime import datetime
from typing import Dict, List, Set
from urllib.parse import unquote, urlparse

import psycopg2
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from psycopg2.extras import Json

# Database configuration.
# NOTE(security): credentials are hard-coded in source; move them to
# environment variables or a secrets store.
DB_CONFIG = {
    'host': "147.45.189.234",
    'port': 5432,
    'database': "default_db",
    'user': "gen_user",
    # Stored URL-encoded; unquote() yields the actual password.
    'password': unquote("2~~9_%5EkVsU%3F2%5CS")
}

# Crawl settings
MAX_PAGES_PER_SITE = 20   # new per-site page limit (the old limit was 10)
PAGE_TIMEOUT = 20000      # per-page navigation timeout, milliseconds
MAX_CONCURRENT = 3        # kept low so we don't starve the main crawler

# Logging: one timestamped file per run, mirrored to stdout.
log_filename = f'rescan_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

class TextCleaner:
    """Utilities for stripping HTML down to plain visible text."""

    @classmethod
    def clean_html(cls, html: str) -> str:
        """Return the visible text of *html* with whitespace collapsed.

        script/style/meta/link/noscript tags are removed entirely before
        text extraction, since their content is never user-visible.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Drop tags that carry no visible text.
        for tag in soup(['script', 'style', 'meta', 'link', 'noscript']):
            tag.decompose()

        text = soup.get_text(separator=' ', strip=True)
        text = re.sub(r'\s+', ' ', text)  # collapse any whitespace runs to single spaces
        return text.strip()

def get_hotels_to_rescan() -> List[Dict]:
    """Return hotels whose previous crawl stored exactly 10 pages.

    Those are the sites that hit the old per-site limit and may have more
    content worth collecting now that the limit is 20.

    Returns:
        List of dicts: {'hotel_id': int, 'name': str, 'website': str}.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # Cursor context manager + finally: the connection is released even
        # if the query raises (the original leaked it on errors).
        with conn.cursor() as cur:
            cur.execute('''
                SELECT DISTINCT p.hotel_id, m.full_name, m.website_address
                FROM hotel_website_processed p
                JOIN hotel_main m ON p.hotel_id = m.id
                WHERE p.hotel_id IN (
                    SELECT hotel_id
                    FROM hotel_website_processed
                    GROUP BY hotel_id
                    HAVING COUNT(*) = 10
                )
                ORDER BY p.hotel_id
            ''')
            rows = cur.fetchall()
    finally:
        conn.close()

    return [{'hotel_id': r[0], 'name': r[1], 'website': r[2]} for r in rows]

async def _collect_internal_links(page, base_url: str, base_domain: str,
                                  pages_data: List[Dict]) -> List[str]:
    """Gather same-domain links from *page*, deduplicated, order-preserving,
    capped at MAX_PAGES_PER_SITE - 1 (one slot is taken by the main page)."""
    links = await page.evaluate('''() => {
        return Array.from(document.querySelectorAll('a[href]'))
            .map(a => a.href)
            .filter(href => href && !href.startsWith('mailto:') && !href.startsWith('tel:'))
    }''')

    # Keep only links pointing at the same domain (or relative ones).
    seen_urls = {p['url'] for p in pages_data}
    internal_links = []
    for link in links:
        parsed = urlparse(link)
        if parsed.netloc == base_domain or not parsed.netloc:
            # Strip query/fragment and trailing slash to normalize.
            clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}".rstrip('/')
            if clean_url != base_url.rstrip('/') and clean_url not in seen_urls:
                internal_links.append(clean_url)

    # dict.fromkeys keeps first-seen order while removing duplicates.
    return list(dict.fromkeys(internal_links))[:MAX_PAGES_PER_SITE - 1]


def _save_pages(hotel_id: int, pages_data: List[Dict]) -> None:
    """Replace all stored pages of *hotel_id* with *pages_data*.

    Delete and insert run in a single transaction, so old data is only
    removed once the fresh crawl has actually succeeded. (The original
    deleted up-front and lost data whenever the crawl later failed.)
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            cur.execute("DELETE FROM hotel_website_processed WHERE hotel_id = %s", (hotel_id,))
            cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))

            for page_data in pages_data:
                # Raw HTML first; RETURNING id links the processed row to it.
                cur.execute("""
                    INSERT INTO hotel_website_raw (hotel_id, url, html, status_code, crawled_at)
                    VALUES (%s, %s, %s, %s, NOW())
                    ON CONFLICT (hotel_id, url) DO UPDATE
                    SET html = EXCLUDED.html, status_code = EXCLUDED.status_code, crawled_at = NOW()
                    RETURNING id
                """, (hotel_id, page_data['url'], page_data['html'], page_data['status']))
                raw_id = cur.fetchone()[0]

                cur.execute("""
                    INSERT INTO hotel_website_processed
                    (raw_page_id, hotel_id, url, cleaned_text, text_length, processed_at)
                    VALUES (%s, %s, %s, %s, %s, NOW())
                    ON CONFLICT (hotel_id, url) DO UPDATE
                    SET cleaned_text = EXCLUDED.cleaned_text, text_length = EXCLUDED.text_length, processed_at = NOW()
                """, (raw_id, hotel_id, page_data['url'], page_data['text'], len(page_data['text'])))
        conn.commit()
    finally:
        conn.close()


async def crawl_hotel(hotel: Dict, semaphore: asyncio.Semaphore, playwright) -> bool:
    """Re-crawl one hotel website, collecting up to MAX_PAGES_PER_SITE pages.

    Args:
        hotel: {'hotel_id', 'name', 'website'} as produced by get_hotels_to_rescan().
        semaphore: limits the number of concurrent browser sessions.
        playwright: an active async_playwright instance.

    Returns:
        True on success, False if the site is missing/unreachable or the
        crawl raised.
    """
    async with semaphore:
        hotel_id = hotel['hotel_id']
        hotel_name = hotel['name']
        website = hotel['website']

        if not website:
            logger.warning(f" ⚠️ Нет сайта для {hotel_name}")
            return False

        logger.info(f"🏨 Пересканирую: {hotel_name}")
        logger.info(f" URL: {website}")

        browser = await playwright.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        )

        try:
            page = await context.new_page()

            # Normalize the URL: assume https when no scheme is given.
            if website.startswith(('http://', 'https://')):
                base_url = website
            else:
                base_url = f'https://{website}'
            base_domain = urlparse(base_url).netloc

            # Load the landing page first; bail out early when it is down.
            try:
                response = await page.goto(base_url, wait_until='domcontentloaded', timeout=PAGE_TIMEOUT)
                if not response or response.status >= 400:
                    logger.warning(f" ⚠️ Главная страница недоступна: {response.status if response else 'No response'}")
                    return False
            except Exception as e:
                logger.error(f" ❌ Ошибка загрузки главной: {e}")
                return False

            pages_data = []

            # Main page.
            main_html = await page.content()
            main_text = TextCleaner.clean_html(main_html)
            pages_data.append({
                'url': base_url,
                'html': main_html,
                'text': main_text,
                'status': response.status
            })
            logger.info(f" ✅ Главная: {len(main_text)} символов")

            internal_links = await _collect_internal_links(page, base_url, base_domain, pages_data)
            logger.info(f" 📄 Найдено {len(internal_links)} внутренних ссылок")

            # Visit internal pages until the per-site limit is reached.
            for i, link_url in enumerate(internal_links, 1):
                if len(pages_data) >= MAX_PAGES_PER_SITE:
                    break

                page2 = None  # predefine so the finally clause can't hit NameError
                try:
                    page2 = await context.new_page()
                    response2 = await page2.goto(link_url, wait_until='domcontentloaded', timeout=PAGE_TIMEOUT)

                    if response2 and response2.status < 400:
                        html2 = await page2.content()
                        text2 = TextCleaner.clean_html(html2)
                        pages_data.append({
                            'url': link_url,
                            'html': html2,
                            'text': text2,
                            'status': response2.status
                        })
                        logger.info(f" ✅ Страница {i}: {len(text2)} символов")
                except Exception as e:
                    logger.warning(f" ⚠️ Ошибка страницы {link_url}: {e}")
                finally:
                    if page2 is not None:
                        try:
                            await page2.close()
                        except Exception:
                            pass  # best effort: a dead page must not abort the crawl

            # Persist atomically: old rows are deleted in the same transaction.
            _save_pages(hotel_id, pages_data)
            logger.info(f" 🗑️ Удалены старые данные (10 страниц)")
            logger.info(f" 💾 Сохранено {len(pages_data)} страниц для {hotel_name}")
            return True

        except Exception as e:
            logger.error(f" ❌ Ошибка при краулинге {hotel_name}: {e}")
            return False
        finally:
            # Single close path replaces the per-branch browser.close() calls.
            await browser.close()

async def main():
    """Entry point: re-crawl every hotel that was stuck at the old 10-page limit."""
    logger.info("🚀 Начинаю пересканирование отелей с 10 страницами")

    hotels = get_hotels_to_rescan()
    total = len(hotels)
    logger.info(f"📊 Найдено {total} отелей для пересканирования")

    if not hotels:
        logger.info("✅ Нет отелей для пересканирования")
        return

    results = []
    async with async_playwright() as playwright:
        # Bound concurrency so we don't interfere with the main crawler.
        semaphore = asyncio.Semaphore(MAX_CONCURRENT)
        pending = [crawl_hotel(h, semaphore, playwright) for h in hotels]

        # Report progress as each crawl finishes, regardless of order.
        done = 0
        for finished in asyncio.as_completed(pending):
            results.append(await finished)
            done += 1
            logger.info(f"📈 Прогресс: {done}/{total} ({done/total*100:.1f}%)")

        success_count = sum(1 for ok in results if ok)
        logger.info(f"\n✅ Завершено! Успешно: {success_count}/{total}")

if __name__ == "__main__":
    # Script entry point: run the async pipeline to completion.
    asyncio.run(main())