#!/usr/bin/env python3
"""
Re-crawl hotels for which exactly 10 pages were collected (the old per-site limit).
This time, collect up to 20 pages from each of them.
"""

import asyncio
import logging
import re
from datetime import datetime
from typing import Dict
from urllib.parse import unquote, urlparse

import psycopg2
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# Database configuration
DB_CONFIG = {
    'host': "147.45.189.234",
    'port': 5432,
    'database': "default_db",
    'user': "gen_user",
    'password': unquote("2~~9_%5EkVsU%3F2%5CS")  # value is stored URL-encoded
}

# Crawl settings
MAX_PAGES_PER_SITE = 20
PAGE_TIMEOUT = 20000  # milliseconds
MAX_CONCURRENT = 3    # kept low so we do not interfere with the main crawler

# Logging
log_filename = f'rescan_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class TextCleaner:
    """HTML cleanup."""

    @classmethod
    def clean_html(cls, html: str) -> str:
        """Strip markup and boilerplate from HTML, returning plain text."""
        soup = BeautifulSoup(html, 'html.parser')

        # Drop scripts, styles and other non-content tags
        for tag in soup(['script', 'style', 'meta', 'link', 'noscript']):
            tag.decompose()

        text = soup.get_text(separator=' ', strip=True)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()


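# Illustrative only: given a hypothetical fragment such as
#   '<p>Spa &amp; pool</p><script>track()</script>'
# TextCleaner.clean_html() should return roughly 'Spa & pool' -- the <script>
# block is dropped and runs of whitespace are collapsed to single spaces.

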
def get_hotels_to_rescan():
    """Return the list of hotels that need to be re-crawled."""
    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()

    # Hotels that currently have exactly 10 processed pages (the old limit)
    cur.execute('''
        SELECT DISTINCT p.hotel_id, m.full_name, m.website_address
        FROM hotel_website_processed p
        JOIN hotel_main m ON p.hotel_id = m.id
        WHERE p.hotel_id IN (
            SELECT hotel_id
            FROM hotel_website_processed
            GROUP BY hotel_id
            HAVING COUNT(*) = 10
        )
        ORDER BY p.hotel_id
    ''')

    hotels = cur.fetchall()
    cur.close()
    conn.close()

    return [{'hotel_id': h[0], 'name': h[1], 'website': h[2]} for h in hotels]


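# The result is a list of dicts, for example (values are purely illustrative):
#   [{'hotel_id': 123, 'name': 'Hotel Example', 'website': 'example-hotel.ru'}, ...]

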
async def crawl_hotel(hotel: Dict, semaphore: asyncio.Semaphore, playwright):
    """Crawl a single hotel website."""
    async with semaphore:
        hotel_id = hotel['hotel_id']
        hotel_name = hotel['name']
        website = hotel['website']

        if not website:
            logger.warning(f"  ⚠️ No website for {hotel_name}")
            return False

        logger.info(f"🏨 Re-crawling: {hotel_name}")
        logger.info(f"   URL: {website}")

        # Drop the previously collected data first
        conn = psycopg2.connect(**DB_CONFIG)
        cur = conn.cursor()

        cur.execute("DELETE FROM hotel_website_processed WHERE hotel_id = %s", (hotel_id,))
        cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
        conn.commit()

        logger.info("  🗑️ Old data removed (10 pages)")

        cur.close()
        conn.close()

        # Now run the full crawl
        browser = await playwright.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        )

        try:
            page = await context.new_page()

            # Normalise the URL
            if not website.startswith(('http://', 'https://')):
                base_url = f'https://{website}'
            else:
                base_url = website

            parsed_base = urlparse(base_url)
            base_domain = parsed_base.netloc

            # Load the home page
            try:
                response = await page.goto(base_url, wait_until='domcontentloaded', timeout=PAGE_TIMEOUT)
                if not response or response.status >= 400:
                    logger.warning(f"  ⚠️ Home page unavailable: {response.status if response else 'no response'}")
                    await browser.close()
                    return False
            except Exception as e:
                logger.error(f"  ❌ Failed to load the home page: {e}")
                await browser.close()
                return False

            # Collect page data
            pages_data = []

            # Home page
            main_html = await page.content()
            main_text = TextCleaner.clean_html(main_html)

            pages_data.append({
                'url': base_url,
                'html': main_html,
                'text': main_text,
                'status': response.status
            })

            logger.info(f"  ✅ Home page: {len(main_text)} characters")

            # Collect links
            links = await page.evaluate('''() => {
                return Array.from(document.querySelectorAll('a[href]'))
                    .map(a => a.href)
                    .filter(href => href && !href.startsWith('mailto:') && !href.startsWith('tel:'))
            }''')

            # Keep only internal links (same domain), stripping query strings and fragments
            internal_links = []
            for link in links:
                parsed = urlparse(link)
                if parsed.netloc == base_domain or not parsed.netloc:
                    clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}".rstrip('/')
                    if clean_url != base_url.rstrip('/') and clean_url not in [p['url'] for p in pages_data]:
                        internal_links.append(clean_url)

            # Deduplicate, leaving room for the home page within the limit
            internal_links = list(dict.fromkeys(internal_links))[:MAX_PAGES_PER_SITE - 1]

            logger.info(f"  📄 Found {len(internal_links)} internal links")

            # Crawl the internal pages
            for i, link_url in enumerate(internal_links, 1):
                if len(pages_data) >= MAX_PAGES_PER_SITE:
                    break

                page2 = None
                try:
                    page2 = await context.new_page()
                    response2 = await page2.goto(link_url, wait_until='domcontentloaded', timeout=PAGE_TIMEOUT)

                    if response2 and response2.status < 400:
                        html2 = await page2.content()
                        text2 = TextCleaner.clean_html(html2)

                        pages_data.append({
                            'url': link_url,
                            'html': html2,
                            'text': text2,
                            'status': response2.status
                        })

                        logger.info(f"  ✅ Page {i}: {len(text2)} characters")

                    await page2.close()

                except Exception as e:
                    logger.warning(f"  ⚠️ Error on page {link_url}: {e}")
                    if page2 is not None:
                        try:
                            await page2.close()
                        except Exception:
                            pass

            # Save to the database (see the constraint note after this function)
            conn = psycopg2.connect(**DB_CONFIG)
            cur = conn.cursor()

            for page_data in pages_data:
                # Save the raw HTML
                cur.execute("""
                    INSERT INTO hotel_website_raw (hotel_id, url, html, status_code, crawled_at)
                    VALUES (%s, %s, %s, %s, NOW())
                    ON CONFLICT (hotel_id, url) DO UPDATE
                    SET html = EXCLUDED.html, status_code = EXCLUDED.status_code, crawled_at = NOW()
                    RETURNING id
                """, (hotel_id, page_data['url'], page_data['html'], page_data['status']))

                raw_id = cur.fetchone()[0]

                # Save the processed (cleaned) text
                cur.execute("""
                    INSERT INTO hotel_website_processed
                        (raw_page_id, hotel_id, url, cleaned_text, text_length, processed_at)
                    VALUES (%s, %s, %s, %s, %s, NOW())
                    ON CONFLICT (hotel_id, url) DO UPDATE
                    SET cleaned_text = EXCLUDED.cleaned_text, text_length = EXCLUDED.text_length, processed_at = NOW()
                """, (raw_id, hotel_id, page_data['url'], page_data['text'], len(page_data['text'])))

            conn.commit()
            cur.close()
            conn.close()

            logger.info(f"  💾 Saved {len(pages_data)} pages for {hotel_name}")

            await browser.close()
            return True

        except Exception as e:
            logger.error(f"  ❌ Error while crawling {hotel_name}: {e}")
            await browser.close()
            return False


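# Note: the ON CONFLICT (hotel_id, url) upserts in crawl_hotel() assume that both
# hotel_website_raw and hotel_website_processed carry a unique constraint (or unique
# index) on (hotel_id, url); without one, those INSERTs would fail. If it is missing,
# a constraint along these lines would be needed (constraint names are illustrative):
#   ALTER TABLE hotel_website_raw
#       ADD CONSTRAINT uq_hotel_website_raw_hotel_url UNIQUE (hotel_id, url);
#   ALTER TABLE hotel_website_processed
#       ADD CONSTRAINT uq_hotel_website_processed_hotel_url UNIQUE (hotel_id, url);

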
async def main():
    """Entry point."""
    logger.info("🚀 Starting the re-crawl of hotels that had 10 pages")

    hotels = get_hotels_to_rescan()
    logger.info(f"📊 Found {len(hotels)} hotels to re-crawl")

    if not hotels:
        logger.info("✅ Nothing to re-crawl")
        return

    async with async_playwright() as playwright:
        semaphore = asyncio.Semaphore(MAX_CONCURRENT)
        tasks = [crawl_hotel(hotel, semaphore, playwright) for hotel in hotels]

        results = []
        # as_completed yields tasks in completion order, so the counter tracks overall progress
        for i, task in enumerate(asyncio.as_completed(tasks), 1):
            result = await task
            results.append(result)
            logger.info(f"📈 Progress: {i}/{len(hotels)} ({i / len(hotels) * 100:.1f}%)")

        success_count = sum(1 for r in results if r)
        logger.info(f"\n✅ Done! Successful: {success_count}/{len(hotels)}")


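# A minimal way to run this script (assuming Playwright and its Chromium build are
# installed, e.g. `pip install playwright && playwright install chromium`):
#   python rescan_hotels.py   # filename illustrative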
if __name__ == "__main__":
    asyncio.run(main())