186 lines
5.9 KiB
Python
186 lines
5.9 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Transform St. Petersburg data from hotel_website_raw into hotel_website_processed.

Processes the 807 St. Petersburg hotels that exist in raw but are missing from processed.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import logging
import os
import re
from datetime import datetime
from typing import List, Dict
from urllib.parse import unquote

import psycopg2
from bs4 import BeautifulSoup
|
|||
|
|
|
|||
|
|
# Database configuration.
# Every value can be overridden through the conventional libpq environment
# variables (PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD); the literals
# below are only fallbacks that keep the script working exactly as before
# when none of those variables is set.
# NOTE(review): a live password is still embedded here as a fallback. It
# should be rotated and removed from source control.
DB_CONFIG = {
    'host': os.environ.get('PGHOST', "147.45.189.234"),
    # `or` also covers an empty PGPORT, which int() would reject.
    'port': int(os.environ.get('PGPORT') or 5432),
    'database': os.environ.get('PGDATABASE', "default_db"),
    'user': os.environ.get('PGUSER', "gen_user"),
    # The fallback is stored percent-encoded, hence the unquote().
    'password': os.environ.get('PGPASSWORD', unquote("2~~9_%5EkVsU%3F2%5CS")),
}
|
|||
|
|
|
|||
|
|
# Logging: every run writes to its own timestamped file and to the console.
log_filename = f'process_spb_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'

# The log messages contain Cyrillic text and emoji, so the file encoding is
# pinned to UTF-8 instead of relying on the platform default, which may not
# be able to encode them.
file_handler = logging.FileHandler(log_filename, encoding='utf-8')

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        file_handler,
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TextCleaner:
    """Helpers for turning an HTML page into plain text."""

    @classmethod
    def clean_html(cls, html: str) -> str:
        """Return the text content of ``html`` with whitespace collapsed.

        The markup is parsed with BeautifulSoup's ``html.parser``; ``script``,
        ``style``, ``meta``, ``link`` and ``noscript`` elements are removed
        before the text is extracted. Every run of whitespace in the result
        is replaced by a single space and the ends are trimmed.

        Args:
            html: HTML markup to clean.

        Returns:
            The cleaned text.
        """
        document = BeautifulSoup(html, 'html.parser')

        # Remove elements whose content is not page text.
        unwanted_nodes = document(['script', 'style', 'meta', 'link', 'noscript'])
        for node in unwanted_nodes:
            node.decompose()

        extracted = document.get_text(separator=' ', strip=True)
        collapsed = re.sub(r'\s+', ' ', extracted)
        return collapsed.strip()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_spb_raw_data():
    """Fetch the raw St. Petersburg pages that have no processed counterpart.

    Selects rows of ``hotel_website_raw`` joined to ``hotel_main`` whose
    ``region_name`` is 'г. Санкт-Петербург' and for which no row with the same
    ``(hotel_id, url)`` exists in ``hotel_website_processed``.

    Returns:
        A list of tuples
        ``(raw_id, hotel_id, url, html, status_code, crawled_at, full_name)``
        ordered by ``hotel_id`` and ``url``.

    Raises:
        psycopg2.Error: if connecting or querying fails. The connection is
            closed before the error propagates.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # The cursor context manager closes the cursor even when the query
        # raises; the connection itself is closed in ``finally``.
        with conn.cursor() as cur:
            cur.execute('''
                SELECT
                    r.id as raw_id,
                    r.hotel_id,
                    r.url,
                    r.html,
                    r.status_code,
                    r.crawled_at,
                    h.full_name
                FROM hotel_website_raw r
                JOIN hotel_main h ON h.id = r.hotel_id
                LEFT JOIN hotel_website_processed p ON p.hotel_id = r.hotel_id AND p.url = r.url
                WHERE h.region_name = 'г. Санкт-Петербург'
                AND p.hotel_id IS NULL
                ORDER BY r.hotel_id, r.url
            ''')
            return cur.fetchall()
    finally:
        conn.close()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def process_batch(raw_data_batch: List) -> int:
    """Clean a batch of raw pages and upsert them into hotel_website_processed.

    Each row is handled on a best-effort basis: a failure is logged and the
    row is skipped. The upsert runs inside its own SAVEPOINT because
    PostgreSQL aborts the whole transaction after a failed statement; without
    it, every later row in the batch would fail too and the final commit
    would persist nothing.

    Args:
        raw_data_batch: rows shaped as
            ``(raw_id, hotel_id, url, html, status_code, crawled_at, hotel_name)``.

    Returns:
        The number of rows successfully written.

    Raises:
        psycopg2.Error: if connecting, rolling back to the savepoint, or the
            final commit fails. The connection is closed in every case.
    """
    upsert_sql = """
        INSERT INTO hotel_website_processed
        (raw_page_id, hotel_id, url, cleaned_text, text_length, processed_at)
        VALUES (%s, %s, %s, %s, %s, NOW())
        ON CONFLICT (hotel_id, url) DO UPDATE SET
        cleaned_text = EXCLUDED.cleaned_text,
        text_length = EXCLUDED.text_length,
        processed_at = EXCLUDED.processed_at
    """

    conn = psycopg2.connect(**DB_CONFIG)
    processed_count = 0
    batch_start_time = datetime.now()

    try:
        with conn.cursor() as cur:
            for raw_id, hotel_id, url, html, _status_code, _crawled_at, _hotel_name in raw_data_batch:
                savepoint_open = False
                try:
                    # Clean the HTML
                    cleaned_text = TextCleaner.clean_html(html)
                    text_length = len(cleaned_text)

                    # Insert into processed, isolated by a savepoint
                    cur.execute("SAVEPOINT process_row")
                    savepoint_open = True
                    cur.execute(upsert_sql, (raw_id, hotel_id, url, cleaned_text, text_length))
                    cur.execute("RELEASE SAVEPOINT process_row")
                except Exception as e:
                    logger.error(f" ❌ Ошибка обработки {hotel_id} {url}: {e}")
                    if savepoint_open:
                        # Undo only this row so the transaction stays usable.
                        cur.execute("ROLLBACK TO SAVEPOINT process_row")
                    continue

                processed_count += 1

                if processed_count % 100 == 0:
                    logger.info(f" ✅ Обработано {processed_count} страниц...")

        conn.commit()
    finally:
        # Closing without a commit discards any uncommitted work.
        conn.close()

    batch_time = (datetime.now() - batch_start_time).total_seconds()
    logger.info(f" ⏱️ Пачка обработана за {batch_time:.1f} сек")

    return processed_count
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Run the raw -> processed conversion for St. Petersburg hotels.

    Loads the pending raw pages, processes them in batches of 50, logs
    progress and totals, and finally logs how many distinct St. Petersburg
    hotels are present in ``hotel_website_processed``.

    Raises:
        psycopg2.Error: if loading the data, committing a batch, or the final
            verification query fails.
    """
    logger.info("🚀 Начинаю обработку данных СПб из raw в processed")

    # Load the data
    raw_data = get_spb_raw_data()
    total_pages = len(raw_data)

    logger.info(f"📊 Найдено {total_pages} страниц для обработки")

    if not raw_data:
        logger.info("✅ Нет данных для обработки")
        return

    # Group by hotel for statistics; the first name seen for a hotel is kept.
    hotels = {}
    for _, hotel_id, _, _, _, _, hotel_name in raw_data:
        hotels.setdefault(hotel_id, hotel_name)

    logger.info(f"🏨 Всего отелей: {len(hotels)}")

    # Process in batches of 50 (less memory)
    batch_size = 50
    total_processed = 0

    for i in range(0, total_pages, batch_size):
        batch = raw_data[i:i + batch_size]
        logger.info(f"📦 Обрабатываю пачку {i//batch_size + 1}: страницы {i+1}-{min(i+batch_size, total_pages)}")

        processed_count = process_batch(batch)
        total_processed += processed_count

        logger.info(f" ✅ Пачка завершена: {processed_count} страниц")

    logger.info("\n🎉 ОБРАБОТКА ЗАВЕРШЕНА!")
    logger.info(f" Всего обработано: {total_processed}/{total_pages} страниц")
    logger.info(f" Отелей: {len(hotels)}")

    # Verify the result
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # Cursor and connection are released even if the query fails.
        with conn.cursor() as cur:
            cur.execute('''
                SELECT COUNT(DISTINCT p.hotel_id)
                FROM hotel_website_processed p
                JOIN hotel_main h ON h.id = p.hotel_id
                WHERE h.region_name = 'г. Санкт-Петербург'
            ''')
            processed_hotels = cur.fetchone()[0]
        logger.info(f" 📊 Итого отелей СПб в processed: {processed_hotels}")
    finally:
        conn.close()


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|