#!/usr/bin/env python3
"""Simple version: convert raw St. Petersburg hotel pages to clean text in batches.

Reads rows from ``hotel_website_raw`` that have no matching row in
``hotel_website_processed`` (matched on ``hotel_id`` + ``url``), strips the
HTML down to plain text and upserts the result into
``hotel_website_processed``.
"""

import logging
import os
import re
from contextlib import closing
from datetime import datetime
from urllib.parse import unquote

import psycopg2
from bs4 import BeautifulSoup

# Database configuration.
# NOTE(security): the fallback values below are real credentials committed to
# source control. They are kept only so existing deployments keep working;
# prefer supplying SPB_DB_* environment variables and rotating the password.
DB_CONFIG = {
    'host': os.environ.get('SPB_DB_HOST', "147.45.189.234"),
    'port': int(os.environ.get('SPB_DB_PORT', 5432)),
    'database': os.environ.get('SPB_DB_NAME', "default_db"),
    'user': os.environ.get('SPB_DB_USER', "gen_user"),
    'password': os.environ.get('SPB_DB_PASSWORD') or unquote("2~~9_%5EkVsU%3F2%5CS"),
}

# Value of hotel_main.region_name that selects the hotels to process.
REGION_NAME = 'г. Санкт-Петербург'

# Logging: one timestamped log file per run, mirrored to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'spb_simple_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Compiled once: used for every page.
_WHITESPACE_RE = re.compile(r'\s+')

_COUNT_PENDING_SQL = '''
    SELECT COUNT(*)
    FROM hotel_website_raw r
    JOIN hotel_main h ON h.id = r.hotel_id
    LEFT JOIN hotel_website_processed p ON p.hotel_id = r.hotel_id AND p.url = r.url
    WHERE h.region_name = %s
    AND p.hotel_id IS NULL
'''

# Keyset pagination on r.id: every raw row is attempted at most once per run,
# so a row that keeps failing cannot be re-selected forever.
_FETCH_BATCH_SQL = '''
    SELECT r.id as raw_id, r.hotel_id, r.url, r.html
    FROM hotel_website_raw r
    JOIN hotel_main h ON h.id = r.hotel_id
    LEFT JOIN hotel_website_processed p ON p.hotel_id = r.hotel_id AND p.url = r.url
    WHERE h.region_name = %(region)s
    AND p.hotel_id IS NULL
    AND (%(last_id)s IS NULL OR r.id > %(last_id)s)
    ORDER BY r.id
    LIMIT %(limit)s
'''

_UPSERT_PROCESSED_SQL = """
    INSERT INTO hotel_website_processed
    (raw_page_id, hotel_id, url, cleaned_text, text_length, processed_at)
    VALUES (%s, %s, %s, %s, %s, NOW())
    ON CONFLICT (hotel_id, url) DO UPDATE SET
        cleaned_text = EXCLUDED.cleaned_text,
        text_length = EXCLUDED.text_length,
        processed_at = EXCLUDED.processed_at
"""

_COUNT_PROCESSED_HOTELS_SQL = '''
    SELECT COUNT(DISTINCT p.hotel_id)
    FROM hotel_website_processed p
    JOIN hotel_main h ON h.id = p.hotel_id
    WHERE h.region_name = %s
'''


def clean_html(html: str) -> str:
    """Return the visible text of *html* with whitespace collapsed.

    ``script``, ``style``, ``meta``, ``link`` and ``noscript`` elements are
    removed before the text is extracted; every run of whitespace in the
    result is replaced by a single space.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(['script', 'style', 'meta', 'link', 'noscript']):
        tag.decompose()

    text = soup.get_text(separator=' ', strip=True)
    return _WHITESPACE_RE.sub(' ', text).strip()


def _process_row(cur, raw_id, hotel_id, url, html) -> bool:
    """Clean one raw page and upsert it; return True on success.

    The work runs inside a savepoint: after a failed statement PostgreSQL
    rejects every further command in the transaction, so without the
    savepoint one bad row would discard the rest of its batch.
    """
    cur.execute("SAVEPOINT spb_row")
    try:
        cleaned_text = clean_html(html)
        cur.execute(
            _UPSERT_PROCESSED_SQL,
            (raw_id, hotel_id, url, cleaned_text, len(cleaned_text)),
        )
    except Exception as e:
        # Best-effort per row: undo only this row's work and keep going.
        cur.execute("ROLLBACK TO SAVEPOINT spb_row")
        logger.error(f"❌ Ошибка (raw_id={raw_id}, url={url}): {e}")
        return False
    cur.execute("RELEASE SAVEPOINT spb_row")
    return True


def process_spb_in_batches(batch_size: int = 100, region_name: str = REGION_NAME) -> None:
    """Process all pending pages of *region_name* in batches of *batch_size*.

    Each batch is committed separately. Rows that fail to clean or insert are
    logged and skipped for the remainder of the run; they stay unprocessed in
    the database and are picked up again by the next run.

    Args:
        batch_size: Number of raw pages fetched and committed per batch.
        region_name: ``hotel_main.region_name`` value selecting the hotels.

    Raises:
        ValueError: If *batch_size* is less than 1.
        psycopg2.Error: On connection or query failures outside the
            per-row best-effort handling.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # closing() guarantees the cursor and connection are released on every
    # exit path, including the early return and exceptions.
    with closing(psycopg2.connect(**DB_CONFIG)) as conn, closing(conn.cursor()) as cur:
        # First get the total amount of work.
        cur.execute(_COUNT_PENDING_SQL, (region_name,))
        total_count = cur.fetchone()[0]
        logger.info(f"📊 Всего страниц для обработки: {total_count}")

        if total_count == 0:
            logger.info("✅ Нет данных для обработки")
            return

        processed = 0
        failed = 0
        last_raw_id = None  # keyset cursor: highest raw id already attempted

        while True:
            # Fetch the next batch after the last attempted id.
            cur.execute(
                _FETCH_BATCH_SQL,
                {'region': region_name, 'last_id': last_raw_id, 'limit': batch_size},
            )
            batch = cur.fetchall()
            if not batch:
                break

            logger.info(f"📦 Обрабатываю пачку: {len(batch)} страниц")

            for raw_id, hotel_id, url, html in batch:
                last_raw_id = raw_id
                if _process_row(cur, raw_id, hotel_id, url, html):
                    processed += 1
                else:
                    failed += 1

            # Commit the batch.
            conn.commit()
            logger.info(f"✅ Обработано: {processed}/{total_count} ({processed/total_count*100:.1f}%)")

        if failed:
            logger.warning(f"⚠️ Пропущено из-за ошибок: {failed}")

        # Final statistics.
        cur.execute(_COUNT_PROCESSED_HOTELS_SQL, (region_name,))
        processed_hotels = cur.fetchone()[0]
        logger.info(f"🎉 ЗАВЕРШЕНО! Отелей СПб в processed: {processed_hotels}")


if __name__ == "__main__":
    process_spb_in_batches()