Files
hotels/process_spb_simple.py
Фёдор 0cf3297290 Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py
- Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py
- РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py
- Отчёты: create_orel_horizontal_report.py
- Обработка: process_all_hotels_embeddings.py
- Документация: README.md, DB_SCHEMA_REFERENCE.md
2025-10-16 10:52:09 +03:00

141 lines
4.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Простая версия: преобразование данных СПб порциями
"""
from urllib.parse import unquote
import psycopg2
from bs4 import BeautifulSoup
import re
import logging
from datetime import datetime
# Database connection settings.
# NOTE(review): credentials are hard-coded in source — move them to
# environment variables or a secrets store before committing/deploying.
DB_CONFIG = {
'host': "147.45.189.234",
'port': 5432,
'database': "default_db",
'user': "gen_user",
# The password is stored URL-encoded; unquote() decodes it for psycopg2.
'password': unquote("2~~9_%5EkVsU%3F2%5CS")
}
# Logging: INFO level, written both to a timestamped log file and to stdout.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(f'spb_simple_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
def clean_html(html: str) -> str:
    """Strip markup from *html* and return whitespace-normalized plain text.

    Non-content tags (scripts, styles, metadata) are removed entirely so
    their contents never leak into the extracted text; all remaining runs
    of whitespace are collapsed to single spaces.
    """
    parsed = BeautifulSoup(html, 'html.parser')
    # Drop tags whose contents are never human-readable page text.
    for node in parsed(['script', 'style', 'meta', 'link', 'noscript']):
        node.decompose()
    raw_text = parsed.get_text(separator=' ', strip=True)
    return re.sub(r'\s+', ' ', raw_text).strip()
def process_spb_in_batches():
    """Convert raw St. Petersburg hotel pages into cleaned text, 100 at a time.

    Selects rows from hotel_website_raw that have no matching row in
    hotel_website_processed (anti-join on hotel_id + url), cleans the HTML
    and upserts the result. Commits once per batch.

    Fixes over the previous version:
    - each INSERT runs inside a SAVEPOINT, so one bad row no longer aborts
      the whole transaction (psycopg2 marks it failed after any SQL error,
      making every later statement in the batch raise);
    - keyset pagination (r.id > last_id) guarantees forward progress: rows
      that persistently fail never make the loop re-fetch the same batch
      forever;
    - cursor and connection are closed in a finally block.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()
    try:
        # Total amount of work, used only for progress reporting.
        cur.execute('''
            SELECT COUNT(*)
            FROM hotel_website_raw r
            JOIN hotel_main h ON h.id = r.hotel_id
            LEFT JOIN hotel_website_processed p ON p.hotel_id = r.hotel_id AND p.url = r.url
            WHERE h.region_name = 'г. Санкт-Петербург'
            AND p.hotel_id IS NULL
        ''')
        total_count = cur.fetchone()[0]
        logger.info(f"📊 Всего страниц для обработки: {total_count}")
        if total_count == 0:
            logger.info("✅ Нет данных для обработки")
            return
        processed = 0
        failed = 0
        batch_size = 100
        last_id = 0  # keyset cursor: highest raw id seen so far
        while True:
            # Fetch the next slice strictly after the last row we have seen.
            # Without "r.id > %s" a row that keeps failing stays in the
            # anti-join result and the same batch is returned forever.
            cur.execute('''
                SELECT
                    r.id as raw_id,
                    r.hotel_id,
                    r.url,
                    r.html
                FROM hotel_website_raw r
                JOIN hotel_main h ON h.id = r.hotel_id
                LEFT JOIN hotel_website_processed p ON p.hotel_id = r.hotel_id AND p.url = r.url
                WHERE h.region_name = 'г. Санкт-Петербург'
                AND p.hotel_id IS NULL
                AND r.id > %s
                ORDER BY r.id
                LIMIT %s
            ''', (last_id, batch_size))
            batch = cur.fetchall()
            if not batch:
                break
            last_id = batch[-1][0]  # batch is ordered by r.id, so this is the max
            logger.info(f"📦 Обрабатываю пачку: {len(batch)} страниц")
            for raw_id, hotel_id, url, html in batch:
                # HTML cleaning can fail independently of the database;
                # handle it separately so no savepoint bookkeeping is needed.
                try:
                    cleaned_text = clean_html(html)
                except Exception as e:
                    logger.error(f"❌ Ошибка: {e}")
                    failed += 1
                    continue
                try:
                    # A savepoint isolates this row: a failed INSERT would
                    # otherwise poison the transaction and make every later
                    # statement in this batch fail as well.
                    cur.execute("SAVEPOINT row_sp")
                    cur.execute("""
                        INSERT INTO hotel_website_processed
                        (raw_page_id, hotel_id, url, cleaned_text, text_length, processed_at)
                        VALUES (%s, %s, %s, %s, %s, NOW())
                        ON CONFLICT (hotel_id, url) DO UPDATE SET
                        cleaned_text = EXCLUDED.cleaned_text,
                        text_length = EXCLUDED.text_length,
                        processed_at = EXCLUDED.processed_at
                    """, (raw_id, hotel_id, url, cleaned_text, len(cleaned_text)))
                    cur.execute("RELEASE SAVEPOINT row_sp")
                    processed += 1
                except psycopg2.Error as e:
                    # Undo only this row; the rest of the batch survives.
                    cur.execute("ROLLBACK TO SAVEPOINT row_sp")
                    logger.error(f"❌ Ошибка: {e}")
                    failed += 1
            # Persist the whole batch at once.
            conn.commit()
            logger.info(f"✅ Обработано: {processed}/{total_count} ({processed/total_count*100:.1f}%)")
        if failed:
            logger.warning(f"⚠️ Не удалось обработать страниц: {failed}")
        # Final statistics: how many SPb hotels now have processed pages.
        cur.execute('''
            SELECT COUNT(DISTINCT p.hotel_id)
            FROM hotel_website_processed p
            JOIN hotel_main h ON h.id = p.hotel_id
            WHERE h.region_name = 'г. Санкт-Петербург'
        ''')
        processed_hotels = cur.fetchone()[0]
        logger.info(f"🎉 ЗАВЕРШЕНО! Отелей СПб в processed: {processed_hotels}")
    finally:
        # Always release the connection, even if a query raises mid-run.
        cur.close()
        conn.close()
# Script entry point: run the batch conversion when executed directly.
if __name__ == "__main__":
    process_spb_in_batches()