Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py
- Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py
- РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py
- Отчёты: create_orel_horizontal_report.py
- Обработка: process_all_hotels_embeddings.py
- Документация: README.md, DB_SCHEMA_REFERENCE.md
This commit is contained in:
140
process_spb_simple.py
Normal file
140
process_spb_simple.py
Normal file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Простая версия: преобразование данных СПб порциями
|
||||
"""
|
||||
|
||||
from urllib.parse import unquote
|
||||
import psycopg2
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
# Database configuration
def load_db_config(env=None):
    """Build the keyword arguments passed to ``psycopg2.connect``.

    Every setting can be overridden through an environment variable;
    when a variable is absent the value previously hard-coded in this
    script is used, so existing deployments keep working unchanged.

    Recognised variables: ``HOTELS_DB_HOST``, ``HOTELS_DB_PORT``,
    ``HOTELS_DB_NAME``, ``HOTELS_DB_USER``, ``HOTELS_DB_PASSWORD``.

    Args:
        env: Mapping to read overrides from. Defaults to ``os.environ``.

    Returns:
        dict with the keys ``host``, ``port``, ``database``, ``user`` and
        ``password``; ``port`` is always an ``int``.

    Raises:
        ValueError: if ``HOTELS_DB_PORT`` is set but is not an integer.
    """
    if env is None:
        import os  # local import: keeps this block self-contained
        env = os.environ
    return {
        'host': env.get('HOTELS_DB_HOST', "147.45.189.234"),
        'port': int(env.get('HOTELS_DB_PORT', 5432)),
        'database': env.get('HOTELS_DB_NAME', "default_db"),
        'user': env.get('HOTELS_DB_USER', "gen_user"),
        # SECURITY: a real credential is committed here as the fallback.
        # Prefer HOTELS_DB_PASSWORD (used verbatim, not URL-decoded) and
        # rotate/remove this default once deployments provide it.
        'password': env.get('HOTELS_DB_PASSWORD', unquote("2~~9_%5EkVsU%3F2%5CS")),
    }


DB_CONFIG = load_db_config()
|
||||
|
||||
# Logging: every run writes to its own timestamped file in the current
# working directory and mirrors the same records to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: the messages in this script contain Cyrillic text
        # and emoji, which the platform default encoding (e.g. cp1251 on
        # Windows) cannot always represent.
        logging.FileHandler(
            f'spb_simple_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            encoding='utf-8',
        ),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def clean_html(html: str) -> str:
    """Extract the text of an HTML document as a single normalised line.

    ``script``, ``style``, ``meta``, ``link`` and ``noscript`` elements are
    removed before extraction, and every run of whitespace in the result is
    collapsed to one space.

    Args:
        html: Raw HTML markup. ``None`` or an empty string is accepted and
            treated as a page without any text.

    Returns:
        The extracted text with no leading or trailing whitespace; ``''``
        when ``html`` is empty or ``None``.
    """
    # Nothing to parse: return early instead of handing None to the parser.
    if not html:
        return ''

    soup = BeautifulSoup(html, 'html.parser')
    # Calling the soup object with a list of tag names finds all such tags.
    for tag in soup(['script', 'style', 'meta', 'link', 'noscript']):
        tag.decompose()
    text = soup.get_text(separator=' ', strip=True)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
|
||||
|
||||
|
||||
def process_spb_in_batches(batch_size: int = 100) -> None:
    """Clean raw HTML pages of St. Petersburg hotels, batch by batch.

    Selects rows of ``hotel_website_raw`` whose hotel has
    ``region_name = 'г. Санкт-Петербург'`` and that have no matching
    ``(hotel_id, url)`` row in ``hotel_website_processed``, runs
    ``clean_html`` on each page and upserts the result. The transaction is
    committed after every batch, and a summary is logged at the end.

    Batches are paged by ``r.id`` (keyset pagination), so a row that fails
    is skipped for the rest of the run instead of being fetched again on
    every iteration. Each row is written inside its own SAVEPOINT, so one
    failing statement does not abort the transaction and discard the rows
    of the same batch that were already written.

    Args:
        batch_size: Maximum number of raw pages fetched per batch.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
        psycopg2.Error: on connection-level failures (per-row errors are
            logged and the row is skipped).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    region = 'г. Санкт-Петербург'
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            # Count the pending pages first (used for the progress output).
            cur.execute('''
                SELECT COUNT(*)
                FROM hotel_website_raw r
                JOIN hotel_main h ON h.id = r.hotel_id
                LEFT JOIN hotel_website_processed p ON p.hotel_id = r.hotel_id AND p.url = r.url
                WHERE h.region_name = %s
                AND p.hotel_id IS NULL
            ''', (region,))
            total_count = cur.fetchone()[0]
            logger.info(f"📊 Всего страниц для обработки: {total_count}")

            if total_count == 0:
                logger.info("✅ Нет данных для обработки")
                return

            processed = 0
            failed = 0
            last_raw_id = None  # keyset cursor: highest r.id handled so far

            while processed + failed < total_count:
                # Fetch the next batch strictly after the last handled id.
                cur.execute('''
                    SELECT
                        r.id as raw_id,
                        r.hotel_id,
                        r.url,
                        r.html
                    FROM hotel_website_raw r
                    JOIN hotel_main h ON h.id = r.hotel_id
                    LEFT JOIN hotel_website_processed p ON p.hotel_id = r.hotel_id AND p.url = r.url
                    WHERE h.region_name = %s
                    AND p.hotel_id IS NULL
                    AND (%s IS NULL OR r.id > %s)
                    ORDER BY r.id
                    LIMIT %s
                ''', (region, last_raw_id, last_raw_id, batch_size))
                batch = cur.fetchall()

                if not batch:
                    break

                logger.info(f"📦 Обрабатываю пачку: {len(batch)} страниц")

                for raw_id, hotel_id, url, html in batch:
                    last_raw_id = raw_id
                    # Isolate the row so a failure can be undone on its own.
                    cur.execute("SAVEPOINT spb_row")
                    try:
                        cleaned_text = clean_html(html)
                        text_length = len(cleaned_text)

                        # Upsert into the processed table.
                        cur.execute("""
                            INSERT INTO hotel_website_processed
                            (raw_page_id, hotel_id, url, cleaned_text, text_length, processed_at)
                            VALUES (%s, %s, %s, %s, %s, NOW())
                            ON CONFLICT (hotel_id, url) DO UPDATE SET
                                cleaned_text = EXCLUDED.cleaned_text,
                                text_length = EXCLUDED.text_length,
                                processed_at = EXCLUDED.processed_at
                        """, (raw_id, hotel_id, url, cleaned_text, text_length))
                    except Exception as e:
                        # Best effort: log, undo only this row and move on.
                        # If the connection itself is broken the rollback
                        # raises and ends the run instead of spinning.
                        logger.error(f"❌ Ошибка (raw_id={raw_id}, url={url}): {e}")
                        cur.execute("ROLLBACK TO SAVEPOINT spb_row")
                        failed += 1
                    else:
                        cur.execute("RELEASE SAVEPOINT spb_row")
                        processed += 1

                # Commit the batch.
                conn.commit()
                logger.info(f"✅ Обработано: {processed}/{total_count} ({processed/total_count*100:.1f}%)")

            if failed:
                logger.warning(f"⚠️ Пропущено из-за ошибок: {failed}")

            # Final statistics.
            cur.execute('''
                SELECT COUNT(DISTINCT p.hotel_id)
                FROM hotel_website_processed p
                JOIN hotel_main h ON h.id = p.hotel_id
                WHERE h.region_name = %s
            ''', (region,))
            processed_hotels = cur.fetchone()[0]
            logger.info(f"🎉 ЗАВЕРШЕНО! Отелей СПб в processed: {processed_hotels}")
    finally:
        # Always release the connection, including on the early return
        # and on unexpected errors.
        conn.close()
|
||||
|
||||
|
||||
# Script entry point: run the batch job only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    process_spb_in_batches()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user