Files
hotels/process_spb_simple.py
Фёдор 684fada337 🚀 Full project sync: Hotels RAG & Audit System
 Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
2025-10-27 22:49:42 +03:00

142 lines
4.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Простая версия: преобразование данных СПб порциями
"""
from urllib.parse import unquote
import psycopg2
from bs4 import BeautifulSoup
import re
import logging
from datetime import datetime
# Конфигурация БД
DB_CONFIG = {
'host': "147.45.189.234",
'port': 5432,
'database': "default_db",
'user': "gen_user",
'password': unquote("2~~9_%5EkVsU%3F2%5CS")
}
# Логирование
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(f'spb_simple_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
def clean_html(html: str) -> str:
"""Очистка HTML"""
soup = BeautifulSoup(html, 'html.parser')
for tag in soup(['script', 'style', 'meta', 'link', 'noscript']):
tag.decompose()
text = soup.get_text(separator=' ', strip=True)
text = re.sub(r'\s+', ' ', text)
return text.strip()
def process_spb_in_batches():
"""Обрабатываем СПб порциями по 100 записей"""
conn = psycopg2.connect(**DB_CONFIG)
cur = conn.cursor()
# Сначала получим общее количество
cur.execute('''
SELECT COUNT(*)
FROM hotel_website_raw r
JOIN hotel_main h ON h.id = r.hotel_id
LEFT JOIN hotel_website_processed p ON p.hotel_id = r.hotel_id AND p.url = r.url
WHERE h.region_name = 'г. Санкт-Петербург'
AND p.hotel_id IS NULL
''')
total_count = cur.fetchone()[0]
logger.info(f"📊 Всего страниц для обработки: {total_count}")
if total_count == 0:
logger.info("✅ Нет данных для обработки")
return
processed = 0
batch_size = 100
while processed < total_count:
# Получаем следующую порцию
cur.execute('''
SELECT
r.id as raw_id,
r.hotel_id,
r.url,
r.html
FROM hotel_website_raw r
JOIN hotel_main h ON h.id = r.hotel_id
LEFT JOIN hotel_website_processed p ON p.hotel_id = r.hotel_id AND p.url = r.url
WHERE h.region_name = 'г. Санкт-Петербург'
AND p.hotel_id IS NULL
ORDER BY r.id
LIMIT %s
''', (batch_size,))
batch = cur.fetchall()
if not batch:
break
logger.info(f"📦 Обрабатываю пачку: {len(batch)} страниц")
# Обрабатываем пачку
for raw_id, hotel_id, url, html in batch:
try:
cleaned_text = clean_html(html)
text_length = len(cleaned_text)
# Вставляем в processed
cur.execute("""
INSERT INTO hotel_website_processed
(raw_page_id, hotel_id, url, cleaned_text, text_length, processed_at)
VALUES (%s, %s, %s, %s, %s, NOW())
ON CONFLICT (hotel_id, url) DO UPDATE SET
cleaned_text = EXCLUDED.cleaned_text,
text_length = EXCLUDED.text_length,
processed_at = EXCLUDED.processed_at
""", (raw_id, hotel_id, url, cleaned_text, text_length))
processed += 1
except Exception as e:
logger.error(f"❌ Ошибка: {e}")
continue
# Коммитим пачку
conn.commit()
logger.info(f"✅ Обработано: {processed}/{total_count} ({processed/total_count*100:.1f}%)")
# Финальная статистика
cur.execute('''
SELECT COUNT(DISTINCT p.hotel_id)
FROM hotel_website_processed p
JOIN hotel_main h ON h.id = p.hotel_id
WHERE h.region_name = 'г. Санкт-Петербург'
''')
processed_hotels = cur.fetchone()[0]
logger.info(f"🎉 ЗАВЕРШЕНО! Отелей СПб в processed: {processed_hotels}")
cur.close()
conn.close()
if __name__ == "__main__":
process_spb_in_batches()