2025-10-16 10:52:09 +03:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
"""
|
|
|
|
|
|
Robust version of the St. Petersburg (SPb) hotel page processing, with retry logic.
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import logging
import os
import re
import time
from datetime import datetime
from urllib.parse import unquote

import psycopg2
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
# Database configuration.
# Every connection setting can be overridden through an SPB_DB_* environment
# variable; the literals below are only fallbacks so that existing runs keep
# working unchanged.
# SECURITY NOTE(review): the fallback host and password are committed to the
# source tree. Rotate the credential and drop the fallbacks once the
# environment variables are provisioned.
DB_CONFIG = {
    'host': os.environ.get("SPB_DB_HOST", "147.45.189.234"),
    'port': int(os.environ.get("SPB_DB_PORT", "5432")),
    'database': os.environ.get("SPB_DB_NAME", "default_db"),
    'user': os.environ.get("SPB_DB_USER", "gen_user"),
    # The fallback is stored percent-encoded, hence unquote(); a password
    # supplied through the environment is used verbatim.
    'password': os.environ.get("SPB_DB_PASSWORD") or unquote("2~~9_%5EkVsU%3F2%5CS"),
    # Remaining keys are passed straight through to psycopg2.connect().
    'connect_timeout': 10,
    'keepalives_idle': 600,
    'keepalives_interval': 30,
    'keepalives_count': 3,
}
|
|
|
|
|
|
|
|
|
|
|
|
# Logging: each run writes to its own timestamped file and mirrors the same
# records to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: the log messages contain Cyrillic text and emoji,
        # which a non-UTF-8 locale default encoding cannot represent.
        logging.FileHandler(
            f'spb_robust_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            encoding='utf-8',
        ),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_html(html: str) -> str:
    """Strip markup from *html* and return its text on a single line.

    ``script``, ``style``, ``meta``, ``link`` and ``noscript`` elements are
    removed entirely; the remaining text nodes are joined with single spaces
    and every run of whitespace is collapsed to one space.

    Args:
        html: Raw HTML of a page. ``None`` or an empty value is tolerated.

    Returns:
        The cleaned text, or ``""`` when there is nothing to parse.
    """
    # Nothing to parse: return early instead of handing None to the parser
    # (presumably the html column can be NULL — verify against the schema).
    if not html:
        return ""

    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(['script', 'style', 'meta', 'link', 'noscript']):
        tag.decompose()

    text = soup.get_text(separator=' ', strip=True)
    return re.sub(r'\s+', ' ', text).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_connection(max_attempts: int = 3, retry_delay: float = 5):
    """Open a PostgreSQL connection from ``DB_CONFIG``, retrying on failure.

    Args:
        max_attempts: Total number of connection attempts (at least 1).
        retry_delay: Seconds to sleep between two attempts.

    Returns:
        The connection object returned by ``psycopg2.connect``.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        psycopg2.Error: The error of the last attempt when all attempts fail.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    for attempt in range(1, max_attempts + 1):
        try:
            return psycopg2.connect(**DB_CONFIG)
        except psycopg2.Error as e:
            # Only database errors are retried; programming errors such as a
            # malformed DB_CONFIG propagate immediately.
            logger.warning(f"Попытка {attempt} подключения: {e}")
            if attempt == max_attempts:
                raise
            time.sleep(retry_delay)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _close_quietly(conn):
    """Close *conn* if it is set, logging (not raising) any failure."""
    if conn is None:
        return
    try:
        conn.close()
    except Exception as e:
        logger.warning(f"Не удалось закрыть соединение: {e}")


def _fetch_unprocessed_batch(conn, after_raw_id, batch_size):
    """Return up to *batch_size* unprocessed SPb rows whose id is above *after_raw_id*."""
    with conn.cursor() as cur:
        # "%s IS NULL OR r.id > %s" lets the first call (after_raw_id=None)
        # start from the beginning without assuming the type of r.id.
        cur.execute('''
            SELECT
                r.id as raw_id,
                r.hotel_id,
                r.url,
                r.html
            FROM hotel_website_raw r
            JOIN hotel_main h ON h.id = r.hotel_id
            LEFT JOIN hotel_website_processed p ON p.hotel_id = r.hotel_id AND p.url = r.url
            WHERE h.region_name = 'г. Санкт-Петербург'
              AND p.hotel_id IS NULL
              AND (%s IS NULL OR r.id > %s)
            ORDER BY r.id
            LIMIT %s
        ''', (after_raw_id, after_raw_id, batch_size))
        return cur.fetchall()


def _store_batch(conn, batch):
    """Clean and upsert every row of *batch*; return ``(stored, failed)`` counts."""
    stored = 0
    failed = 0
    with conn.cursor() as cur:
        for raw_id, hotel_id, url, html in batch:
            # A failed statement aborts the whole PostgreSQL transaction, so
            # each row runs inside its own savepoint: one bad row must not
            # discard the rest of the batch.
            cur.execute("SAVEPOINT spb_row")
            try:
                cleaned_text = clean_html(html)
                cur.execute("""
                    INSERT INTO hotel_website_processed
                    (raw_page_id, hotel_id, url, cleaned_text, text_length, processed_at)
                    VALUES (%s, %s, %s, %s, %s, NOW())
                    ON CONFLICT (hotel_id, url) DO UPDATE SET
                        cleaned_text = EXCLUDED.cleaned_text,
                        text_length = EXCLUDED.text_length,
                        processed_at = EXCLUDED.processed_at
                """, (raw_id, hotel_id, url, cleaned_text, len(cleaned_text)))
                cur.execute("RELEASE SAVEPOINT spb_row")
            except Exception as e:
                logger.error(f"❌ Ошибка обработки {hotel_id} {url}: {e}")
                # If the connection itself is gone this raises as well and the
                # caller retries the whole batch on a fresh connection.
                cur.execute("ROLLBACK TO SAVEPOINT spb_row")
                cur.execute("RELEASE SAVEPOINT spb_row")
                failed += 1
            else:
                stored += 1
    return stored, failed


def _count_remaining(conn):
    """Return how many SPb raw pages still have no processed counterpart."""
    with conn.cursor() as cur:
        cur.execute('''
            SELECT COUNT(*)
            FROM hotel_website_raw r
            JOIN hotel_main h ON h.id = r.hotel_id
            LEFT JOIN hotel_website_processed p ON p.hotel_id = r.hotel_id AND p.url = r.url
            WHERE h.region_name = 'г. Санкт-Петербург'
              AND p.hotel_id IS NULL
        ''')
        return cur.fetchone()[0]


def _count_processed_hotels(conn):
    """Return the number of distinct SPb hotels present in the processed table."""
    with conn.cursor() as cur:
        cur.execute('''
            SELECT COUNT(DISTINCT p.hotel_id)
            FROM hotel_website_processed p
            JOIN hotel_main h ON h.id = p.hotel_id
            WHERE h.region_name = 'г. Санкт-Петербург'
        ''')
        return cur.fetchone()[0]


def process_spb_robust(batch_size: int = 50, max_consecutive_failures: int = 10):
    """Clean all unprocessed St. Petersburg pages in fault-tolerant batches.

    Each iteration opens a fresh connection, fetches the next batch of raw
    pages ordered by id, cleans and upserts every page, commits, and logs the
    progress. A row that fails is logged and skipped for the rest of the run;
    a failure of the iteration itself (for example a lost connection) makes
    the same batch be retried after a pause.

    Args:
        batch_size: Maximum number of pages handled per transaction (>= 1).
        max_consecutive_failures: Number of iterations that may fail in a row
            before the job gives up and re-raises the last error.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: The last iteration error once ``max_consecutive_failures``
            is reached, or any error of the final statistics query.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    conn = None
    total_processed = 0
    total_failed = 0
    last_raw_id = None  # keyset cursor: highest raw id already committed
    consecutive_failures = 0

    try:
        while True:
            try:
                # A fresh connection for every iteration.
                if conn:
                    conn.close()
                    conn = None
                conn = get_connection()

                batch = _fetch_unprocessed_batch(conn, last_raw_id, batch_size)
                if not batch:
                    logger.info("🎉 Все данные обработаны!")
                    break

                logger.info(f"📦 Обрабатываю пачку: {len(batch)} страниц")

                stored, failed = _store_batch(conn, batch)
                conn.commit()

                # Counters and the keyset cursor move only after a successful
                # commit, so a retried batch is never counted twice and rows
                # that failed are not selected again (no endless loop).
                total_processed += stored
                total_failed += failed
                last_raw_id = batch[-1][0]
                consecutive_failures = 0

                remaining = _count_remaining(conn)
                total_pages = total_processed + remaining
                percent = total_processed / total_pages * 100 if total_pages > 0 else 0
                logger.info(f"✅ Обработано: {total_processed}/{total_pages} ({percent:.1f}%)")

                # Short pause between batches.
                time.sleep(1)

            except Exception as e:
                logger.error(f"❌ Ошибка в цикле: {e}")
                _close_quietly(conn)
                conn = None

                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    logger.error(
                        f"❌ Превышено число ошибок подряд ({consecutive_failures}), остановка"
                    )
                    raise
                time.sleep(10)  # pause before retrying the same batch

        if total_failed:
            logger.warning(f"⚠️ Пропущено страниц из-за ошибок: {total_failed}")

        # Final statistics; reuse the connection left open by the loop rather
        # than leaking it behind a new one.
        if conn is None:
            conn = get_connection()
        processed_hotels = _count_processed_hotels(conn)
        logger.info(f"🎉 ЗАВЕРШЕНО! Отелей СПб в processed: {processed_hotels}")

    finally:
        if conn:
            conn.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: start the St. Petersburg processing job only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    process_spb_robust()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-10-27 22:49:42 +03:00
|
|
|
|
|