#!/usr/bin/env python3
"""
Process raw HTML into cleaned text for auditing.

From hotel_website_raw → hotel_website_processed
"""

import logging
import os
import re
from datetime import datetime
from urllib.parse import unquote

import psycopg2
from bs4 import BeautifulSoup
from psycopg2.extras import Json, RealDictCursor

# Logging setup: every run writes to its own timestamped file in the current
# working directory and mirrors the same records to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # The messages logged by this script contain Cyrillic text and emoji,
        # so the file is opened as UTF-8 explicitly instead of relying on the
        # platform's default encoding.
        logging.FileHandler(
            f'process_raw_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            encoding='utf-8',
        ),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Database configuration (keyword arguments for psycopg2.connect).
#
# Every value can be overridden with the conventional libpq environment
# variables (PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD). The previously
# hard-coded values are kept as fallbacks so existing invocations keep working.
# NOTE(review): the fallback credentials live in source control; they should
# be rotated and the fallbacks removed once the environment is configured.
DB_CONFIG = {
    'host': os.environ.get('PGHOST', "147.45.189.234"),
    'port': int(os.environ.get('PGPORT', 5432)),
    'database': os.environ.get('PGDATABASE', "default_db"),
    'user': os.environ.get('PGUSER', "gen_user"),
    # The fallback password is stored percent-encoded and decoded here.
    'password': os.environ.get('PGPASSWORD', unquote("2~~9_%5EkVsU%3F2%5CS")),
}


class HTMLProcessor:
    """Turn raw HTML into plain text and pull simple structured data out of it."""

    # Maximum number of distinct values kept per extracted category.
    _MAX_VALUES = 10

    # Phone number: optional "+", a leading 7 or 8, then ten digits grouped
    # 3-3-2-2. Up to two separator characters are allowed around the first
    # group so that the common "+7 (914) 123-45-67" form (space followed by a
    # parenthesis) matches. The digit look-arounds stop the pattern from
    # matching a slice of a longer digit run such as a 12-digit INN.
    # Toll-free "8 800 ..." numbers are covered by this same pattern.
    _PHONE_RE = re.compile(
        r'(?<!\d)\+?[78][\s\-()]{0,2}\d{3}[\s\-()]{0,2}\d{3}'
        r'[\s\-()]?\d{2}[\s\-()]?\d{2}(?!\d)'
    )

    # E-mail address. The TLD class is letters only (a "|" inside a character
    # class is a literal pipe, not an alternation).
    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    @staticmethod
    def clean_html(html: str) -> str:
        """Strip markup from *html* and return its visible text.

        ``<script>``, ``<style>`` and ``<noscript>`` elements are removed
        before the text is extracted. Runs of spaces/tabs are collapsed to a
        single space, every line is trimmed and empty lines are dropped, so
        the result is newline-separated non-empty lines.

        Args:
            html: Raw HTML markup; ``None`` or an empty string is accepted.

        Returns:
            The cleaned text, or ``""`` when *html* is empty.
        """
        if not html:
            return ""

        soup = BeautifulSoup(html, 'html.parser')

        # Drop elements whose content is never visible page text.
        for tag in soup.find_all(['script', 'style', 'noscript']):
            tag.decompose()

        text = soup.get_text()

        # Collapse horizontal whitespace only. Collapsing *all* whitespace
        # here would also remove the newlines and make the per-line cleanup
        # below a no-op.
        text = re.sub(r'[^\S\n]+', ' ', text)

        stripped = (line.strip() for line in text.split('\n'))
        return '\n'.join(line for line in stripped if line)

    @staticmethod
    def extract_structured_data(text: str) -> dict:
        """Extract phones, e-mails, INN/OGRN-like numbers and prices from *text*.

        Args:
            text: Plain text to scan; ``None`` is treated as an empty string.

        Returns:
            A dict with the keys ``phones``, ``emails``, ``inns``, ``ogrn``,
            ``addresses`` and ``prices``. Each value is a list of at most 10
            distinct strings in order of first appearance. ``addresses`` is
            always empty: no address extraction is implemented.
        """
        text = text or ""

        # Digit runs of exactly 10 or 12 digits are treated as INN candidates.
        inns = [m for m in re.findall(r'\b\d{10,12}\b', text) if len(m) in (10, 12)]

        # Digit runs of exactly 13 or 15 digits are treated as OGRN candidates.
        ogrns = [m for m in re.findall(r'\b\d{13,15}\b', text) if len(m) in (13, 15)]

        found = {
            'phones': HTMLProcessor._PHONE_RE.findall(text),
            'emails': HTMLProcessor._EMAIL_RE.findall(text),
            'inns': inns,
            'ogrn': ogrns,
            'addresses': [],
            # Prices: a number (digits may be space-separated) followed by a
            # rouble marker; only the numeric part is captured.
            'prices': re.findall(r'(\d[\d\s]*\d)\s*(?:руб|₽|рублей)', text),
        }

        # De-duplicate while keeping first-appearance order so that the values
        # surviving the size cap are deterministic.
        limit = HTMLProcessor._MAX_VALUES
        return {
            key: list(dict.fromkeys(values))[:limit]
            for key, values in found.items()
        }


def process_region(region_name: str):
    """Clean and store the raw crawled pages of every unprocessed hotel in a region.

    Selects hotels whose ``hotel_main.region_name`` contains *region_name*
    (case-insensitive) and that have rows in ``hotel_website_raw`` but none in
    ``hotel_website_processed``. For each page of such a hotel the HTML is
    cleaned, structured data is extracted, and one row is inserted into
    ``hotel_website_processed``. Work is committed once per hotel.

    Args:
        region_name: Substring of the region name to match.

    Returns:
        None. Progress is reported through the module logger.
    """
    conn = psycopg2.connect(**DB_CONFIG)

    try:
        # The cursor is managed by ``with`` so it is always closed, and a
        # failure while creating it cannot surface as an unbound-variable
        # error during cleanup.
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Hotels in the region that have raw pages but nothing processed yet.
            cur.execute('''
                SELECT DISTINCT w.hotel_id, h.full_name
                FROM hotel_website_raw w
                JOIN hotel_main h ON h.id = w.hotel_id
                WHERE h.region_name ILIKE %s
                AND NOT EXISTS (
                    SELECT 1 FROM hotel_website_processed p
                    WHERE p.hotel_id = w.hotel_id
                )
                ORDER BY h.full_name
            ''', (f'%{region_name}%',))

            hotels = cur.fetchall()

            logger.info(f"\n{'='*70}")
            logger.info(f"🔄 ОБРАБОТКА СЫРЫХ ДАННЫХ: {region_name}")
            logger.info(f"📊 Отелей для обработки: {len(hotels)}")
            logger.info(f"{'='*70}\n")

            if not hotels:
                logger.info("✅ Все данные уже обработаны!")
                return

            processed = 0

            for i, hotel in enumerate(hotels, 1):
                logger.info(f"[{i}/{len(hotels)}] {hotel['full_name']}")

                # All crawled pages of this hotel.
                cur.execute('''
                    SELECT url, html, page_title
                    FROM hotel_website_raw
                    WHERE hotel_id = %s
                    ORDER BY depth, crawled_at
                ''', (hotel['hotel_id'],))

                pages = cur.fetchall()

                for page in pages:
                    # Skip URLs already stored for this hotel. Checking first
                    # avoids parsing HTML for pages that would be discarded.
                    cur.execute('''
                        SELECT id FROM hotel_website_processed
                        WHERE hotel_id = %s AND url = %s
                    ''', (hotel['hotel_id'], page['url']))

                    if cur.fetchone():
                        continue

                    cleaned_text = HTMLProcessor.clean_html(page['html'])
                    extracted_data = HTMLProcessor.extract_structured_data(cleaned_text)

                    cur.execute('''
                        INSERT INTO hotel_website_processed
                        (hotel_id, url, cleaned_text, extracted_data, text_length, processed_at)
                        VALUES (%s, %s, %s, %s, %s, %s)
                    ''', (
                        hotel['hotel_id'],
                        page['url'],
                        cleaned_text,
                        Json(extracted_data),
                        len(cleaned_text),
                        datetime.now()
                    ))

                # One commit per hotel keeps completed hotels if a later one fails.
                conn.commit()
                logger.info(f" ✓ Обработано {len(pages)} страниц")
                processed += 1

            logger.info(f"\n{'='*70}")
            logger.info(f"✅ ГОТОВО! Обработано {processed} отелей")
            logger.info(f"{'='*70}")

    finally:
        conn.close()


if __name__ == "__main__":
    import sys

    # The first command-line argument, when given, selects the region;
    # otherwise the default region is used.
    cli_args = sys.argv[1:]
    if cli_args:
        region = cli_args[0]
    else:
        region = 'Камчатский край'

    logger.info(f"📍 Регион: {region}")
    process_region(region)