Related project files — Crawlers: smart_crawler.py, regional_crawler.py; Audit: audit_orel_to_excel.py, audit_chukotka_to_excel.py; RKN checks: check_rkn_registry.py, recheck_unclear_rkn.py; Reports: create_orel_horizontal_report.py; Processing: process_all_hotels_embeddings.py; Docs: README.md, DB_SCHEMA_REFERENCE.md
201 lines · 6.5 KiB · Python
#!/usr/bin/env python3
|
||
"""
|
||
Обработка сырого HTML в очищенный текст для аудита
|
||
Из hotel_website_raw → hotel_website_processed
|
||
"""
|
||
|
||
import logging
import os
import re
from datetime import datetime
from urllib.parse import unquote

import psycopg2
from psycopg2.extras import RealDictCursor, Json
from bs4 import BeautifulSoup
|
||
|
||
# Настройка логирования
|
||
logging.basicConfig(
|
||
level=logging.INFO,
|
||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||
handlers=[
|
||
logging.FileHandler(f'process_raw_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
|
||
logging.StreamHandler()
|
||
]
|
||
)
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# Конфигурация БД
|
||
DB_CONFIG = {
|
||
'host': "147.45.189.234",
|
||
'port': 5432,
|
||
'database': "default_db",
|
||
'user': "gen_user",
|
||
'password': unquote("2~~9_%5EkVsU%3F2%5CS")
|
||
}
|
||
|
||
|
||
class HTMLProcessor:
|
||
"""Обработка HTML в чистый текст"""
|
||
|
||
@staticmethod
|
||
def clean_html(html: str) -> str:
|
||
"""Очистка HTML"""
|
||
if not html:
|
||
return ""
|
||
|
||
soup = BeautifulSoup(html, 'html.parser')
|
||
|
||
# Удаляем скрипты и стили
|
||
for tag in soup.find_all(['script', 'style', 'noscript']):
|
||
tag.decompose()
|
||
|
||
# Получаем текст
|
||
text = soup.get_text()
|
||
|
||
# Очистка
|
||
text = re.sub(r'\s+', ' ', text)
|
||
text = re.sub(r'\n\s*\n', '\n', text)
|
||
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
||
|
||
return '\n'.join(lines)
|
||
|
||
@staticmethod
|
||
def extract_structured_data(text: str) -> dict:
|
||
"""Извлечение структурированных данных"""
|
||
data = {
|
||
'phones': [],
|
||
'emails': [],
|
||
'inns': [],
|
||
'ogrn': [],
|
||
'addresses': [],
|
||
'prices': []
|
||
}
|
||
|
||
# Телефоны
|
||
phone_patterns = [
|
||
r'\+?[78][\s\-\(\)]?\d{3}[\s\-\(\)]?\d{3}[\s\-\(\)]?\d{2}[\s\-\(\)]?\d{2}',
|
||
r'8[\s\-]?800[\s\-]?\d{3}[\s\-]?\d{2}[\s\-]?\d{2}'
|
||
]
|
||
for pattern in phone_patterns:
|
||
data['phones'].extend(re.findall(pattern, text))
|
||
|
||
# Email
|
||
data['emails'] = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
|
||
|
||
# ИНН
|
||
inns = re.findall(r'\b\d{10,12}\b', text)
|
||
data['inns'] = [inn for inn in inns if len(inn) in [10, 12]]
|
||
|
||
# ОГРН
|
||
ogrns = re.findall(r'\b\d{13,15}\b', text)
|
||
data['ogrn'] = [ogrn for ogrn in ogrns if len(ogrn) in [13, 15]]
|
||
|
||
# Цены (руб, ₽)
|
||
data['prices'] = re.findall(r'(\d[\d\s]*\d)\s*(?:руб|₽|рублей)', text)
|
||
|
||
# Удаляем дубликаты
|
||
for key in data:
|
||
data[key] = list(set(data[key]))[:10] # Максимум 10 значений
|
||
|
||
return data
|
||
|
||
|
||
def process_region(region_name: str):
|
||
"""Обработка региона"""
|
||
|
||
conn = psycopg2.connect(**DB_CONFIG)
|
||
|
||
try:
|
||
cur = conn.cursor(cursor_factory=RealDictCursor)
|
||
|
||
# Получаем сырые данные для обработки
|
||
cur.execute('''
|
||
SELECT DISTINCT w.hotel_id, h.full_name
|
||
FROM hotel_website_raw w
|
||
JOIN hotel_main h ON h.id = w.hotel_id
|
||
WHERE h.region_name ILIKE %s
|
||
AND NOT EXISTS (
|
||
SELECT 1 FROM hotel_website_processed p
|
||
WHERE p.hotel_id = w.hotel_id
|
||
)
|
||
ORDER BY h.full_name
|
||
''', (f'%{region_name}%',))
|
||
|
||
hotels = cur.fetchall()
|
||
|
||
logger.info(f"\n{'='*70}")
|
||
logger.info(f"🔄 ОБРАБОТКА СЫРЫХ ДАННЫХ: {region_name}")
|
||
logger.info(f"📊 Отелей для обработки: {len(hotels)}")
|
||
logger.info(f"{'='*70}\n")
|
||
|
||
if len(hotels) == 0:
|
||
logger.info("✅ Все данные уже обработаны!")
|
||
return
|
||
|
||
processed = 0
|
||
|
||
for i, hotel in enumerate(hotels, 1):
|
||
logger.info(f"[{i}/{len(hotels)}] {hotel['full_name']}")
|
||
|
||
# Получаем все страницы отеля
|
||
cur.execute('''
|
||
SELECT url, html, page_title
|
||
FROM hotel_website_raw
|
||
WHERE hotel_id = %s
|
||
ORDER BY depth, crawled_at
|
||
''', (hotel['hotel_id'],))
|
||
|
||
pages = cur.fetchall()
|
||
|
||
# Обрабатываем каждую страницу
|
||
for page in pages:
|
||
# Очищаем HTML
|
||
cleaned_text = HTMLProcessor.clean_html(page['html'])
|
||
|
||
# Извлекаем данные
|
||
extracted_data = HTMLProcessor.extract_structured_data(cleaned_text)
|
||
|
||
# Проверяем есть ли уже
|
||
cur.execute('''
|
||
SELECT id FROM hotel_website_processed
|
||
WHERE hotel_id = %s AND url = %s
|
||
''', (hotel['hotel_id'], page['url']))
|
||
|
||
if cur.fetchone():
|
||
continue # Уже обработано
|
||
|
||
# Сохраняем в processed
|
||
cur.execute('''
|
||
INSERT INTO hotel_website_processed
|
||
(hotel_id, url, cleaned_text, extracted_data, text_length, processed_at)
|
||
VALUES (%s, %s, %s, %s, %s, %s)
|
||
''', (
|
||
hotel['hotel_id'],
|
||
page['url'],
|
||
cleaned_text,
|
||
Json(extracted_data),
|
||
len(cleaned_text),
|
||
datetime.now()
|
||
))
|
||
|
||
conn.commit()
|
||
logger.info(f" ✓ Обработано {len(pages)} страниц")
|
||
processed += 1
|
||
|
||
logger.info(f"\n{'='*70}")
|
||
logger.info(f"✅ ГОТОВО! Обработано {processed} отелей")
|
||
logger.info(f"{'='*70}")
|
||
|
||
finally:
|
||
cur.close()
|
||
conn.close()
|
||
|
||
|
||
if __name__ == "__main__":
|
||
import sys
|
||
|
||
region = sys.argv[1] if len(sys.argv) > 1 else 'Камчатский край'
|
||
|
||
logger.info(f"📍 Регион: {region}")
|
||
process_region(region)
|
||
|