Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py - Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py - РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py - Отчёты: create_orel_horizontal_report.py - Обработка: process_all_hotels_embeddings.py - Документация: README.md, DB_SCHEMA_REFERENCE.md
This commit is contained in:
200
process_raw_to_cleaned.py
Normal file
200
process_raw_to_cleaned.py
Normal file
@@ -0,0 +1,200 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Обработка сырого HTML в очищенный текст для аудита
|
||||
Из hotel_website_raw → hotel_website_processed
|
||||
"""
|
||||
|
||||
import logging
import os
import re
from datetime import datetime
from urllib.parse import unquote

import psycopg2
from psycopg2.extras import RealDictCursor, Json
from bs4 import BeautifulSoup
|
||||
|
||||
# Logging setup: every run writes to its own timestamped log file and mirrors
# the same records to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # The log messages contain Cyrillic text and emoji, so the file is
        # opened as UTF-8 explicitly instead of relying on the locale default
        # (which raises UnicodeEncodeError on non-UTF-8 systems).
        logging.FileHandler(
            f'process_raw_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            encoding='utf-8',
        ),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
||||
|
||||
# Database connection settings, passed as keyword arguments to
# psycopg2.connect().
#
# Every setting can be overridden with a HOTEL_DB_* environment variable so
# that credentials do not have to live in the source tree.
# SECURITY NOTE(review): the fallback values below are the credentials that
# were committed with this script; they are kept only so existing invocations
# keep working. Rotate the password and remove the fallbacks once the
# environment variables are provisioned.
DB_CONFIG = {
    'host': os.environ.get('HOTEL_DB_HOST', "147.45.189.234"),
    'port': int(os.environ.get('HOTEL_DB_PORT', '5432')),
    'database': os.environ.get('HOTEL_DB_NAME', "default_db"),
    'user': os.environ.get('HOTEL_DB_USER', "gen_user"),
    # The fallback is stored percent-encoded and decoded at import time.
    'password': (
        os.environ.get('HOTEL_DB_PASSWORD')
        or unquote("2~~9_%5EkVsU%3F2%5CS")
    ),
}
|
||||
|
||||
|
||||
class HTMLProcessor:
    """Turn raw HTML into plain text and extract simple facts from that text.

    Both operations are stateless static methods, so the class is used purely
    as a namespace and never needs to be instantiated.
    """

    # Up to three separator characters may sit between digit groups so that
    # the common "+7 (495) 123-45-67" layout (space + parenthesis) matches.
    _PHONE_SEP = r'[\s\-()]{0,3}'

    # The (?<!\d) / (?!\d) guards stop a phone match from being carved out of
    # a longer digit run such as a 12-digit INN starting with 7 or 8.
    _PHONE_PATTERNS = (
        re.compile(
            r'(?<!\d)\+?[78]' + _PHONE_SEP
            + r'\d{3}' + _PHONE_SEP
            + r'\d{3}' + _PHONE_SEP
            + r'\d{2}' + _PHONE_SEP
            + r'\d{2}(?!\d)'
        ),
        # Toll-free 8-800 numbers.
        re.compile(r'(?<!\d)8[\s\-]?800[\s\-]?\d{3}[\s\-]?\d{2}[\s\-]?\d{2}(?!\d)'),
    )

    _EMAIL_PATTERN = re.compile(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    )
    # INN: a standalone run of exactly 10 or 12 digits.
    _INN_PATTERN = re.compile(r'\b(?:\d{10}|\d{12})\b')
    # OGRN: a standalone run of exactly 13 or 15 digits.
    _OGRN_PATTERN = re.compile(r'\b(?:\d{13}|\d{15})\b')
    # Prices: a number (digits, optionally space-grouped) followed by a
    # rouble marker.
    _PRICE_PATTERN = re.compile(r'(\d[\d\s]*\d)\s*(?:руб|₽|рублей)')

    # Upper bound on the number of distinct values kept per category.
    _MAX_VALUES = 10

    @staticmethod
    def clean_html(html: str) -> str:
        """Strip markup from *html* and return normalised plain text.

        ``<script>``, ``<style>`` and ``<noscript>`` elements are removed
        together with their content. In the remaining text every run of
        whitespace inside a line is collapsed to a single space, lines are
        trimmed, and blank lines are dropped. Line breaks between non-empty
        lines are preserved.

        Args:
            html: Raw HTML document; may be empty or ``None``.

        Returns:
            The cleaned text, or an empty string when *html* is falsy.
        """
        if not html:
            return ""

        soup = BeautifulSoup(html, 'html.parser')

        # Drop elements whose content is never user-visible text.
        for tag in soup.find_all(['script', 'style', 'noscript']):
            tag.decompose()

        text = soup.get_text()

        # Normalise whitespace per line. Collapsing all whitespace globally
        # first (as was done before) also removed every newline and left the
        # line-based clean-up below with nothing to do.
        lines = (
            re.sub(r'\s+', ' ', line).strip() for line in text.splitlines()
        )
        return '\n'.join(line for line in lines if line)

    @staticmethod
    def extract_structured_data(text: str) -> dict:
        """Extract contact and legal identifiers from plain text.

        Args:
            text: Plain text, typically the output of :meth:`clean_html`.

        Returns:
            A dict with the keys ``phones``, ``emails``, ``inns``, ``ogrn``,
            ``addresses`` and ``prices``. Each value is a list of at most 10
            unique strings in order of first appearance. ``addresses`` is
            always empty because no address extraction is implemented.
        """
        data = {
            'phones': [],
            'emails': [],
            'inns': [],
            'ogrn': [],
            'addresses': [],
            'prices': [],
        }

        for pattern in HTMLProcessor._PHONE_PATTERNS:
            data['phones'].extend(pattern.findall(text))

        data['emails'] = HTMLProcessor._EMAIL_PATTERN.findall(text)
        data['inns'] = HTMLProcessor._INN_PATTERN.findall(text)
        data['ogrn'] = HTMLProcessor._OGRN_PATTERN.findall(text)
        data['prices'] = HTMLProcessor._PRICE_PATTERN.findall(text)

        # De-duplicate while keeping first-seen order: going through a set
        # would make both the order and the choice of the retained values
        # vary between runs.
        limit = HTMLProcessor._MAX_VALUES
        for key, values in data.items():
            data[key] = list(dict.fromkeys(values))[:limit]

        return data
|
||||
|
||||
|
||||
def _process_hotel_pages(cur, hotel_id) -> int:
    """Clean and store every not-yet-processed raw page of one hotel.

    Args:
        cur: Open database cursor returning dict-like rows.
        hotel_id: Value of ``hotel_website_raw.hotel_id`` to process.

    Returns:
        The number of rows inserted into ``hotel_website_processed``.
        The caller is responsible for committing.
    """
    cur.execute('''
        SELECT url, html, page_title
        FROM hotel_website_raw
        WHERE hotel_id = %s
        ORDER BY depth, crawled_at
    ''', (hotel_id,))
    pages = cur.fetchall()

    saved = 0
    for page in pages:
        # Check for an existing row first so that the HTML of pages that are
        # skipped anyway (e.g. the same URL stored more than once in the raw
        # table) is not parsed for nothing.
        cur.execute('''
            SELECT id FROM hotel_website_processed
            WHERE hotel_id = %s AND url = %s
        ''', (hotel_id, page['url']))
        if cur.fetchone():
            continue

        # psycopg2 refuses string parameters containing NUL characters, which
        # do occur in crawled pages; drop them so one bad page cannot abort
        # the whole run.
        cleaned_text = HTMLProcessor.clean_html(page['html']).replace('\x00', '')
        extracted_data = HTMLProcessor.extract_structured_data(cleaned_text)

        cur.execute('''
            INSERT INTO hotel_website_processed
            (hotel_id, url, cleaned_text, extracted_data, text_length, processed_at)
            VALUES (%s, %s, %s, %s, %s, %s)
        ''', (
            hotel_id,
            page['url'],
            cleaned_text,
            Json(extracted_data),
            len(cleaned_text),
            datetime.now(),
        ))
        saved += 1

    return saved


def process_region(region_name: str) -> None:
    """Process the raw crawled pages of all hotels in one region.

    Selects hotels whose ``region_name`` contains *region_name*
    (case-insensitive) and that have no rows in ``hotel_website_processed``
    yet, converts each of their raw pages to cleaned text plus extracted
    data, and stores the result. Work is committed once per hotel.

    Args:
        region_name: Region name, or a fragment of it, to match against
            ``hotel_main.region_name``.

    Raises:
        psycopg2.Error: On connection or query failure. Hotels committed
            before the failure stay committed.
    """
    conn = psycopg2.connect(**DB_CONFIG)

    try:
        # Using the cursor as a context manager guarantees it is closed and
        # avoids referencing an unbound name if cursor creation itself fails.
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Hotels of the region that have raw pages but nothing processed.
            cur.execute('''
                SELECT DISTINCT w.hotel_id, h.full_name
                FROM hotel_website_raw w
                JOIN hotel_main h ON h.id = w.hotel_id
                WHERE h.region_name ILIKE %s
                AND NOT EXISTS (
                    SELECT 1 FROM hotel_website_processed p
                    WHERE p.hotel_id = w.hotel_id
                )
                ORDER BY h.full_name
            ''', (f'%{region_name}%',))
            hotels = cur.fetchall()

            logger.info(f"\n{'='*70}")
            logger.info(f"🔄 ОБРАБОТКА СЫРЫХ ДАННЫХ: {region_name}")
            logger.info(f"📊 Отелей для обработки: {len(hotels)}")
            logger.info(f"{'='*70}\n")

            if not hotels:
                logger.info("✅ Все данные уже обработаны!")
                return

            processed = 0
            for i, hotel in enumerate(hotels, 1):
                logger.info(f"[{i}/{len(hotels)}] {hotel['full_name']}")

                saved = _process_hotel_pages(cur, hotel['hotel_id'])

                # Commit per hotel so an interruption loses at most the hotel
                # currently being processed.
                conn.commit()
                logger.info(f" ✓ Обработано {saved} страниц")
                processed += 1

            logger.info(f"\n{'='*70}")
            logger.info(f"✅ ГОТОВО! Обработано {processed} отелей")
            logger.info(f"{'='*70}")
    finally:
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # The region is taken from the first command-line argument; a missing or
    # blank argument falls back to the default region.
    cli_region = sys.argv[1].strip() if len(sys.argv) > 1 else ''
    region = cli_region or 'Камчатский край'

    logger.info(f"📍 Регион: {region}")
    process_region(region)
|
||||
|
||||
Reference in New Issue
Block a user