195 lines
7.0 KiB
Python
195 lines
7.0 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Загрузка данных отелей в Graphiti для векторизации
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import asyncio
|
|||
|
|
import httpx
|
|||
|
|
import psycopg2
|
|||
|
|
from psycopg2.extras import RealDictCursor
|
|||
|
|
from urllib.parse import unquote
|
|||
|
|
from datetime import datetime
|
|||
|
|
import logging
|
|||
|
|
import os
|
|||
|
|
from bs4 import BeautifulSoup
|
|||
|
|
|
|||
|
|
# Logging setup: each run writes to its own timestamped log file and to the
# console stream.
LOG_FILE_NAME = f'graphiti_upload_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: the messages logged by this script contain Cyrillic
        # text and emoji, which a platform-default encoding may be unable to
        # encode.
        logging.FileHandler(LOG_FILE_NAME, encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
# Configuration.
# Connection settings can be overridden through environment variables; the
# literals below are the previously hard-coded values, kept as defaults so
# existing invocations keep working unchanged.
# NOTE(review): a database password is committed here as a default. It should
# be rotated and supplied only through HOTEL_DB_PASSWORD.
DB_CONFIG = {
    'host': os.getenv('HOTEL_DB_HOST', "147.45.189.234"),
    'port': int(os.getenv('HOTEL_DB_PORT', '5432')),
    'database': os.getenv('HOTEL_DB_NAME', "default_db"),
    'user': os.getenv('HOTEL_DB_USER', "gen_user"),
    # The default is stored percent-encoded, hence unquote(); a value taken
    # from the environment is used verbatim.
    'password': os.getenv('HOTEL_DB_PASSWORD') or unquote("2~~9_%5EkVsU%3F2%5CS"),
}

GRAPHITI_API = os.getenv('GRAPHITI_API_URL', "http://185.197.75.249:9200/upload")
PROXY = os.getenv('HTTP_PROXY', 'http://185.197.75.249:3128')
RATE_LIMIT_DELAY = 1  # Delay between uploads (value passed to asyncio.sleep)
|
|||
|
|
|
|||
|
|
|
|||
|
|
_MAX_TOTAL_CHARS = 50000  # upper bound on the text sent for one hotel
_MAX_PAGE_CHARS = 3000    # upper bound on the text kept from a single page
_MIN_PAGE_CHARS = 100     # pages with no more text than this are skipped


def _page_to_text(html) -> str:
    """Return the text of an HTML document with runs of whitespace collapsed.

    <script> and <style> elements are removed before text extraction.
    A missing or empty document yields an empty string.
    """
    if not html:
        return ''

    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()

    return ' '.join(soup.get_text().split())


def _build_hotel_text(hotel_data: dict, pages_data: list) -> str:
    """Build the text document for one hotel: a header plus page excerpts."""
    header = "\n".join([
        "",
        f"ОТЕЛЬ: {hotel_data['full_name']}",
        f"РЕГИОН: {hotel_data['region_name']}",
        f"САЙТ: {hotel_data['website_address']}",
        f"ИНН: {hotel_data['owner_inn'] or 'не указан'}",
        f"ТЕЛЕФОН: {hotel_data['phone'] or 'не указан'}",
        f"EMAIL: {hotel_data['email'] or 'не указан'}",
        "",
    ])

    if hotel_data.get('rkn_registry_status') == 'found':
        header += (
            f"РЕЕСТР РКН: ✅ Зарегистрирован "
            f"({hotel_data['rkn_registry_number']}, {hotel_data['rkn_registry_date']})\n"
        )
    else:
        header += "РЕЕСТР РКН: ❌ Не найден или неясен\n"

    text_parts = [header]

    # Append page excerpts until the per-hotel budget is used up.
    total_chars = 0
    for page in pages_data:
        if total_chars >= _MAX_TOTAL_CHARS:
            break

        clean_text = _page_to_text(page['html'])
        if len(clean_text) <= _MIN_PAGE_CHARS:
            # Nothing meaningful on this page (or no HTML stored for it).
            continue

        page_text = clean_text[:_MAX_PAGE_CHARS]
        text_parts.append(f"\n--- СТРАНИЦА: {page['url']} ---\n{page_text}")
        total_chars += len(page_text)

    # Final cap: header and separators are not counted in total_chars.
    return '\n\n'.join(text_parts)[:_MAX_TOTAL_CHARS]


async def upload_to_graphiti(hotel_data: dict, pages_data: list, group_id: str) -> dict:
    """Upload one hotel's data to Graphiti.

    Args:
        hotel_data: Mapping with the hotel fields read here: full_name,
            region_name, website_address, owner_inn, phone, email and the
            rkn_registry_* fields.
        pages_data: Sequence of mappings with at least 'html' and 'url'.
        group_id: Value sent as the "group_id" field of the request.

    Returns:
        {'success': True, 'result': <decoded JSON response>} when the API
        answers with HTTP 200, otherwise {'success': False, 'error': <text>}.
        Exceptions are logged and reported through the returned dict rather
        than raised, so one failing hotel does not stop the caller's loop.
    """
    try:
        full_text = _build_hotel_text(hotel_data, pages_data)

        payload = {
            "group_id": group_id,
            "title": f"Отель: {hotel_data['full_name']} ({hotel_data['region_name']})",
            "content": full_text
        }

        # Send directly, without a proxy. trust_env=False is required for
        # that: by default httpx picks up HTTP_PROXY / ALL_PROXY from the
        # environment and would route the request through the proxy.
        async with httpx.AsyncClient(timeout=120.0, trust_env=False) as client:
            response = await client.post(GRAPHITI_API, json=payload)

            if response.status_code == 200:
                result = response.json()
                logger.info(f" ✅ Загружено в Graphiti: {len(pages_data)} страниц")
                return {'success': True, 'result': result}

            logger.error(f" ✗ Ошибка Graphiti: {response.status_code}")
            return {'success': False, 'error': response.text}

    except Exception as e:
        # Deliberate best-effort boundary: report the failure to the caller.
        logger.error(f" ✗ Ошибка загрузки: {e}")
        return {'success': False, 'error': str(e)}
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def main():
    """Upload every hotel of one region to Graphiti.

    The region name is taken from the first command-line argument and
    defaults to 'Камчатский край'. Hotels whose region_name matches it
    (case-insensitive substring) and that have rows in hotel_website_raw are
    read from the database and uploaded one at a time, pausing
    RATE_LIMIT_DELAY between uploads. A summary is logged at the end.
    """
    import sys

    region = sys.argv[1] if len(sys.argv) > 1 else 'Камчатский край'
    # NOTE(review): spaces become underscores before .strip() runs, so the
    # default region yields "hotel_камчат_" with a trailing underscore. Left
    # unchanged so new uploads land in the same group as earlier ones.
    group_id = f"hotel_{region.lower().replace(' ', '_').replace('ский', '').replace('край', '').strip()}"

    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)

    try:
        # The cursor is managed by a context manager so that a failure while
        # creating it cannot hit an unbound name during cleanup and hide the
        # real error; the connection itself is closed in `finally`.
        with conn.cursor() as cur:
            # Hotels that have crawled website data.
            cur.execute('''
                SELECT DISTINCT h.id, h.full_name, h.region_name, h.website_address,
                       h.owner_inn, h.phone, h.email, h.rkn_registry_status,
                       h.rkn_registry_number, h.rkn_registry_date
                FROM hotel_main h
                JOIN hotel_website_raw w ON h.id = w.hotel_id
                WHERE h.region_name ILIKE %s
                ORDER BY h.full_name
            ''', (f'%{region}%',))

            hotels = cur.fetchall()

            logger.info(f"\n{'='*70}")
            logger.info(f"🚀 ЗАГРУЗКА В GRAPHITI: {region}")
            logger.info(f"📊 Отелей: {len(hotels)}")
            logger.info(f"🏷️ Group ID: {group_id}")
            logger.info(f"⏱️ Примерное время: {len(hotels) * RATE_LIMIT_DELAY / 60:.1f} минут")
            logger.info(f"{'='*70}\n")

            successful = 0
            failed = 0

            for i, hotel in enumerate(hotels, 1):
                logger.info(f"\n[{i}/{len(hotels)}] {'='*40}")
                logger.info(f"🏨 {hotel['full_name']}")
                logger.info(f"🌐 {hotel['website_address']}")

                # Crawled pages of this hotel, shallowest first.
                cur.execute('''
                    SELECT url, html, page_title
                    FROM hotel_website_raw
                    WHERE hotel_id = %s
                    ORDER BY depth, crawled_at
                ''', (hotel['id'],))

                pages = cur.fetchall()

                logger.info(f" 📄 Страниц: {len(pages)}")

                # Upload to Graphiti.
                result = await upload_to_graphiti(hotel, pages, group_id)

                if result['success']:
                    successful += 1
                else:
                    failed += 1

                # Pause between uploads.
                await asyncio.sleep(RATE_LIMIT_DELAY)

            # Summary.
            logger.info(f"\n{'='*70}")
            logger.info("📊 ИТОГИ ЗАГРУЗКИ:")
            logger.info(f" ✅ Успешно: {successful}/{len(hotels)}")
            logger.info(f" ✗ Ошибки: {failed}/{len(hotels)}")
            logger.info(f"{'='*70}")

    finally:
        conn.close()
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Script entry point: drive the async main() to completion on a fresh
    # event loop. Nothing runs when the module is merely imported.
    asyncio.run(main())
|
|||
|
|
|