✨ Major Features: - Complete RAG system for hotel website analysis - Hybrid audit with BGE-M3 embeddings + Natasha NER - Universal horizontal Excel reports with dashboards - Multi-region processing (SPb, Orel, Chukotka, Kamchatka) 📊 Completed Regions: - Орловская область: 100% (36/36) - Чукотский АО: 100% (4/4) - г. Санкт-Петербург: 93% (893/960) - Камчатский край: 87% (89/102) 🔧 Infrastructure: - PostgreSQL with pgvector extension - BGE-M3 embeddings API - Browserless for web scraping - N8N workflows for automation - S3/Nextcloud file storage 📝 Documentation: - Complete DB schemas - API documentation - Setup guides - Status reports
269 lines
9.2 KiB
Python
Executable File
269 lines
9.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
Повторная обработка неудачных отелей СПб через n8n webhook
|
||
"""
|
||
|
||
import psycopg2
|
||
from psycopg2.extras import RealDictCursor
|
||
import requests
|
||
import json
|
||
import time
|
||
from urllib.parse import unquote
|
||
|
||
# Конфигурация
|
||
DB_CONFIG = {
|
||
'host': '147.45.189.234',
|
||
'port': 5432,
|
||
'database': 'default_db',
|
||
'user': 'gen_user',
|
||
'password': unquote('2~~9_%5EkVsU%3F2%5CS')
|
||
}
|
||
|
||
N8N_WEBHOOK_URL = "https://n8n.clientright.pro/webhook/6be4a7b9-a016-4252-841f-0ebca367914f"
|
||
REGION = 'г. Санкт-Петербург'
|
||
AUDIT_VERSION = 'v1.0_with_rkn'
|
||
|
||
def get_failed_hotels():
|
||
"""Получить список отелей, которые упали в ошибку"""
|
||
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
|
||
cur = conn.cursor()
|
||
|
||
# Все отели с чанками
|
||
cur.execute("""
|
||
SELECT DISTINCT metadata->>'hotel_id' as hotel_id
|
||
FROM hotel_website_chunks
|
||
WHERE metadata->>'hotel_id' IN (
|
||
SELECT id::text FROM hotel_main WHERE region_name = %s
|
||
)
|
||
""", (REGION,))
|
||
|
||
all_hotels = {row['hotel_id'] for row in cur.fetchall()}
|
||
|
||
# Уже обработанные
|
||
cur.execute("""
|
||
SELECT hotel_id::text
|
||
FROM hotel_audit_results
|
||
WHERE region_name = %s AND audit_version = %s
|
||
""", (REGION, AUDIT_VERSION))
|
||
|
||
processed_hotels = {row['hotel_id'] for row in cur.fetchall()}
|
||
|
||
# Неудачники
|
||
failed_hotel_ids = list(all_hotels - processed_hotels)
|
||
|
||
# Получаем полную информацию о неудачниках
|
||
cur.execute("""
|
||
SELECT
|
||
hm.id,
|
||
hm.full_name as hotel_name,
|
||
hm.website_address,
|
||
hm.region_name,
|
||
hm.rkn_registry_number as registry_number
|
||
FROM hotel_main hm
|
||
WHERE hm.id::text = ANY(%s)
|
||
ORDER BY hm.full_name
|
||
""", (failed_hotel_ids,))
|
||
|
||
hotels = cur.fetchall()
|
||
|
||
cur.close()
|
||
conn.close()
|
||
|
||
return hotels
|
||
|
||
def get_hotel_chunks(hotel_id):
|
||
"""Получить чанки для отеля"""
|
||
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
|
||
cur = conn.cursor()
|
||
|
||
cur.execute("""
|
||
SELECT text as chunk_text, metadata
|
||
FROM hotel_website_chunks
|
||
WHERE metadata->>'hotel_id' = %s
|
||
ORDER BY (metadata->>'page_number')::int, (metadata->>'chunk_index')::int
|
||
""", (str(hotel_id),))
|
||
|
||
chunks = cur.fetchall()
|
||
|
||
cur.close()
|
||
conn.close()
|
||
|
||
return chunks
|
||
|
||
def audit_hotel_via_webhook(hotel, chunks):
|
||
"""Отправить отель на аудит через n8n webhook"""
|
||
payload = {
|
||
"hotel_id": str(hotel['id']),
|
||
"hotel_name": hotel['hotel_name'],
|
||
"website": hotel['website_address'] or "",
|
||
"region_name": hotel['region_name'],
|
||
"registry_number": hotel['registry_number'] or "",
|
||
"chunks": [
|
||
{
|
||
"text": chunk['chunk_text'],
|
||
"metadata": chunk['metadata']
|
||
}
|
||
for chunk in chunks
|
||
]
|
||
}
|
||
|
||
try:
|
||
response = requests.post(
|
||
N8N_WEBHOOK_URL,
|
||
json=payload,
|
||
timeout=180 # 3 минуты таймаут (для больших отелей)
|
||
)
|
||
|
||
if response.status_code == 200:
|
||
try:
|
||
result = response.json()
|
||
# n8n может вернуть массив или объект
|
||
if isinstance(result, list) and len(result) > 0:
|
||
result = result[0]
|
||
# Проверяем наличие нужных полей
|
||
if not isinstance(result, dict):
|
||
return False, f"Response is not a dict: {type(result)}, content: {str(result)[:200]}"
|
||
|
||
# Преобразуем структуру n8n в структуру для БД
|
||
if 'found' in result and 'total_criteria' in result:
|
||
# Новый формат от n8n
|
||
result['total_score'] = result.get('found', 0)
|
||
result['max_score'] = result.get('total_criteria', 17)
|
||
result['score_percentage'] = result.get('compliance_percentage', 0.0)
|
||
# Преобразуем criteria в criteria_results
|
||
if 'criteria' in result:
|
||
result['criteria_results'] = result['criteria']
|
||
|
||
if 'total_score' not in result:
|
||
return False, f"Missing required fields in response: {json.dumps(result, ensure_ascii=False)[:200]}"
|
||
return True, result
|
||
except json.JSONDecodeError:
|
||
return False, f"Invalid JSON response: {response.text[:100]}"
|
||
else:
|
||
return False, f"HTTP {response.status_code}: {response.text[:100]}"
|
||
|
||
except requests.exceptions.Timeout:
|
||
return False, "Timeout after 180 seconds"
|
||
except Exception as e:
|
||
return False, str(e)
|
||
|
||
def save_audit_result(hotel, audit_result):
|
||
"""Сохранить результат аудита"""
|
||
conn = psycopg2.connect(**DB_CONFIG)
|
||
cur = conn.cursor()
|
||
|
||
try:
|
||
cur.execute("""
|
||
INSERT INTO hotel_audit_results (
|
||
hotel_id,
|
||
hotel_name,
|
||
website,
|
||
region_name,
|
||
total_score,
|
||
max_score,
|
||
score_percentage,
|
||
criteria_results,
|
||
audit_version
|
||
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||
ON CONFLICT (hotel_id, audit_version)
|
||
DO UPDATE SET
|
||
total_score = EXCLUDED.total_score,
|
||
max_score = EXCLUDED.max_score,
|
||
score_percentage = EXCLUDED.score_percentage,
|
||
criteria_results = EXCLUDED.criteria_results
|
||
""", (
|
||
hotel['id'],
|
||
hotel['hotel_name'],
|
||
hotel['website_address'],
|
||
hotel['region_name'],
|
||
audit_result['total_score'],
|
||
audit_result['max_score'],
|
||
audit_result['score_percentage'],
|
||
json.dumps(audit_result['criteria_results'], ensure_ascii=False),
|
||
AUDIT_VERSION
|
||
))
|
||
|
||
conn.commit()
|
||
return True
|
||
except Exception as e:
|
||
conn.rollback()
|
||
print(f" ❌ Ошибка сохранения: {e}")
|
||
return False
|
||
finally:
|
||
cur.close()
|
||
conn.close()
|
||
|
||
def main():
|
||
print("🔄 ПОВТОРНАЯ ОБРАБОТКА НЕУДАЧНЫХ ОТЕЛЕЙ СПб")
|
||
print("=" * 60)
|
||
|
||
# Получаем неудачников
|
||
failed_hotels = get_failed_hotels()
|
||
total = len(failed_hotels)
|
||
|
||
print(f"📊 Найдено неудачных отелей: {total}")
|
||
print(f"🚀 Начинаем обработку через n8n webhook...")
|
||
print()
|
||
|
||
success_count = 0
|
||
error_count = 0
|
||
start_time = time.time()
|
||
|
||
for idx, hotel in enumerate(failed_hotels, 1):
|
||
print(f"[{idx}/{total}] {hotel['hotel_name']}")
|
||
print(f" 🔗 {hotel['website_address'] or 'Нет сайта'}")
|
||
|
||
# Получаем чанки
|
||
chunks = get_hotel_chunks(hotel['id'])
|
||
print(f" 📦 Chunks: {len(chunks)}")
|
||
|
||
if not chunks:
|
||
print(f" ⚠️ Нет чанков, пропускаем")
|
||
error_count += 1
|
||
continue
|
||
|
||
# Отправляем на аудит
|
||
print(f" 🔍 Аудит: {hotel['hotel_name']}...")
|
||
success, result = audit_hotel_via_webhook(hotel, chunks)
|
||
|
||
if success:
|
||
# Сохраняем результат
|
||
if save_audit_result(hotel, result):
|
||
score = result['score_percentage']
|
||
print(f" ✅ Успех! Балл: {score:.1f}%")
|
||
success_count += 1
|
||
else:
|
||
print(f" ❌ Ошибка сохранения")
|
||
error_count += 1
|
||
else:
|
||
print(f" ❌ Ошибка: {result}")
|
||
error_count += 1
|
||
|
||
# Прогресс
|
||
if idx % 10 == 0:
|
||
elapsed = time.time() - start_time
|
||
speed = idx / elapsed
|
||
eta = (total - idx) / speed if speed > 0 else 0
|
||
print(f"\n 📊 Прогресс: {idx}/{total} ({idx/total*100:.1f}%)")
|
||
print(f" ⏱️ Скорость: {speed:.2f} отелей/сек")
|
||
print(f" 🎯 ETA: {eta/60:.0f} минут\n")
|
||
|
||
print()
|
||
|
||
# Небольшая задержка между запросами
|
||
time.sleep(0.5)
|
||
|
||
# Итоги
|
||
print("\n" + "=" * 60)
|
||
print("📊 ИТОГО:")
|
||
print(f" ✅ Успешно: {success_count}")
|
||
print(f" ❌ Ошибок: {error_count}")
|
||
print(f" 📝 Всего отелей обработано: {success_count}")
|
||
print(f" ⏱️ Время работы: {(time.time() - start_time)/60:.1f} минут")
|
||
print()
|
||
print("🎉 Повторная обработка завершена!")
|
||
|
||
if __name__ == "__main__":
|
||
main()
|
||
|