🚀 Full project sync: Hotels RAG & Audit System
✨ Major Features: - Complete RAG system for hotel website analysis - Hybrid audit with BGE-M3 embeddings + Natasha NER - Universal horizontal Excel reports with dashboards - Multi-region processing (SPb, Orel, Chukotka, Kamchatka) 📊 Completed Regions: - Орловская область: 100% (36/36) - Чукотский АО: 100% (4/4) - г. Санкт-Петербург: 93% (893/960) - Камчатский край: 87% (89/102) 🔧 Infrastructure: - PostgreSQL with pgvector extension - BGE-M3 embeddings API - Browserless for web scraping - N8N workflows for automation - S3/Nextcloud file storage 📝 Documentation: - Complete DB schemas - API documentation - Setup guides - Status reports
This commit is contained in:
268
audit_spb_retry.py
Executable file
268
audit_spb_retry.py
Executable file
@@ -0,0 +1,268 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Повторная обработка неудачных отелей СПб через n8n webhook
|
||||
"""
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
from urllib.parse import unquote
|
||||
|
||||
# Конфигурация
|
||||
DB_CONFIG = {
|
||||
'host': '147.45.189.234',
|
||||
'port': 5432,
|
||||
'database': 'default_db',
|
||||
'user': 'gen_user',
|
||||
'password': unquote('2~~9_%5EkVsU%3F2%5CS')
|
||||
}
|
||||
|
||||
N8N_WEBHOOK_URL = "https://n8n.clientright.pro/webhook/6be4a7b9-a016-4252-841f-0ebca367914f"
|
||||
REGION = 'г. Санкт-Петербург'
|
||||
AUDIT_VERSION = 'v1.0_with_rkn'
|
||||
|
||||
def get_failed_hotels():
|
||||
"""Получить список отелей, которые упали в ошибку"""
|
||||
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
|
||||
cur = conn.cursor()
|
||||
|
||||
# Все отели с чанками
|
||||
cur.execute("""
|
||||
SELECT DISTINCT metadata->>'hotel_id' as hotel_id
|
||||
FROM hotel_website_chunks
|
||||
WHERE metadata->>'hotel_id' IN (
|
||||
SELECT id::text FROM hotel_main WHERE region_name = %s
|
||||
)
|
||||
""", (REGION,))
|
||||
|
||||
all_hotels = {row['hotel_id'] for row in cur.fetchall()}
|
||||
|
||||
# Уже обработанные
|
||||
cur.execute("""
|
||||
SELECT hotel_id::text
|
||||
FROM hotel_audit_results
|
||||
WHERE region_name = %s AND audit_version = %s
|
||||
""", (REGION, AUDIT_VERSION))
|
||||
|
||||
processed_hotels = {row['hotel_id'] for row in cur.fetchall()}
|
||||
|
||||
# Неудачники
|
||||
failed_hotel_ids = list(all_hotels - processed_hotels)
|
||||
|
||||
# Получаем полную информацию о неудачниках
|
||||
cur.execute("""
|
||||
SELECT
|
||||
hm.id,
|
||||
hm.full_name as hotel_name,
|
||||
hm.website_address,
|
||||
hm.region_name,
|
||||
hm.rkn_registry_number as registry_number
|
||||
FROM hotel_main hm
|
||||
WHERE hm.id::text = ANY(%s)
|
||||
ORDER BY hm.full_name
|
||||
""", (failed_hotel_ids,))
|
||||
|
||||
hotels = cur.fetchall()
|
||||
|
||||
cur.close()
|
||||
conn.close()
|
||||
|
||||
return hotels
|
||||
|
||||
def get_hotel_chunks(hotel_id):
|
||||
"""Получить чанки для отеля"""
|
||||
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
|
||||
cur = conn.cursor()
|
||||
|
||||
cur.execute("""
|
||||
SELECT text as chunk_text, metadata
|
||||
FROM hotel_website_chunks
|
||||
WHERE metadata->>'hotel_id' = %s
|
||||
ORDER BY (metadata->>'page_number')::int, (metadata->>'chunk_index')::int
|
||||
""", (str(hotel_id),))
|
||||
|
||||
chunks = cur.fetchall()
|
||||
|
||||
cur.close()
|
||||
conn.close()
|
||||
|
||||
return chunks
|
||||
|
||||
def audit_hotel_via_webhook(hotel, chunks):
|
||||
"""Отправить отель на аудит через n8n webhook"""
|
||||
payload = {
|
||||
"hotel_id": str(hotel['id']),
|
||||
"hotel_name": hotel['hotel_name'],
|
||||
"website": hotel['website_address'] or "",
|
||||
"region_name": hotel['region_name'],
|
||||
"registry_number": hotel['registry_number'] or "",
|
||||
"chunks": [
|
||||
{
|
||||
"text": chunk['chunk_text'],
|
||||
"metadata": chunk['metadata']
|
||||
}
|
||||
for chunk in chunks
|
||||
]
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(
|
||||
N8N_WEBHOOK_URL,
|
||||
json=payload,
|
||||
timeout=180 # 3 минуты таймаут (для больших отелей)
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
try:
|
||||
result = response.json()
|
||||
# n8n может вернуть массив или объект
|
||||
if isinstance(result, list) and len(result) > 0:
|
||||
result = result[0]
|
||||
# Проверяем наличие нужных полей
|
||||
if not isinstance(result, dict):
|
||||
return False, f"Response is not a dict: {type(result)}, content: {str(result)[:200]}"
|
||||
|
||||
# Преобразуем структуру n8n в структуру для БД
|
||||
if 'found' in result and 'total_criteria' in result:
|
||||
# Новый формат от n8n
|
||||
result['total_score'] = result.get('found', 0)
|
||||
result['max_score'] = result.get('total_criteria', 17)
|
||||
result['score_percentage'] = result.get('compliance_percentage', 0.0)
|
||||
# Преобразуем criteria в criteria_results
|
||||
if 'criteria' in result:
|
||||
result['criteria_results'] = result['criteria']
|
||||
|
||||
if 'total_score' not in result:
|
||||
return False, f"Missing required fields in response: {json.dumps(result, ensure_ascii=False)[:200]}"
|
||||
return True, result
|
||||
except json.JSONDecodeError:
|
||||
return False, f"Invalid JSON response: {response.text[:100]}"
|
||||
else:
|
||||
return False, f"HTTP {response.status_code}: {response.text[:100]}"
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
return False, "Timeout after 180 seconds"
|
||||
except Exception as e:
|
||||
return False, str(e)
|
||||
|
||||
def save_audit_result(hotel, audit_result):
|
||||
"""Сохранить результат аудита"""
|
||||
conn = psycopg2.connect(**DB_CONFIG)
|
||||
cur = conn.cursor()
|
||||
|
||||
try:
|
||||
cur.execute("""
|
||||
INSERT INTO hotel_audit_results (
|
||||
hotel_id,
|
||||
hotel_name,
|
||||
website,
|
||||
region_name,
|
||||
total_score,
|
||||
max_score,
|
||||
score_percentage,
|
||||
criteria_results,
|
||||
audit_version
|
||||
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||||
ON CONFLICT (hotel_id, audit_version)
|
||||
DO UPDATE SET
|
||||
total_score = EXCLUDED.total_score,
|
||||
max_score = EXCLUDED.max_score,
|
||||
score_percentage = EXCLUDED.score_percentage,
|
||||
criteria_results = EXCLUDED.criteria_results
|
||||
""", (
|
||||
hotel['id'],
|
||||
hotel['hotel_name'],
|
||||
hotel['website_address'],
|
||||
hotel['region_name'],
|
||||
audit_result['total_score'],
|
||||
audit_result['max_score'],
|
||||
audit_result['score_percentage'],
|
||||
json.dumps(audit_result['criteria_results'], ensure_ascii=False),
|
||||
AUDIT_VERSION
|
||||
))
|
||||
|
||||
conn.commit()
|
||||
return True
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
print(f" ❌ Ошибка сохранения: {e}")
|
||||
return False
|
||||
finally:
|
||||
cur.close()
|
||||
conn.close()
|
||||
|
||||
def main():
|
||||
print("🔄 ПОВТОРНАЯ ОБРАБОТКА НЕУДАЧНЫХ ОТЕЛЕЙ СПб")
|
||||
print("=" * 60)
|
||||
|
||||
# Получаем неудачников
|
||||
failed_hotels = get_failed_hotels()
|
||||
total = len(failed_hotels)
|
||||
|
||||
print(f"📊 Найдено неудачных отелей: {total}")
|
||||
print(f"🚀 Начинаем обработку через n8n webhook...")
|
||||
print()
|
||||
|
||||
success_count = 0
|
||||
error_count = 0
|
||||
start_time = time.time()
|
||||
|
||||
for idx, hotel in enumerate(failed_hotels, 1):
|
||||
print(f"[{idx}/{total}] {hotel['hotel_name']}")
|
||||
print(f" 🔗 {hotel['website_address'] or 'Нет сайта'}")
|
||||
|
||||
# Получаем чанки
|
||||
chunks = get_hotel_chunks(hotel['id'])
|
||||
print(f" 📦 Chunks: {len(chunks)}")
|
||||
|
||||
if not chunks:
|
||||
print(f" ⚠️ Нет чанков, пропускаем")
|
||||
error_count += 1
|
||||
continue
|
||||
|
||||
# Отправляем на аудит
|
||||
print(f" 🔍 Аудит: {hotel['hotel_name']}...")
|
||||
success, result = audit_hotel_via_webhook(hotel, chunks)
|
||||
|
||||
if success:
|
||||
# Сохраняем результат
|
||||
if save_audit_result(hotel, result):
|
||||
score = result['score_percentage']
|
||||
print(f" ✅ Успех! Балл: {score:.1f}%")
|
||||
success_count += 1
|
||||
else:
|
||||
print(f" ❌ Ошибка сохранения")
|
||||
error_count += 1
|
||||
else:
|
||||
print(f" ❌ Ошибка: {result}")
|
||||
error_count += 1
|
||||
|
||||
# Прогресс
|
||||
if idx % 10 == 0:
|
||||
elapsed = time.time() - start_time
|
||||
speed = idx / elapsed
|
||||
eta = (total - idx) / speed if speed > 0 else 0
|
||||
print(f"\n 📊 Прогресс: {idx}/{total} ({idx/total*100:.1f}%)")
|
||||
print(f" ⏱️ Скорость: {speed:.2f} отелей/сек")
|
||||
print(f" 🎯 ETA: {eta/60:.0f} минут\n")
|
||||
|
||||
print()
|
||||
|
||||
# Небольшая задержка между запросами
|
||||
time.sleep(0.5)
|
||||
|
||||
# Итоги
|
||||
print("\n" + "=" * 60)
|
||||
print("📊 ИТОГО:")
|
||||
print(f" ✅ Успешно: {success_count}")
|
||||
print(f" ❌ Ошибок: {error_count}")
|
||||
print(f" 📝 Всего отелей обработано: {success_count}")
|
||||
print(f" ⏱️ Время работы: {(time.time() - start_time)/60:.1f} минут")
|
||||
print()
|
||||
print("🎉 Повторная обработка завершена!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user