✨ Major Features: - Complete RAG system for hotel website analysis - Hybrid audit with BGE-M3 embeddings + Natasha NER - Universal horizontal Excel reports with dashboards - Multi-region processing (SPb, Orel, Chukotka, Kamchatka) 📊 Completed Regions: - Орловская область: 100% (36/36) - Чукотский АО: 100% (4/4) - г. Санкт-Петербург: 93% (893/960) - Камчатский край: 87% (89/102) 🔧 Infrastructure: - PostgreSQL with pgvector extension - BGE-M3 embeddings API - Browserless for web scraping - N8N workflows for automation - S3/Nextcloud file storage 📝 Documentation: - Complete DB schemas - API documentation - Setup guides - Status reports
220 lines
8.3 KiB
Python
220 lines
8.3 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Аудит отелей Санкт-Петербурга через n8n webhook + сохранение в Excel
|
||
"""
|
||
|
||
import psycopg2
|
||
from psycopg2.extras import RealDictCursor
|
||
from urllib.parse import unquote
|
||
import requests
|
||
import time
|
||
import json
|
||
from datetime import datetime
|
||
|
||
DB_CONFIG = {
|
||
'host': '147.45.189.234',
|
||
'port': 5432,
|
||
'database': 'default_db',
|
||
'user': 'gen_user',
|
||
'password': unquote('2~~9_%5EkVsU%3F2%5CS')
|
||
}
|
||
|
||
WEBHOOK_URL = "https://n8n.clientright.pro/webhook/6be4a7b9-a016-4252-841f-0ebca367914f"
|
||
|
||
def get_orel_hotels():
|
||
"""Получить отели Санкт-Петербурга с chunks и данными РКН"""
|
||
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
|
||
cur = conn.cursor()
|
||
|
||
cur.execute("""
|
||
SELECT DISTINCT
|
||
h.id::text AS hotel_id,
|
||
h.full_name AS hotel_name,
|
||
h.region_name,
|
||
h.website_address,
|
||
h.rkn_registry_status,
|
||
h.rkn_registry_number,
|
||
h.rkn_registry_date,
|
||
h.rkn_checked_at,
|
||
COUNT(hwc.id) AS chunks_count
|
||
FROM hotel_main h
|
||
LEFT JOIN hotel_website_chunks hwc ON hwc.metadata->>'hotel_id' = h.id::text
|
||
WHERE h.region_name = 'г. Санкт-Петербург'
|
||
GROUP BY h.id, h.full_name, h.region_name, h.website_address,
|
||
h.rkn_registry_status, h.rkn_registry_number, h.rkn_registry_date, h.rkn_checked_at
|
||
ORDER BY h.full_name
|
||
""")
|
||
|
||
hotels = cur.fetchall()
|
||
cur.close()
|
||
conn.close()
|
||
|
||
return hotels
|
||
|
||
def save_audit_to_db(hotel_id: str, hotel_name: str, region: str, audit_result: dict):
|
||
"""Сохранить результаты аудита в БД"""
|
||
try:
|
||
conn = psycopg2.connect(**DB_CONFIG)
|
||
cur = conn.cursor()
|
||
|
||
# Формируем данные для сохранения
|
||
criteria_results = audit_result.get('criteria_results', [])
|
||
total_score = audit_result.get('found', 0)
|
||
max_score = audit_result.get('total_criteria', 17)
|
||
score_percentage = audit_result.get('compliance_percentage', 0)
|
||
website = audit_result.get('website', '')
|
||
has_website = bool(website and website != 'НЕТ САЙТА')
|
||
|
||
# Добавляем РКН данные в criteria_results для полноты
|
||
rkn_criterion = {
|
||
'criterion_id': 6,
|
||
'criterion_name': 'РКН Реестр',
|
||
'found': audit_result.get('rkn_status', '').lower() == 'found',
|
||
'rkn_status': audit_result.get('rkn_status'),
|
||
'rkn_number': audit_result.get('rkn_number'),
|
||
'rkn_date': audit_result.get('rkn_date')
|
||
}
|
||
|
||
# Вставляем РКН критерий на позицию 6 (после критерия 5)
|
||
criteria_with_rkn = criteria_results[:5] + [rkn_criterion] + criteria_results[5:]
|
||
|
||
# Сохраняем в БД (обновляем если уже есть)
|
||
cur.execute("""
|
||
INSERT INTO hotel_audit_results (
|
||
hotel_id, region_name, hotel_name, website, has_website,
|
||
criteria_results, total_score, max_score, score_percentage,
|
||
audit_version
|
||
) VALUES (
|
||
%s, %s, %s, %s, %s,
|
||
%s, %s, %s, %s,
|
||
'v1.0_with_rkn'
|
||
)
|
||
ON CONFLICT (hotel_id, audit_version)
|
||
DO UPDATE SET
|
||
region_name = EXCLUDED.region_name,
|
||
hotel_name = EXCLUDED.hotel_name,
|
||
website = EXCLUDED.website,
|
||
has_website = EXCLUDED.has_website,
|
||
criteria_results = EXCLUDED.criteria_results,
|
||
total_score = EXCLUDED.total_score,
|
||
max_score = EXCLUDED.max_score,
|
||
score_percentage = EXCLUDED.score_percentage,
|
||
audit_date = CURRENT_TIMESTAMP
|
||
""", (
|
||
hotel_id, region, hotel_name, website, has_website,
|
||
json.dumps(criteria_with_rkn, ensure_ascii=False),
|
||
total_score, max_score, score_percentage
|
||
))
|
||
|
||
conn.commit()
|
||
cur.close()
|
||
conn.close()
|
||
print(f" 💾 Сохранено в БД")
|
||
|
||
except Exception as e:
|
||
print(f" ⚠️ Ошибка сохранения в БД: {e}")
|
||
|
||
def audit_hotel(hotel_id: str, hotel_name: str) -> dict:
|
||
"""Запустить аудит отеля через webhook"""
|
||
try:
|
||
print(f" 🔍 Аудит: {hotel_name[:50]}...")
|
||
|
||
response = requests.post(
|
||
WEBHOOK_URL,
|
||
json={"hotel_id": hotel_id},
|
||
timeout=400 # 6+ минут таймаут для обхода Nginx
|
||
)
|
||
|
||
if response.status_code == 200:
|
||
data = response.json()
|
||
print(f" ✅ Готово! Найдено: {data[0]['found']}/{data[0]['total_criteria']}")
|
||
return data[0]
|
||
else:
|
||
print(f" ❌ Ошибка {response.status_code}: {response.text[:200]}")
|
||
return None
|
||
|
||
except requests.Timeout:
|
||
print(f" ⏱️ Таймаут (>400 сек)")
|
||
return None
|
||
except Exception as e:
|
||
print(f" ❌ Ошибка: {e}")
|
||
return None
|
||
|
||
def main():
|
||
print("🚀 ЗАПУСК АУДИТА САНКТ-ПЕТЕРБУРГА\n" + "="*60)
|
||
|
||
# Получаем отели
|
||
hotels = get_orel_hotels()
|
||
print(f"📊 Найдено отелей Санкт-Петербурга: {len(hotels)}")
|
||
|
||
# Разделяем на отели с chunks и без
|
||
hotels_with_chunks = [h for h in hotels if h['chunks_count'] > 0]
|
||
hotels_without_chunks = [h for h in hotels if h['chunks_count'] == 0]
|
||
|
||
print(f" ✅ С chunks: {len(hotels_with_chunks)}")
|
||
print(f" ⚠️ Без chunks: {len(hotels_without_chunks)}")
|
||
|
||
if hotels_without_chunks:
|
||
print(f"\n⚠️ Отели БЕЗ chunks (будут пропущены):")
|
||
for hotel in hotels_without_chunks[:10]:
|
||
print(f" - {hotel['hotel_name']}")
|
||
if len(hotels_without_chunks) > 10:
|
||
print(f" ... и еще {len(hotels_without_chunks) - 10}")
|
||
|
||
if not hotels_with_chunks:
|
||
print("\n❌ Нет отелей с chunks для аудита")
|
||
return
|
||
|
||
print(f"\n🎯 Будет проаудировано: {len(hotels_with_chunks)} отелей\n")
|
||
print("🚀 Запускаю аудит...\n")
|
||
|
||
# Аудитируем только отели с chunks
|
||
results = []
|
||
success_count = 0
|
||
error_count = 0
|
||
|
||
for idx, hotel in enumerate(hotels_with_chunks, 1):
|
||
print(f"\n[{idx}/{len(hotels_with_chunks)}] {hotel['hotel_name']}")
|
||
print(f" 🔗 {hotel['website_address'] or 'НЕТ САЙТА'}")
|
||
print(f" 📦 Chunks: {hotel['chunks_count']}")
|
||
|
||
audit_result = audit_hotel(hotel['hotel_id'], hotel['hotel_name'])
|
||
|
||
if audit_result:
|
||
audit_result['website'] = hotel['website_address'] or 'НЕТ САЙТА'
|
||
# Добавляем данные РКН
|
||
audit_result['rkn_status'] = hotel.get('rkn_registry_status')
|
||
audit_result['rkn_number'] = hotel.get('rkn_registry_number')
|
||
audit_result['rkn_date'] = hotel.get('rkn_registry_date')
|
||
audit_result['rkn_checked_at'] = hotel.get('rkn_checked_at')
|
||
|
||
# Сохраняем в БД
|
||
save_audit_to_db(
|
||
hotel['hotel_id'],
|
||
hotel['hotel_name'],
|
||
hotel['region_name'],
|
||
audit_result
|
||
)
|
||
|
||
results.append(audit_result)
|
||
success_count += 1
|
||
else:
|
||
error_count += 1
|
||
|
||
# Небольшая задержка между запросами
|
||
if idx < len(hotels_with_chunks):
|
||
time.sleep(2)
|
||
|
||
# Статистика
|
||
print("\n" + "="*60)
|
||
print(f"📊 ИТОГО:")
|
||
print(f" ✅ Успешно: {success_count}")
|
||
print(f" ❌ Ошибок: {error_count}")
|
||
print(f" 📝 Всего отелей обработано: {len(results)}")
|
||
print(f" 💾 Результаты сохранены в таблицу hotel_audit_results")
|
||
print(f"\n🎉 Аудит завершен!")
|
||
|
||
if __name__ == "__main__":
|
||
main()
|
||
|