Files
hotels/audit_spb_to_excel.py
Фёдор 684fada337 🚀 Full project sync: Hotels RAG & Audit System
 Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
2025-10-27 22:49:42 +03:00

220 lines
8.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Аудит отелей Санкт-Петербурга через n8n webhook + сохранение в Excel
"""
import psycopg2
from psycopg2.extras import RealDictCursor
from urllib.parse import unquote
import requests
import time
import json
from datetime import datetime
DB_CONFIG = {
'host': '147.45.189.234',
'port': 5432,
'database': 'default_db',
'user': 'gen_user',
'password': unquote('2~~9_%5EkVsU%3F2%5CS')
}
WEBHOOK_URL = "https://n8n.clientright.pro/webhook/6be4a7b9-a016-4252-841f-0ebca367914f"
def get_orel_hotels():
"""Получить отели Санкт-Петербурга с chunks и данными РКН"""
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
cur = conn.cursor()
cur.execute("""
SELECT DISTINCT
h.id::text AS hotel_id,
h.full_name AS hotel_name,
h.region_name,
h.website_address,
h.rkn_registry_status,
h.rkn_registry_number,
h.rkn_registry_date,
h.rkn_checked_at,
COUNT(hwc.id) AS chunks_count
FROM hotel_main h
LEFT JOIN hotel_website_chunks hwc ON hwc.metadata->>'hotel_id' = h.id::text
WHERE h.region_name = 'г. Санкт-Петербург'
GROUP BY h.id, h.full_name, h.region_name, h.website_address,
h.rkn_registry_status, h.rkn_registry_number, h.rkn_registry_date, h.rkn_checked_at
ORDER BY h.full_name
""")
hotels = cur.fetchall()
cur.close()
conn.close()
return hotels
def save_audit_to_db(hotel_id: str, hotel_name: str, region: str, audit_result: dict):
"""Сохранить результаты аудита в БД"""
try:
conn = psycopg2.connect(**DB_CONFIG)
cur = conn.cursor()
# Формируем данные для сохранения
criteria_results = audit_result.get('criteria_results', [])
total_score = audit_result.get('found', 0)
max_score = audit_result.get('total_criteria', 17)
score_percentage = audit_result.get('compliance_percentage', 0)
website = audit_result.get('website', '')
has_website = bool(website and website != 'НЕТ САЙТА')
# Добавляем РКН данные в criteria_results для полноты
rkn_criterion = {
'criterion_id': 6,
'criterion_name': 'РКН Реестр',
'found': audit_result.get('rkn_status', '').lower() == 'found',
'rkn_status': audit_result.get('rkn_status'),
'rkn_number': audit_result.get('rkn_number'),
'rkn_date': audit_result.get('rkn_date')
}
# Вставляем РКН критерий на позицию 6 (после критерия 5)
criteria_with_rkn = criteria_results[:5] + [rkn_criterion] + criteria_results[5:]
# Сохраняем в БД (обновляем если уже есть)
cur.execute("""
INSERT INTO hotel_audit_results (
hotel_id, region_name, hotel_name, website, has_website,
criteria_results, total_score, max_score, score_percentage,
audit_version
) VALUES (
%s, %s, %s, %s, %s,
%s, %s, %s, %s,
'v1.0_with_rkn'
)
ON CONFLICT (hotel_id, audit_version)
DO UPDATE SET
region_name = EXCLUDED.region_name,
hotel_name = EXCLUDED.hotel_name,
website = EXCLUDED.website,
has_website = EXCLUDED.has_website,
criteria_results = EXCLUDED.criteria_results,
total_score = EXCLUDED.total_score,
max_score = EXCLUDED.max_score,
score_percentage = EXCLUDED.score_percentage,
audit_date = CURRENT_TIMESTAMP
""", (
hotel_id, region, hotel_name, website, has_website,
json.dumps(criteria_with_rkn, ensure_ascii=False),
total_score, max_score, score_percentage
))
conn.commit()
cur.close()
conn.close()
print(f" 💾 Сохранено в БД")
except Exception as e:
print(f" ⚠️ Ошибка сохранения в БД: {e}")
def audit_hotel(hotel_id: str, hotel_name: str) -> dict:
"""Запустить аудит отеля через webhook"""
try:
print(f" 🔍 Аудит: {hotel_name[:50]}...")
response = requests.post(
WEBHOOK_URL,
json={"hotel_id": hotel_id},
timeout=400 # 6+ минут таймаут для обхода Nginx
)
if response.status_code == 200:
data = response.json()
print(f" ✅ Готово! Найдено: {data[0]['found']}/{data[0]['total_criteria']}")
return data[0]
else:
print(f" ❌ Ошибка {response.status_code}: {response.text[:200]}")
return None
except requests.Timeout:
print(f" ⏱️ Таймаут (>400 сек)")
return None
except Exception as e:
print(f" ❌ Ошибка: {e}")
return None
def main():
print("🚀 ЗАПУСК АУДИТА САНКТ-ПЕТЕРБУРГА\n" + "="*60)
# Получаем отели
hotels = get_orel_hotels()
print(f"📊 Найдено отелей Санкт-Петербурга: {len(hotels)}")
# Разделяем на отели с chunks и без
hotels_with_chunks = [h for h in hotels if h['chunks_count'] > 0]
hotels_without_chunks = [h for h in hotels if h['chunks_count'] == 0]
print(f"С chunks: {len(hotels_with_chunks)}")
print(f" ⚠️ Без chunks: {len(hotels_without_chunks)}")
if hotels_without_chunks:
print(f"\n⚠️ Отели БЕЗ chunks (будут пропущены):")
for hotel in hotels_without_chunks[:10]:
print(f" - {hotel['hotel_name']}")
if len(hotels_without_chunks) > 10:
print(f" ... и еще {len(hotels_without_chunks) - 10}")
if not hotels_with_chunks:
print("\n❌ Нет отелей с chunks для аудита")
return
print(f"\n🎯 Будет проаудировано: {len(hotels_with_chunks)} отелей\n")
print("🚀 Запускаю аудит...\n")
# Аудитируем только отели с chunks
results = []
success_count = 0
error_count = 0
for idx, hotel in enumerate(hotels_with_chunks, 1):
print(f"\n[{idx}/{len(hotels_with_chunks)}] {hotel['hotel_name']}")
print(f" 🔗 {hotel['website_address'] or 'НЕТ САЙТА'}")
print(f" 📦 Chunks: {hotel['chunks_count']}")
audit_result = audit_hotel(hotel['hotel_id'], hotel['hotel_name'])
if audit_result:
audit_result['website'] = hotel['website_address'] or 'НЕТ САЙТА'
# Добавляем данные РКН
audit_result['rkn_status'] = hotel.get('rkn_registry_status')
audit_result['rkn_number'] = hotel.get('rkn_registry_number')
audit_result['rkn_date'] = hotel.get('rkn_registry_date')
audit_result['rkn_checked_at'] = hotel.get('rkn_checked_at')
# Сохраняем в БД
save_audit_to_db(
hotel['hotel_id'],
hotel['hotel_name'],
hotel['region_name'],
audit_result
)
results.append(audit_result)
success_count += 1
else:
error_count += 1
# Небольшая задержка между запросами
if idx < len(hotels_with_chunks):
time.sleep(2)
# Статистика
print("\n" + "="*60)
print(f"📊 ИТОГО:")
print(f" ✅ Успешно: {success_count}")
print(f" ❌ Ошибок: {error_count}")
print(f" 📝 Всего отелей обработано: {len(results)}")
print(f" 💾 Результаты сохранены в таблицу hotel_audit_results")
print(f"\n🎉 Аудит завершен!")
if __name__ == "__main__":
main()