Files
hotels/check_remaining.py
Фёдор 684fada337 🚀 Full project sync: Hotels RAG & Audit System
 Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
2025-10-27 22:49:42 +03:00

58 lines
1.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import psycopg2
from urllib.parse import unquote
# Diagnostic script: reports which St. Petersburg hotels still have no
# embeddings and whether their websites were parsed into
# hotel_website_processed.

# NOTE(review): the database host and credentials are hard-coded in source
# control. They are kept so the script keeps running unchanged, but they
# should be moved to environment variables / a secret store and rotated.
conn = psycopg2.connect(
    host='147.45.189.234',
    port=5432,
    database='default_db',
    user='gen_user',
    # The password is stored percent-encoded; unquote() decodes it.
    password=unquote('2~~9_%5EkVsU%3F2%5CS'),
)

# Region whose hotels are inspected (matched against hotel_main.region_name).
REGION = 'г. Санкт-Петербург'

# Count used in the printed report. It is a fixed figure, not recomputed from
# the database. NOTE(review): presumably the number of hotels left without
# embeddings when the script was written -- confirm, or derive it from a query.
REMAINING_HOTELS = 67

# Predicate "hotel h has no chunk with an embedding".
# Written as NOT EXISTS rather than `h.id NOT IN (SELECT ...)`: with NOT IN,
# a single chunk whose metadata lacks 'hotel_id' makes the subquery yield
# NULL, and the predicate then matches no hotels at all.
NO_EMBEDDINGS_SQL = """
    NOT EXISTS (
        SELECT 1
        FROM hotel_website_chunks c
        WHERE c.embedding IS NOT NULL
          AND (c.metadata->>'hotel_id')::uuid = h.id
    )
"""

try:
    with conn.cursor() as cur:
        print(f"\n🔍 АНАЛИЗ ОСТАВШИХСЯ {REMAINING_HOTELS} ОТЕЛЕЙ:\n")

        # Hotels that have a website but no embeddings (sample of 10).
        cur.execute(
            """
            SELECT h.id, h.full_name, h.website_address
            FROM hotel_main h
            WHERE h.region_name = %s
              AND h.website_address IS NOT NULL
              AND h.website_address != ''
              AND """ + NO_EMBEDDINGS_SQL + """
            LIMIT 10
            """,
            (REGION,),
        )
        print("📋 Примеры отелей без эмбедингов:")
        for _hotel_id, full_name, website in cur.fetchall():
            # full_name may be NULL in the database; avoid slicing None.
            short_name = (full_name or "")[:50]
            print(f" - {short_name}: {website}")

        # How many hotels without embeddings have a row in
        # hotel_website_processed?
        cur.execute(
            """
            SELECT COUNT(DISTINCT p.hotel_id)
            FROM hotel_website_processed p
            JOIN hotel_main h ON p.hotel_id = h.id
            WHERE h.region_name = %s
              AND """ + NO_EMBEDDINGS_SQL,
            (REGION,),
        )
        in_processed = cur.fetchone()[0]

        print(f"\n📊 Из {REMAINING_HOTELS} отелей:")
        print(f" ✅ Есть в hotel_website_processed: {in_processed}")
        print(f" ❌ Нет в hotel_website_processed: {REMAINING_HOTELS - in_processed}")

        if in_processed > 0:
            print("\n✅ Скрипт должен их обработать!")
        else:
            print("\nУ этих отелей не спарсились сайты - эмбединги невозможны")
finally:
    # Always release the connection, even if a query or print fails.
    conn.close()