✨ Major Features: - Complete RAG system for hotel website analysis - Hybrid audit with BGE-M3 embeddings + Natasha NER - Universal horizontal Excel reports with dashboards - Multi-region processing (SPb, Orel, Chukotka, Kamchatka) 📊 Completed Regions: - Орловская область: 100% (36/36) - Чукотский АО: 100% (4/4) - г. Санкт-Петербург: 93% (893/960) - Камчатский край: 87% (89/102) 🔧 Infrastructure: - PostgreSQL with pgvector extension - BGE-M3 embeddings API - Browserless for web scraping - N8N workflows for automation - S3/Nextcloud file storage 📝 Documentation: - Complete DB schemas - API documentation - Setup guides - Status reports
58 lines
1.7 KiB
Python
58 lines
1.7 KiB
Python
import psycopg2
|
||
from urllib.parse import unquote
|
||
|
||
# Diagnostic script: lists St. Petersburg hotels that have a website but no
# stored embeddings, then counts how many hotels without embeddings already
# have rows in hotel_website_processed.

# NOTE(review): host, user and password are hard-coded in this file. They
# should be moved to environment variables or a secrets store (and the
# password rotated) -- left unchanged here so the script keeps working as is.
conn = psycopg2.connect(
    host='147.45.189.234',
    port=5432,
    database='default_db',
    user='gen_user',
    # The literal is percent-encoded; unquote() decodes it before use.
    password=unquote('2~~9_%5EkVsU%3F2%5CS'),
)

# Region whose hotels are analysed; passed to the queries as a bound parameter.
REGION = 'г. Санкт-Петербург'

# Number of hotels reported as still lacking embeddings when this script was
# written. It is a fixed snapshot value used in the printed summary, not a
# number derived from the database.
REMAINING_HOTELS = 67

try:
    cur = conn.cursor()

    print(f"\n🔍 АНАЛИЗ ОСТАВШИХСЯ {REMAINING_HOTELS} ОТЕЛЕЙ:\n")

    # Hotels that have a website but no embeddings.
    # NOT EXISTS is used instead of NOT IN: with NOT IN, a single chunk row
    # whose metadata->>'hotel_id' is NULL makes the predicate NULL for every
    # hotel, so the query would silently return nothing.
    cur.execute(
        """
        SELECT h.id, h.full_name, h.website_address
        FROM hotel_main h
        WHERE h.region_name = %s
          AND h.website_address IS NOT NULL
          AND h.website_address != ''
          AND NOT EXISTS (
              SELECT 1
              FROM hotel_website_chunks c
              WHERE c.embedding IS NOT NULL
                AND (c.metadata->>'hotel_id')::uuid = h.id
          )
        LIMIT 10
        """,
        (REGION,),
    )

    print("📋 Примеры отелей без эмбедингов:")
    for _hotel_id, full_name, website in cur.fetchall():
        # full_name may be NULL in the database; fall back to an empty string
        # rather than failing on None[:50].
        print(f" - {(full_name or '')[:50]}: {website}")

    # Do these hotels have data in hotel_website_processed?
    # Same NULL-safe anti-join as above; this query does not filter on
    # website_address, it only requires a matching processed row.
    cur.execute(
        """
        SELECT COUNT(DISTINCT p.hotel_id)
        FROM hotel_website_processed p
        JOIN hotel_main h ON p.hotel_id = h.id
        WHERE h.region_name = %s
          AND NOT EXISTS (
              SELECT 1
              FROM hotel_website_chunks c
              WHERE c.embedding IS NOT NULL
                AND (c.metadata->>'hotel_id')::uuid = h.id
          )
        """,
        (REGION,),
    )
    in_processed = cur.fetchone()[0]

    print(f"\n📊 Из {REMAINING_HOTELS} отелей:")
    print(f" ✅ Есть в hotel_website_processed: {in_processed}")
    print(f" ❌ Нет в hotel_website_processed: {REMAINING_HOTELS - in_processed}")

    if in_processed > 0:
        print("\n✅ Скрипт должен их обработать!")
    else:
        print("\n❌ У этих отелей не спарсились сайты - эмбединги невозможны")
finally:
    # Always release the connection, including when a query above raises.
    conn.close()
|