🚀 Full project sync: Hotels RAG & Audit System
✨ Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
This commit is contained in:
57
check_remaining.py
Normal file
57
check_remaining.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Diagnose hotels of one region that still have no website embeddings.

The script prints a sample of hotels that have a website address but no
embedded chunk in ``hotel_website_chunks``, then reports how many hotels
without embeddings have at least one row in ``hotel_website_processed``.
It only reads from the database.
"""
import psycopg2
from urllib.parse import unquote

# Value of hotel_main.region_name that the report is restricted to.
REGION_NAME = 'г. Санкт-Петербург'

# Number of hotels the report assumes are still missing embeddings. It is a
# fixed figure baked into the report text, not something the script measures.
# NOTE(review): the count query below has no website filter, unlike the
# sample query, so REMAINING_HOTELS - in_processed may not match reality
# (and can even go negative) - confirm the intended population.
REMAINING_HOTELS = 67

# SQL condition selecting hotels (alias ``h``) with no embedded chunk.
# NULL hotel ids are excluded from the sub-select on purpose: a single NULL
# on the right-hand side makes ``NOT IN`` evaluate to NULL for every hotel,
# which would silently turn both queries into "no rows".
WITHOUT_EMBEDDINGS_SQL = """
    h.id NOT IN (
        SELECT (c.metadata->>'hotel_id')::uuid
        FROM hotel_website_chunks c
        WHERE c.embedding IS NOT NULL
          AND c.metadata->>'hotel_id' IS NOT NULL
    )
"""

# NOTE(review): host and credentials are committed in plain text. Consider
# moving them to environment variables / a secret store and rotating the
# password. The password literal is percent-encoded, hence unquote().
conn = psycopg2.connect(
    host='147.45.189.234',
    port=5432,
    database='default_db',
    user='gen_user',
    password=unquote('2~~9_%5EkVsU%3F2%5CS'),
)
try:
    cur = conn.cursor()

    print(f"\n🔍 АНАЛИЗ ОСТАВШИХСЯ {REMAINING_HOTELS} ОТЕЛЕЙ:\n")

    # Hotels that have a website but no embeddings (sample of 10).
    cur.execute(
        f"""
        SELECT h.id, h.full_name, h.website_address
        FROM hotel_main h
        WHERE h.region_name = %s
          AND h.website_address IS NOT NULL
          AND h.website_address != ''
          AND {WITHOUT_EMBEDDINGS_SQL}
        LIMIT 10
        """,
        (REGION_NAME,),
    )

    print("📋 Примеры отелей без эмбедингов:")
    for _hotel_id, full_name, website in cur.fetchall():
        # full_name may be NULL in the database; slicing None would raise.
        short_name = (full_name or "")[:50]
        print(f" - {short_name}: {website}")

    # Do hotels without embeddings have data in hotel_website_processed?
    cur.execute(
        f"""
        SELECT COUNT(DISTINCT p.hotel_id)
        FROM hotel_website_processed p
        JOIN hotel_main h ON p.hotel_id = h.id
        WHERE h.region_name = %s
          AND {WITHOUT_EMBEDDINGS_SQL}
        """,
        (REGION_NAME,),
    )
    # COUNT(...) always yields exactly one row, so fetchone() is never None.
    in_processed = cur.fetchone()[0]

    print(f"\n📊 Из {REMAINING_HOTELS} отелей:")
    print(f" ✅ Есть в hotel_website_processed: {in_processed}")
    print(f" ❌ Нет в hotel_website_processed: {REMAINING_HOTELS - in_processed}")

    if in_processed > 0:
        print("\n✅ Скрипт должен их обработать!")
    else:
        print("\n❌ У этих отелей не спарсились сайты - эмбединги невозможны")
finally:
    # Release the connection even when a query or print fails.
    conn.close()
|
||||
Reference in New Issue
Block a user