- Краулеры: smart_crawler.py, regional_crawler.py - Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py - РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py - Отчёты: create_orel_horizontal_report.py - Обработка: process_all_hotels_embeddings.py - Документация: README.md, DB_SCHEMA_REFERENCE.md
84 lines
3.1 KiB
Python
Executable File
84 lines
3.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
Скрипт для подсчёта отелей, готовых к аудиту
|
||
"""
|
||
|
||
import psycopg2
|
||
from psycopg2.extras import RealDictCursor
|
||
from urllib.parse import unquote
|
||
|
||
DB_CONFIG = {
|
||
'host': '147.45.189.234',
|
||
'port': 5432,
|
||
'database': 'default_db',
|
||
'user': 'gen_user',
|
||
'password': unquote('2~~9_%5EkVsU%3F2%5CS')
|
||
}
|
||
|
||
def main():
|
||
conn = psycopg2.connect(**DB_CONFIG)
|
||
cur = conn.cursor(cursor_factory=RealDictCursor)
|
||
|
||
print("📊 ГОТОВНОСТЬ ОТЕЛЕЙ К АУДИТУ\n" + "="*60)
|
||
|
||
# 1. Сколько отелей с chunks
|
||
cur.execute("""
|
||
SELECT COUNT(DISTINCT metadata->>'hotel_id') AS count
|
||
FROM hotel_website_chunks
|
||
WHERE metadata->>'hotel_id' IS NOT NULL
|
||
""")
|
||
hotels_with_chunks = cur.fetchone()['count']
|
||
|
||
# 2. Общее количество chunks
|
||
cur.execute("SELECT COUNT(*) AS count FROM hotel_website_chunks")
|
||
total_chunks = cur.fetchone()['count']
|
||
|
||
# 3. Средний размер chunks на отель
|
||
cur.execute("""
|
||
SELECT
|
||
ROUND(AVG(chunk_count), 0) AS avg_chunks
|
||
FROM (
|
||
SELECT COUNT(*) AS chunk_count
|
||
FROM hotel_website_chunks
|
||
WHERE metadata->>'hotel_id' IS NOT NULL
|
||
GROUP BY metadata->>'hotel_id'
|
||
) sub
|
||
""")
|
||
avg_chunks = cur.fetchone()['avg_chunks'] or 0
|
||
|
||
# 4. Топ-10 регионов по готовности
|
||
cur.execute("""
|
||
SELECT
|
||
h.region_name,
|
||
COUNT(DISTINCT hwc.metadata->>'hotel_id') AS hotels_ready,
|
||
COUNT(DISTINCT h.id) AS total_hotels,
|
||
ROUND(100.0 * COUNT(DISTINCT hwc.metadata->>'hotel_id') / COUNT(DISTINCT h.id), 1) AS percentage
|
||
FROM hotel_main h
|
||
LEFT JOIN hotel_website_chunks hwc ON hwc.metadata->>'hotel_id' = h.id::text
|
||
WHERE h.region_name IS NOT NULL
|
||
GROUP BY h.region_name
|
||
HAVING COUNT(DISTINCT hwc.metadata->>'hotel_id') > 0
|
||
ORDER BY hotels_ready DESC
|
||
LIMIT 10
|
||
""")
|
||
top_regions = cur.fetchall()
|
||
|
||
print(f"✅ Отелей с chunks (готовы к аудиту): {hotels_with_chunks:,}")
|
||
print(f"📦 Всего chunks в базе: {total_chunks:,}")
|
||
print(f"📊 Среднее chunks на отель: {avg_chunks}")
|
||
print(f"\n⏱️ ПРОГНОЗ ВРЕМЕНИ АУДИТА:")
|
||
print(f" • При 42 сек/отель: {hotels_with_chunks * 42 / 3600:.1f} часов ({hotels_with_chunks * 42 / 86400:.1f} дней)")
|
||
print(f" • При 20 сек/отель: {hotels_with_chunks * 20 / 3600:.1f} часов ({hotels_with_chunks * 20 / 86400:.1f} дней)")
|
||
print(f" • При 10 сек/отель: {hotels_with_chunks * 10 / 3600:.1f} часов ({hotels_with_chunks * 10 / 86400:.1f} дней)")
|
||
|
||
print(f"\n🏆 ТОП-10 РЕГИОНОВ ПО ГОТОВНОСТИ:\n{'-'*60}")
|
||
for region in top_regions:
|
||
print(f"{region['region']:<30} {region['hotels_ready']:>5} отелей ({region['percentage']:>5}%)")
|
||
|
||
cur.close()
|
||
conn.close()
|
||
|
||
if __name__ == '__main__':
|
||
main()
|
||
|