131 lines
5.0 KiB
Python
131 lines
5.0 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Массовая векторизация всех регионов без chunks
|
|||
|
|
Обрабатывает по приоритету: сначала маленькие, потом крупные
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import sys
|
|||
|
|
sys.path.insert(0, '/root/engine/public_oversight/hotels')
|
|||
|
|
|
|||
|
|
from process_all_hotels_embeddings import EmbeddingProcessor
|
|||
|
|
import logging
|
|||
|
|
|
|||
|
|
# Logging setup: records at INFO and above go to a log file in the current
# working directory and to the default stream of StreamHandler (stderr).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Log messages in this script contain Cyrillic text and emoji, so the
        # file encoding is pinned to UTF-8 instead of depending on the locale.
        logging.FileHandler('all_regions_embeddings.log', encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
def main() -> None:
    """Vectorize every hotel that has processed website text but no chunks.

    The hotels are fetched with a single query, ordered so that regions with
    fewer processed rows come first (then by region name and hotel name), and
    each one is passed to ``processor.process_hotel``. A truthy return value
    counts as a success, anything else as a failure. Progress is logged per
    hotel, per region, and as a running total every 10 hotels; a summary is
    logged at the end.

    Any exception raised during the run is logged with its traceback and then
    swallowed, so failures inside the run do not propagate to the caller.
    The processor is closed in all cases.
    """
    # Function-scope imports keep this block self-contained.
    from collections import Counter
    from itertools import groupby

    logger.info("🚀 Массовая векторизация всех регионов")

    processor = EmbeddingProcessor()

    try:
        # Select hotels with usable cleaned text (> 50 characters) that have
        # no row in hotel_website_chunks (anti-join: LEFT JOIN ... IS NULL).
        # The correlated subquery in ORDER BY ranks regions by their number
        # of processed rows, smallest first.
        # NOTE(review): if hotel_website_processed can hold several rows per
        # hotel, the same hotel_id is returned more than once - confirm.
        processor.cur.execute("""
            SELECT
                p.hotel_id,
                m.full_name,
                m.region_name
            FROM hotel_website_processed p
            INNER JOIN hotel_main m ON p.hotel_id = m.id
            LEFT JOIN hotel_website_chunks c ON p.hotel_id::text = c.metadata->>'hotel_id'
            WHERE p.cleaned_text IS NOT NULL
              AND LENGTH(p.cleaned_text) > 50
              AND c.id IS NULL
            ORDER BY
                -- Сначала маленькие регионы
                (SELECT COUNT(*) FROM hotel_website_processed p2
                 INNER JOIN hotel_main m2 ON p2.hotel_id = m2.id
                 WHERE m2.region_name = m.region_name) ASC,
                m.region_name,
                m.full_name
        """)

        hotels = processor.cur.fetchall()
        total = len(hotels)

        logger.info(f"📊 Найдено отелей без chunks: {total}")

        if not hotels:
            logger.info("✅ Все отели уже обработаны!")
            return

        # Per-region statistics, reported from the smallest region upwards.
        regions_count = Counter(region for _, _, region in hotels)

        logger.info(f"\n📍 Регионов к обработке: {len(regions_count)}")
        for region, count in sorted(regions_count.items(), key=lambda item: item[1]):
            logger.info(f" • {region}: {count} отелей")

        successful = 0
        failed = 0
        position = 0  # 1-based index of the hotel currently being processed
        separator = "=" * 80

        # groupby() starts a new group whenever the region of consecutive rows
        # changes, so every region run (including the last one, and a NULL
        # region) gets both its header and its completion message.
        for region, region_hotels in groupby(hotels, key=lambda row: row[2]):
            logger.info(f"\n{separator}")
            logger.info(f"📍 Начинаю регион: {region}")
            logger.info(f"{separator}")

            region_count = 0
            for hotel_id, hotel_name, _ in region_hotels:
                position += 1
                region_count += 1

                # The query does not exclude NULL full_name; slicing None
                # would raise and abort the whole run.
                display_name = (hotel_name or "")[:50]
                logger.info(f"\n[{position}/{total}] 🏨 {display_name}")
                logger.info(f" Регион: {region}")

                if processor.process_hotel(hotel_id):
                    successful += 1
                else:
                    failed += 1

                # Running totals every 10 hotels.
                if position % 10 == 0:
                    logger.info(f"\n📈 ОБЩИЙ ПРОГРЕСС: {position}/{total} отелей")
                    logger.info(f" ✅ Успешно: {successful}")
                    logger.info(f" ❌ Ошибок: {failed}")
                    logger.info(f" 📊 Success rate: {successful*100/position:.1f}%")

            logger.info(f"\n✅ Регион '{region}' завершён: {region_count} отелей")

        logger.info("\n🎉 ВСЯ ОБРАБОТКА ЗАВЕРШЕНА!")
        logger.info(f" ✅ Успешно: {successful}")
        logger.info(f" ❌ Ошибок: {failed}")
        logger.info(f" 📊 Итого обработано: {successful + failed}")

    except Exception as e:
        # Top-level boundary of the script: keep the original message but
        # record the traceback as well so the failure can be diagnosed.
        logger.exception("❌ Критическая ошибка: %s", e)
    finally:
        processor.close()
|
|||
|
|
|
|||
|
|
# Script entry point: start the batch run only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|||
|
|
|