🚀 Full project sync: Hotels RAG & Audit System
✨ Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
This commit is contained in:
@@ -31,7 +31,7 @@ DB_CONFIG = {
|
||||
MAX_PAGES_PER_SITE = 15
|
||||
PAGE_TIMEOUT = 30000
|
||||
BATCH_SIZE = 50
|
||||
MAX_CONCURRENT = 10 # Увеличено с 3 до 10 для ускорения
|
||||
MAX_CONCURRENT = 3 # Уменьшено с 10 до 3 чтобы не грузить базу и браузер
|
||||
MAX_RETRIES = 2 # Максимум попыток для одного сайта
|
||||
|
||||
# Логирование
|
||||
@@ -102,7 +102,7 @@ def get_hotels_by_priority() -> List[Dict]:
|
||||
INNER JOIN stats s ON m.region_name = s.region_name
|
||||
WHERE m.website_address IS NOT NULL
|
||||
AND m.website_address != ''
|
||||
AND m.id NOT IN (SELECT hotel_id FROM hotel_website_meta)
|
||||
AND m.id NOT IN (SELECT hotel_id FROM hotel_website_meta WHERE crawl_status = 'completed')
|
||||
ORDER BY s.percent DESC, m.region_name, m.full_name
|
||||
""")
|
||||
|
||||
@@ -117,7 +117,7 @@ def get_hotels_by_priority() -> List[Dict]:
|
||||
FROM hotel_main m
|
||||
WHERE m.website_address IS NOT NULL
|
||||
AND m.website_address != ''
|
||||
AND m.id NOT IN (SELECT hotel_id FROM hotel_website_meta)
|
||||
AND m.id NOT IN (SELECT hotel_id FROM hotel_website_meta WHERE crawl_status = 'completed')
|
||||
AND m.region_name IN (
|
||||
'Краснодарский край',
|
||||
'г. Москва',
|
||||
@@ -146,7 +146,7 @@ def get_hotels_by_priority() -> List[Dict]:
|
||||
FROM hotel_main m
|
||||
WHERE m.website_address IS NOT NULL
|
||||
AND m.website_address != ''
|
||||
AND m.id NOT IN (SELECT hotel_id FROM hotel_website_meta)
|
||||
AND m.id NOT IN (SELECT hotel_id FROM hotel_website_meta WHERE crawl_status = 'completed')
|
||||
AND m.region_name NOT IN (
|
||||
SELECT DISTINCT region_name
|
||||
FROM (
|
||||
@@ -431,6 +431,7 @@ async def main():
|
||||
|
||||
processed = 0
|
||||
success = 0
|
||||
browser_restarts = 0
|
||||
|
||||
# Обрабатываем пачками
|
||||
for i in range(0, total, BATCH_SIZE):
|
||||
@@ -439,6 +440,14 @@ async def main():
|
||||
logger.info(f"\n📦 Пачка {i//BATCH_SIZE + 1}/{(total + BATCH_SIZE - 1)//BATCH_SIZE}")
|
||||
logger.info(f" Отели {i+1}-{min(i+BATCH_SIZE, total)} из {total}")
|
||||
|
||||
# Перезапускаем браузер каждые 1000 отелей (20 пачек) чтобы избежать утечек памяти
|
||||
if processed > 0 and processed % 1000 == 0:
|
||||
logger.info(f"🔄 Перезапуск браузера после {processed} отелей...")
|
||||
await browser.close()
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
browser_restarts += 1
|
||||
logger.info(f"✅ Браузер перезапущен (рестарт #{browser_restarts})")
|
||||
|
||||
tasks = [crawl_hotel(hotel, semaphore, browser) for hotel in batch]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user