🚀 Full project sync: Hotels RAG & Audit System

 Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
This commit is contained in:
Фёдор
2025-10-27 22:49:42 +03:00
parent 0cf3297290
commit 684fada337
94 changed files with 14891 additions and 911 deletions

217
retry_spb_failed.py Executable file
View File

@@ -0,0 +1,217 @@
#!/usr/bin/env python3
"""
Re-crawl St. Petersburg hotels whose previous crawl failed, using more lenient settings.

The script selects every hotel in the 'г. Санкт-Петербург' region whose
``hotel_website_meta.crawl_status`` is ``'failed'``, loads its website in a
headless Chromium (Playwright) with a 60-second navigation timeout and TLS
certificate errors ignored, and falls back from HTTPS to plain HTTP when the
first attempt fails. The fetched HTML is written to ``hotel_website_raw`` and
the outcome to ``hotel_website_meta``.
"""
import logging
import os
import re
import sys
from datetime import datetime
from urllib.parse import unquote

import psycopg2
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
from psycopg2.extras import RealDictCursor
# Log to a per-run, timestamped file and mirror every record to stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # The messages contain Cyrillic text and emoji, so the file encoding is
        # pinned to UTF-8 instead of relying on the platform's locale default.
        logging.FileHandler(
            f'retry_spb_failed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            encoding='utf-8',
        ),
        logging.StreamHandler(sys.stdout),
    ],
)
# Connection settings for the hotels PostgreSQL database, passed as keyword
# arguments to psycopg2.connect().
# Every value can be overridden through a HOTELS_DB_* environment variable so
# that credentials do not have to live in the source tree; the literals are
# kept only as backward-compatible fallbacks.
# NOTE(review): the fallback password is committed to version control - rotate
# it and drop the fallback once HOTELS_DB_PASSWORD is provisioned everywhere.
DB_CONFIG = {
    'host': os.environ.get('HOTELS_DB_HOST', '147.45.189.234'),
    'port': int(os.environ.get('HOTELS_DB_PORT', '5432')),
    'database': os.environ.get('HOTELS_DB_NAME', 'default_db'),
    'user': os.environ.get('HOTELS_DB_USER', 'gen_user'),
    # The fallback literal is stored percent-encoded, hence the unquote().
    'password': os.environ.get('HOTELS_DB_PASSWORD') or unquote('2~~9_%5EkVsU%3F2%5CS'),
}
def normalize_url(url):
    """Return *url* as an absolute URL with a scheme, or ``None`` if it is empty.

    Surrounding whitespace is stripped. A value that already starts with
    ``http://`` or ``https://`` (in any letter case) is returned unchanged.
    A protocol-relative value (``//host/path``) gets ``https:`` prepended, and
    anything else gets ``https://`` prepended, so HTTPS is always tried first.

    Args:
        url: Raw website address, possibly ``None``, empty or scheme-less.

    Returns:
        The normalized URL string, or ``None`` when there is nothing to crawl.
    """
    if not url:
        return None
    url = url.strip()
    if not url:
        # A whitespace-only value would otherwise turn into the bogus "https://".
        return None
    if url.lower().startswith(('http://', 'https://')):
        return url
    if url.startswith('//'):
        # Protocol-relative address: only the scheme name is missing.
        return f'https:{url}'
    # No scheme at all: try HTTPS first.
    return f'https://{url}'
def try_http_fallback(url):
    """Return the plain-HTTP variant of an HTTPS *url*, or ``None``.

    Only the leading scheme is rewritten; any ``https://`` that appears later
    in the URL (for example inside a query parameter) is left untouched. The
    scheme is matched case-insensitively.

    Args:
        url: Absolute URL string.

    Returns:
        The same URL with an ``http://`` scheme, or ``None`` when *url* does
        not use HTTPS and there is therefore nothing to fall back to.
    """
    https_prefix = 'https://'
    if url[:len(https_prefix)].lower() == https_prefix:
        return 'http://' + url[len(https_prefix):]
    return None
_MIN_HTML_LENGTH = 100    # a page at or below this length is treated as empty
_PAGE_TIMEOUT_MS = 60000  # 60 seconds per navigation attempt


def _fetch_html(page, url):
    """Navigate *page* to *url* and return its HTML, raising if it is too short."""
    page.goto(url, wait_until='domcontentloaded', timeout=_PAGE_TIMEOUT_MS)
    html = page.content()
    if not html or len(html) <= _MIN_HTML_LENGTH:
        # Raise instead of silently giving up, so that an empty page goes
        # through the same fallback and failure reporting as a network error.
        raise ValueError(f"Пустая или слишком короткая страница ({len(html or '')} символов)")
    return html


def _fetch_with_http_fallback(page, url):
    """Fetch *url*, retrying once over plain HTTP; return ``(fetched_url, html)``."""
    try:
        logging.info(f" 🌐 Пробуем: {url}")
        return url, _fetch_html(page, url)
    except Exception as primary_error:
        http_url = try_http_fallback(url)
        if not http_url:
            raise
        try:
            logging.info(f" 🔄 Пробуем HTTP: {http_url}")
            return http_url, _fetch_html(page, http_url)
        except Exception as http_error:
            logging.error(f" ❌ HTTP тоже не сработал: {str(http_error)[:100]}")
            # Report the original (HTTPS) error, not the fallback's.
            raise primary_error


def _save_crawl_success(conn, cur, hotel_id, url, html, started_at):
    """Replace the hotel's stored HTML and mark its crawl as completed."""
    finished_at = datetime.now()
    cur.execute("DELETE FROM hotel_website_raw WHERE hotel_id = %s", (hotel_id,))
    cur.execute("""
        INSERT INTO hotel_website_raw (hotel_id, url, html, crawled_at)
        VALUES (%s, %s, %s, %s)
    """, (hotel_id, url, html, finished_at))
    # NOTE(review): total_size_bytes receives len(html), i.e. a character
    # count rather than an encoded byte count - kept as in the original.
    cur.execute("""
        INSERT INTO hotel_website_meta (hotel_id, crawl_status, pages_crawled, total_size_bytes, crawl_started_at, crawl_finished_at)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (hotel_id) DO UPDATE SET
            crawl_status = EXCLUDED.crawl_status,
            pages_crawled = EXCLUDED.pages_crawled,
            total_size_bytes = EXCLUDED.total_size_bytes,
            crawl_started_at = EXCLUDED.crawl_started_at,
            crawl_finished_at = EXCLUDED.crawl_finished_at,
            error_message = NULL
    """, (hotel_id, 'completed', 1, len(html), started_at, finished_at))
    conn.commit()


def _save_crawl_failure(conn, cur, hotel_id, error_msg, started_at):
    """Mark the hotel's crawl as failed and store the error message."""
    # A failed statement leaves the transaction aborted; roll back first so
    # the status update below is not rejected as well.
    conn.rollback()
    cur.execute("""
        INSERT INTO hotel_website_meta (hotel_id, crawl_status, error_message, crawl_started_at, crawl_finished_at)
        VALUES (%s, %s, %s, %s, %s)
        ON CONFLICT (hotel_id) DO UPDATE SET
            crawl_status = EXCLUDED.crawl_status,
            error_message = EXCLUDED.error_message,
            crawl_started_at = EXCLUDED.crawl_started_at,
            crawl_finished_at = EXCLUDED.crawl_finished_at
    """, (hotel_id, 'failed', error_msg, started_at, datetime.now()))
    conn.commit()


def crawl_hotel(hotel_id, hotel_name, website_address):
    """Crawl the front page of one hotel website and persist the outcome.

    The page is loaded in a headless Chromium with TLS certificate errors
    ignored. HTTPS is tried first; on any failure (including a page that is
    too short) the same address is retried once over plain HTTP. On success
    the hotel's row in ``hotel_website_raw`` is replaced and
    ``hotel_website_meta`` is set to ``'completed'``; otherwise the meta row
    is set to ``'failed'`` with the (truncated) error message.

    Args:
        hotel_id: Primary key of the hotel; used as ``hotel_id`` in both tables.
        hotel_name: Unused; kept so the call signature stays unchanged.
        website_address: Raw website address, with or without a scheme.

    Returns:
        ``True`` if HTML was fetched and stored, ``False`` otherwise.

    Raises:
        psycopg2.Error: If the database cannot be reached or the failure
            status itself cannot be written.
    """
    url = normalize_url(website_address)
    if not url:
        logging.warning(" ⚠️ Нет URL")
        return False

    started_at = datetime.now()
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    cur = conn.cursor()
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    ignore_https_errors=True,  # tolerate broken/expired certificates
                    java_script_enabled=True,
                )
                page = context.new_page()
                fetched_url, html = _fetch_with_http_fallback(page, url)
            finally:
                browser.close()

        # Database writes happen outside the fetch try-block, so a database
        # error is not mistaken for a network error and retried over HTTP.
        _save_crawl_success(conn, cur, hotel_id, fetched_url, html, started_at)
        if fetched_url == url:
            logging.info(f" ✅ Успешно! {len(html):,} байт")
        else:
            logging.info(f" ✅ HTTP сработал! {len(html):,} байт")
        return True
    except Exception as e:
        error_msg = str(e)[:500]
        logging.error(f" ❌ Ошибка: {error_msg}")
        _save_crawl_failure(conn, cur, hotel_id, error_msg, started_at)
        return False
    finally:
        # Single cleanup point for both the success and the failure path.
        cur.close()
        conn.close()
def main():
    """Re-crawl every St. Petersburg hotel whose last crawl failed.

    Loads the hotels with ``crawl_status = 'failed'`` in the
    'г. Санкт-Петербург' region, runs ``crawl_hotel`` on each in name order,
    logs progress after every tenth hotel and prints a final summary.
    """
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    try:
        # The cursor and the connection are released even if the query raises.
        with conn.cursor() as cur:
            cur.execute("""
                SELECT h.id, h.full_name, h.website_address
                FROM hotel_main h
                JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
                WHERE h.region_name = 'г. Санкт-Петербург'
                  AND hwm.crawl_status = 'failed'
                ORDER BY h.full_name
            """)
            hotels = cur.fetchall()
    finally:
        conn.close()
    total = len(hotels)

    logging.info("=" * 60)
    logging.info("🔄 ПЕРЕКРАУЛИНГ FAILED ОТЕЛЕЙ ПИТЕРА")
    logging.info("=" * 60)
    logging.info(f"Всего отелей: {total}")
    logging.info("")

    success = 0
    failed = 0
    for i, hotel in enumerate(hotels, 1):
        logging.info(f"🏨 [{i}/{total}] {hotel['full_name']}")
        if crawl_hotel(hotel['id'], hotel['full_name'], hotel['website_address']):
            success += 1
        else:
            failed += 1
        if i % 10 == 0:
            logging.info(f" 📊 Прогресс: {success} успешно, {failed} ошибок")

    logging.info("")
    logging.info("=" * 60)
    logging.info("🎉 ПЕРЕКРАУЛИНГ ЗАВЕРШЁН")
    logging.info("=" * 60)
    logging.info(f"✅ Успешно: {success}")
    logging.info(f"❌ Ошибок: {failed}")
    # Guard against division by zero when there was nothing to re-crawl.
    logging.info(f"📊 Успех: {success*100//total if total else 0}%")


if __name__ == '__main__':
    main()