🚀 Full project sync: Hotels RAG & Audit System

Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
This commit is contained in:
Фёдор
2025-10-27 22:49:42 +03:00
parent 0cf3297290
commit 684fada337
94 changed files with 14891 additions and 911 deletions

112
test_parser_api.py Executable file
View File

@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""
Тестовый клиент для Universal Parser API
"""
import json
import os

import requests
# Configuration.
# Both values can be overridden through the environment so the client can be
# pointed at another deployment without editing this file. The defaults are the
# values this script originally hard-coded, so behaviour is unchanged when the
# variables are unset.
# A trailing slash is stripped because callers build URLs as f"{API_URL}/path".
API_URL = os.environ.get("PARSER_API_URL", "http://localhost:8003").rstrip("/")
# NOTE(review): the default key is a credential committed to source control;
# prefer supplying PARSER_API_KEY from the environment and rotating this value.
API_KEY = os.environ.get("PARSER_API_KEY", "parser_2025_secret_key_a8f3d9c1b4e7")
def _print_parse_result(data: dict, extract_links: bool) -> None:
    """Print the summary, text preview and optional link list of a /parse reply.

    Fields are read with ``dict.get`` so that a reply missing some of them
    (for example a failure payload) still reaches the ``error`` line instead
    of aborting on the first absent key.
    """
    print("✅ Успех!")
    print()
    print("📊 РЕЗУЛЬТАТЫ:")
    print(f" Status Code: {data.get('status_code')}")
    print(f" Title: {data.get('title')}")
    text_length = data.get('text_length')
    # Thousands separators only make sense for a real integer.
    if isinstance(text_length, int):
        length_label = f"{text_length:,}"
    else:
        length_label = str(text_length)
    print(f" Текст: {length_label} символов")
    print(f" Время: {data.get('parsing_time')}с")
    print()

    if not data.get('success'):
        print(f"❌ Ошибка: {data.get('error')}")
        return

    print("📄 ПРЕВЬЮ КОНТЕНТА:")
    print("-" * 80)
    # Only the first 1000 characters are shown as a preview.
    print((data.get('text') or '')[:1000])
    print("-" * 80)

    links = data.get('links')
    if extract_links and links:
        print()
        print(f"🔗 Найдено ссылок: {len(links)}")
        # List at most ten links, then summarise how many were left out.
        for i, link in enumerate(links[:10], 1):
            print(f" {i}. {link}")
        if len(links) > 10:
            print(f" ... и ещё {len(links) - 10}")


def test_parse(url: str, extract_links: bool = False):
    """Post one URL to the API's ``/parse`` endpoint and print the outcome.

    Args:
        url: Address of the page to send in the request payload.
        extract_links: Sent to the API as ``extract_links``; when true and the
            reply contains links, up to ten of them are printed.

    Returns:
        None. All results and errors are reported on stdout; exceptions raised
        while requesting or printing are caught and printed, not propagated.
    """
    # A visible rule: an empty string repeated 80 times would print nothing.
    separator = "=" * 80
    print(separator)
    print(f"🔍 ТЕСТИРУЕМ ПАРСИНГ: {url}")
    print(separator)
    print()

    headers = {
        "X-API-Key": API_KEY,
        "Content-Type": "application/json",
    }
    payload = {
        "url": url,
        "wait_seconds": 5,
        "extract_links": extract_links,
        "screenshot": False,
        "javascript_enabled": True,
    }

    try:
        print("📤 Отправляем запрос...")
        response = requests.post(
            f"{API_URL}/parse",
            headers=headers,
            json=payload,
            timeout=60,
        )
        if response.status_code == 200:
            _print_parse_result(response.json(), extract_links)
        else:
            print(f"❌ HTTP {response.status_code}")
            print(response.text)
    except Exception as e:
        # Deliberately broad: this is a manual smoke test, so any failure is
        # reported and the remaining checks in the script still run.
        print(f"❌ Ошибка: {e}")

    print()
    print(separator)
def test_health():
    """Call the API's ``/health`` endpoint and print whether it responded.

    Returns:
        None. The outcome is printed to stdout. Connection failures and
        timeouts are reported as an unavailable API instead of being raised,
        so the caller can go on to its next check.
    """
    print("🏥 Проверка здоровья API...")
    try:
        # Bounded wait: without a timeout an unresponsive server blocks forever.
        response = requests.get(f"{API_URL}/health", timeout=10)
    except requests.RequestException as e:
        print(f"❌ API недоступен: {e}")
        return

    if response.status_code != 200:
        print("❌ API недоступен")
        return

    data = response.json()
    # .get keeps the report going if the payload lacks one of the fields.
    print(f"✅ API работает: {data.get('status')}")
    print(f" Версия: {data.get('version')}")
if __name__ == "__main__":
    # Check 1: health endpoint.
    test_health()

    # Checks 2 and 3, as (page URL, whether to request link extraction):
    #   - a court website (protected site), no links;
    #   - an ordinary site, with links.
    cases = [
        (
            "https://mos-sud.ru/312/cases/civil/details/7b8a110a-162d-4493-88b0-e505523c9935?uid=77MS0312-01-2025-002929-35&formType=fullForm",
            False,
        ),
        ("https://example.com", True),
    ]
    for page_url, want_links in cases:
        # A blank line separates each check from the previous one's output.
        print()
        test_parse(page_url, extract_links=want_links)