✨ Major Features: - Complete RAG system for hotel website analysis - Hybrid audit with BGE-M3 embeddings + Natasha NER - Universal horizontal Excel reports with dashboards - Multi-region processing (SPb, Orel, Chukotka, Kamchatka) 📊 Completed Regions: - Орловская область: 100% (36/36) - Чукотский АО: 100% (4/4) - г. Санкт-Петербург: 93% (893/960) - Камчатский край: 87% (89/102) 🔧 Infrastructure: - PostgreSQL with pgvector extension - BGE-M3 embeddings API - Browserless for web scraping - N8N workflows for automation - S3/Nextcloud file storage 📝 Documentation: - Complete DB schemas - API documentation - Setup guides - Status reports
113 lines
3.2 KiB
Python
Executable File
113 lines
3.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
Тестовый клиент для Universal Parser API
|
||
"""
|
||
|
||
import json
import os

import requests
|
||
|
||
# Configuration.
# Both values may be overridden through environment variables so that the API
# key does not have to be edited into (or committed with) this file. The
# defaults are the original hard-coded values, so running the script without
# any environment set up behaves exactly as before.
API_URL = os.environ.get("PARSER_API_URL", "http://localhost:8003")
API_KEY = os.environ.get("PARSER_API_KEY", "parser_2025_secret_key_a8f3d9c1b4e7")
|
||
|
||
def _print_parse_report(response, extract_links):
    """Print a human-readable summary of a ``/parse`` HTTP response.

    Non-200 responses are dumped as-is. For a 200 response the JSON body is
    summarised, followed by a 1000-character text preview and, when
    ``extract_links`` is truthy and links are present, the first ten links.
    """
    if response.status_code != 200:
        print(f"❌ HTTP {response.status_code}")
        print(response.text)
        return

    result = response.json()

    print("✅ Успех!")
    print()
    print("📊 РЕЗУЛЬТАТЫ:")
    print(f" Status Code: {result['status_code']}")
    print(f" Title: {result['title']}")
    print(f" Текст: {result['text_length']:,} символов")
    print(f" Время: {result['parsing_time']}с")
    print()

    if not result['success']:
        print(f"❌ Ошибка: {result.get('error')}")
        return

    divider = "-" * 80
    print("📄 ПРЕВЬЮ КОНТЕНТА:")
    print(divider)
    print(result['text'][:1000])
    print(divider)

    # Links are only looked up when the caller asked for them.
    found = result.get('links') if extract_links else None
    if not found:
        return

    total = len(found)
    print()
    print(f"🔗 Найдено ссылок: {total}")
    for position, href in enumerate(found[:10], start=1):
        print(f" {position}. {href}")
    if total > 10:
        print(f" ... и ещё {total - 10}")


def test_parse(url: str, extract_links: bool = False):
    """Send one page to the parser API and print the outcome.

    Posts ``url`` to ``{API_URL}/parse`` with the API key header and a fixed
    set of options (5 second wait, JavaScript enabled, no screenshot), then
    prints a report of the response. Any exception raised while sending the
    request or reporting the response is printed rather than propagated.

    Args:
        url: Address of the page to parse.
        extract_links: Forwarded to the API as ``extract_links``; when truthy
            the report also lists the links returned by the API.
    """
    banner = "═" * 80
    print(banner)
    print(f"🔍 ТЕСТИРУЕМ ПАРСИНГ: {url}")
    print(banner)
    print()

    request_headers = {
        "X-API-Key": API_KEY,
        "Content-Type": "application/json",
    }
    request_body = {
        "url": url,
        "wait_seconds": 5,
        "extract_links": extract_links,
        "screenshot": False,
        "javascript_enabled": True,
    }

    try:
        print("📤 Отправляем запрос...")
        reply = requests.post(
            f"{API_URL}/parse",
            headers=request_headers,
            json=request_body,
            timeout=60,
        )
        _print_parse_report(reply, extract_links)
    except Exception as exc:
        print(f"❌ Ошибка: {exc}")

    print()
    print(banner)
|
||
|
||
|
||
def test_health():
    """Check the API health endpoint and print the result.

    Sends ``GET {API_URL}/health``. On HTTP 200 the ``status`` and
    ``version`` fields of the JSON body are printed. Any other status code,
    or a transport-level failure (connection refused, timeout, ...), is
    reported as the API being unavailable instead of raising.
    """
    print("🏥 Проверка здоровья API...")
    try:
        # Without an explicit timeout requests may wait indefinitely on an
        # unresponsive host; a health probe should fail fast.
        response = requests.get(f"{API_URL}/health", timeout=10)
    except requests.RequestException as exc:
        print(f"❌ API недоступен: {exc}")
        return

    if response.status_code == 200:
        data = response.json()
        print(f"✅ API работает: {data['status']}")
        print(f" Версия: {data['version']}")
    else:
        print("❌ API недоступен")
|
||
|
||
|
||
if __name__ == "__main__":
    # Test 1: health check.
    test_health()

    # Tests 2 and 3: a court website (protected) without link extraction,
    # then an ordinary website with link extraction. A blank line is printed
    # before each run.
    parse_cases = (
        (
            "https://mos-sud.ru/312/cases/civil/details/7b8a110a-162d-4493-88b0-e505523c9935?uid=77MS0312-01-2025-002929-35&formType=fullForm",
            False,
        ),
        ("https://example.com", True),
    )
    for target_url, want_links in parse_cases:
        print()
        test_parse(target_url, extract_links=want_links)
|
||
|
||
|
||
|
||
|