🚀 Full project sync: Hotels RAG & Audit System
✨ Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
This commit is contained in:
112
test_parser_api.py
Executable file
112
test_parser_api.py
Executable file
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Тестовый клиент для Universal Parser API
|
||||
"""
|
||||
|
||||
import json
import os

import requests
|
||||
|
||||
# Configuration: base URL and API key of the parser service under test.
# Both values can be overridden through the environment so that a real key
# does not have to be committed to the repository; the defaults keep the
# original local-development values, so existing usage is unchanged.
API_URL = os.environ.get("PARSER_API_URL", "http://localhost:8003")
API_KEY = os.environ.get("PARSER_API_KEY", "parser_2025_secret_key_a8f3d9c1b4e7")
|
||||
|
||||
def test_parse(url: str, extract_links: bool = False):
    """Send one page to the parser API and print a readable report.

    Posts ``url`` to ``{API_URL}/parse`` with a fixed payload (5 second wait,
    JavaScript enabled, no screenshot) and prints the reported status code,
    title, text length, parsing time and the first 1000 characters of the
    extracted text. Nothing is returned: every outcome, including failures,
    is printed to stdout.

    Args:
        url: Address of the page to parse.
        extract_links: Ask the API to extract links. When true and the
            response carries a non-empty ``links`` list, the first 10 links
            are printed followed by a count of the remaining ones.
    """
    banner = "═" * 80
    print(banner)
    print(f"🔍 ТЕСТИРУЕМ ПАРСИНГ: {url}")
    print(banner)
    print()

    headers = {
        "X-API-Key": API_KEY,
        "Content-Type": "application/json",
    }

    payload = {
        "url": url,
        "wait_seconds": 5,
        "extract_links": extract_links,
        "screenshot": False,
        "javascript_enabled": True,
    }

    try:
        print("📤 Отправляем запрос...")
        response = requests.post(
            f"{API_URL}/parse",
            headers=headers,
            json=payload,
            timeout=60,
        )

        if response.status_code == 200:
            data = response.json()

            print("✅ Успех!")
            print()
            print("📊 РЕЗУЛЬТАТЫ:")
            # Fields are read with .get(): a response that lacks one of them
            # (for example a failed parse) must still reach the branch below
            # that prints the API's own error text.
            print(f" Status Code: {data.get('status_code')}")
            print(f" Title: {data.get('title')}")
            print(f" Текст: {data.get('text_length') or 0:,} символов")
            print(f" Время: {data.get('parsing_time')}с")
            print()

            if data.get('success'):
                print("📄 ПРЕВЬЮ КОНТЕНТА:")
                print("-" * 80)
                print((data.get('text') or "")[:1000])
                print("-" * 80)

                links = data.get('links')
                if extract_links and links:
                    print()
                    print(f"🔗 Найдено ссылок: {len(links)}")
                    for i, link in enumerate(links[:10], 1):
                        print(f" {i}. {link}")
                    if len(links) > 10:
                        print(f" ... и ещё {len(links) - 10}")
            else:
                print(f"❌ Ошибка: {data.get('error')}")

        else:
            print(f"❌ HTTP {response.status_code}")
            print(response.text)

    # Only transport failures and an undecodable JSON body are expected here;
    # anything else is a programming error and should surface as a traceback.
    except (requests.RequestException, ValueError) as e:
        print(f"❌ Ошибка: {e}")

    print()
    print(banner)
|
||||
|
||||
|
||||
def test_health():
    """Query ``{API_URL}/health`` and print whether the API is reachable.

    Prints the ``status`` and ``version`` fields of the JSON response on
    HTTP 200, and an "unavailable" message otherwise. Connection failures
    and timeouts are reported the same way instead of raising, so the
    script can report a down API rather than crash. Nothing is returned.
    """
    print("🏥 Проверка здоровья API...")

    try:
        # An explicit timeout keeps the check from hanging forever when the
        # host accepts the connection but never answers.
        response = requests.get(f"{API_URL}/health", timeout=10)
    except requests.RequestException as e:
        print(f"❌ API недоступен: {e}")
        return

    if response.status_code != 200:
        print("❌ API недоступен")
        return

    try:
        data = response.json()
    except ValueError as e:
        # HTTP 200 with a non-JSON body: the service answered, but not as
        # the expected API.
        print(f"❌ API недоступен: {e}")
        return

    print(f"✅ API работает: {data.get('status')}")
    print(f" Версия: {data.get('version')}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Test 1: health check.
    test_health()
    print()

    # Tests 2 and 3 as (url, extract_links) pairs:
    #   - a court website (protected), without link extraction;
    #   - an ordinary site, with link extraction.
    parse_cases = (
        (
            "https://mos-sud.ru/312/cases/civil/details/7b8a110a-162d-4493-88b0-e505523c9935?uid=77MS0312-01-2025-002929-35&formType=fullForm",
            False,
        ),
        ("https://example.com", True),
    )

    for case_no, (target_url, want_links) in enumerate(parse_cases):
        # A blank line separates consecutive parse reports (none before the first).
        if case_no:
            print()
        test_parse(target_url, extract_links=want_links)
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user