✨ Major Features: - Complete RAG system for hotel website analysis - Hybrid audit with BGE-M3 embeddings + Natasha NER - Universal horizontal Excel reports with dashboards - Multi-region processing (SPb, Orel, Chukotka, Kamchatka) 📊 Completed Regions: - Орловская область: 100% (36/36) - Чукотский АО: 100% (4/4) - г. Санкт-Петербург: 93% (893/960) - Камчатский край: 87% (89/102) 🔧 Infrastructure: - PostgreSQL with pgvector extension - BGE-M3 embeddings API - Browserless for web scraping - N8N workflows for automation - S3/Nextcloud file storage 📝 Documentation: - Complete DB schemas - API documentation - Setup guides - Status reports
303 lines
11 KiB
Python
303 lines
11 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Тестирование с headless=true и максимальной маскировкой
|
||
"""
|
||
|
||
import asyncio
|
||
from playwright.async_api import async_playwright
|
||
from playwright_stealth import Stealth
|
||
import random
|
||
|
||
# Target page every test method tries to load: a civil-case details page on
# mos-sud.ru, requested with formType=fullForm.
URL = "https://mos-sud.ru/312/cases/civil/details/7b8a110a-162d-4493-88b0-e505523c9935?uid=77MS0312-01-2025-002929-35&formType=fullForm"
|
||
|
||
async def test_method_1_stealth_advanced():
    """Method 1: headless Chromium with maximum masking plus Stealth.

    Launches headless Chromium with automation-hiding flags, creates a context
    that presents a Windows desktop Chrome user agent with ru-RU locale,
    Europe/Moscow timezone, a fixed geolocation and browser-like HTTP headers,
    applies ``playwright_stealth`` to the page, injects an extra init script
    that overrides several ``navigator`` properties, then loads ``URL`` and
    reads the visible text of ``<body>``.

    Returns:
        A ``(success, text)`` tuple:
        ``(True, text)`` when the HTTP status is 200 and the body text is
        longer than 100 characters; ``(False, text)`` when the page loaded but
        did not meet that bar; ``(False, None)`` when any exception was raised
        (the error is printed, truncated to 100 characters).
    """
    print("═"*80)
    print("🥷 МЕТОД 1: МАКСИМАЛЬНАЯ МАСКИРОВКА + STEALTH")
    print("═"*80)

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-web-security',
                    '--disable-features=site-per-process',
                    '--window-size=1920,1080',
                    # NOTE(review): this launch-level UA is shorter than the
                    # context-level user_agent set below; confirm which one
                    # is intended to win.
                    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                ]
            )
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                    viewport={'width': 1920, 'height': 1080},
                    locale='ru-RU',
                    timezone_id='Europe/Moscow',
                    geolocation={'latitude': 55.7558, 'longitude': 37.6173},
                    permissions=['geolocation'],
                    extra_http_headers={
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Language': 'ru-RU,ru;q=0.9',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'DNT': '1',
                        'Connection': 'keep-alive',
                        'Upgrade-Insecure-Requests': '1'
                    }
                )

                page = await context.new_page()

                # Apply the Stealth evasions to this page.
                stealth = Stealth()
                await stealth.apply_stealth_async(page)

                # Extra overrides injected before any page script runs.
                await page.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
                    Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
                    Object.defineProperty(navigator, 'languages', {get: () => ['ru-RU', 'ru']});
                    window.chrome = {runtime: {}, loadTimes: function() {}, csi: function() {}};
                """)

                print(" 🌐 Загружаем страницу...")
                response = await page.goto(URL, wait_until='domcontentloaded', timeout=30000)
                # Fixed pause after navigation before reading the body;
                # presumably gives client-side scripts time to finish.
                await asyncio.sleep(7)

                text = await page.inner_text('body')
                # page.goto may return None; treat that as "no status"
                # instead of raising AttributeError.
                status = response.status if response is not None else None
            finally:
                # Close the browser even when navigation or extraction fails.
                await browser.close()

        print(f" 📊 Статус: {status}")
        print(f" 📝 Текст: {len(text)} символов")
        print(f" 📄 Превью: {text[:150]}")

        if status == 200 and len(text) > 100:
            print(" ✅ УСПЕХ!")
            return True, text

        print(" ❌ Не сработало")
        return False, text

    except Exception as e:
        # Top-level boundary of one experiment: report and let the next
        # method run instead of aborting the whole script.
        print(f" ❌ Ошибка: {str(e)[:100]}")
        return False, None
|
||
|
||
async def test_method_2_firefox_headless():
    """Method 2: load ``URL`` in headless Firefox.

    Uses a context with a Windows Firefox 121 user agent, ru-RU locale and
    Europe/Moscow timezone, waits for ``networkidle``, pauses five seconds and
    reads the visible text of ``<body>``.

    Returns:
        A ``(success, text)`` tuple:
        ``(True, text)`` when the HTTP status is 200 and the body text is
        longer than 100 characters; ``(False, text)`` when the page loaded but
        did not meet that bar; ``(False, None)`` when any exception was raised
        (the error is printed, truncated to 100 characters).
    """
    print("═"*80)
    print("🦊 МЕТОД 2: FIREFOX HEADLESS")
    print("═"*80)

    try:
        async with async_playwright() as p:
            browser = await p.firefox.launch(headless=True)
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
                    locale='ru-RU',
                    timezone_id='Europe/Moscow'
                )

                page = await context.new_page()

                print(" 🌐 Загружаем через Firefox...")
                response = await page.goto(URL, wait_until='networkidle', timeout=30000)
                # Fixed pause after navigation before reading the body.
                await asyncio.sleep(5)

                text = await page.inner_text('body')
                # page.goto may return None; treat that as "no status"
                # instead of raising AttributeError.
                status = response.status if response is not None else None
            finally:
                # Close the browser even when navigation or extraction fails.
                await browser.close()

        print(f" 📊 Статус: {status}")
        print(f" 📝 Текст: {len(text)} символов")
        print(f" 📄 Превью: {text[:150]}")

        if status == 200 and len(text) > 100:
            print(" ✅ УСПЕХ!")
            return True, text

        print(" ❌ Не сработало")
        return False, text

    except Exception as e:
        # Top-level boundary of one experiment: report and let the next
        # method run instead of aborting the whole script.
        print(f" ❌ Ошибка: {str(e)[:100]}")
        return False, None
|
||
|
||
async def test_method_3_two_step():
    """Method 3: two-step load in headless Chromium.

    Opens the site's home page first and, in the same page, navigates to
    ``URL`` afterwards, so the second request is made from a browsing context
    that has already visited the site. Both navigations wait for
    ``networkidle``.

    Returns:
        A ``(success, text)`` tuple:
        ``(True, text)`` when the HTTP status of the second navigation is 200
        and the body text is longer than 100 characters; ``(False, text)``
        when the page loaded but did not meet that bar; ``(False, None)`` when
        any exception was raised (the error is printed, truncated to 100
        characters).
    """
    print("═"*80)
    print("🪜 МЕТОД 3: ДВУХШАГОВАЯ ЗАГРУЗКА")
    print("═"*80)

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled']
            )
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                )

                page = await context.new_page()

                print(" 📍 Шаг 1: Главная страница...")
                await page.goto('https://mos-sud.ru/', wait_until='networkidle', timeout=30000)
                await asyncio.sleep(3)

                print(" 📍 Шаг 2: Целевая страница...")
                response = await page.goto(URL, wait_until='networkidle', timeout=30000)
                # Fixed pause after navigation before reading the body.
                await asyncio.sleep(7)

                text = await page.inner_text('body')
                # page.goto may return None; treat that as "no status"
                # instead of raising AttributeError.
                status = response.status if response is not None else None
            finally:
                # Close the browser even when navigation or extraction fails.
                await browser.close()

        print(f" 📊 Статус: {status}")
        print(f" 📝 Текст: {len(text)} символов")
        print(f" 📄 Превью: {text[:150]}")

        if status == 200 and len(text) > 100:
            print(" ✅ УСПЕХ!")
            return True, text

        print(" ❌ Не сработало")
        return False, text

    except Exception as e:
        # Top-level boundary of one experiment: report and let the next
        # method run instead of aborting the whole script.
        print(f" ❌ Ошибка: {str(e)[:100]}")
        return False, None
|
||
|
||
async def test_method_4_webkit():
    """Method 4: load ``URL`` in headless WebKit (Safari engine).

    Uses a context with a macOS user agent and ru-RU locale, waits for
    ``domcontentloaded``, pauses five seconds and reads the visible text of
    ``<body>``.

    Returns:
        A ``(success, text)`` tuple:
        ``(True, text)`` when the HTTP status is 200 and the body text is
        longer than 100 characters; ``(False, text)`` when the page loaded but
        did not meet that bar; ``(False, None)`` when any exception was raised
        (the error is printed, truncated to 100 characters).
    """
    print("═"*80)
    print("🌐 МЕТОД 4: WEBKIT (Safari)")
    print("═"*80)

    try:
        async with async_playwright() as p:
            browser = await p.webkit.launch(headless=True)
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15',
                    locale='ru-RU'
                )

                page = await context.new_page()

                print(" 🌐 Загружаем через WebKit...")
                response = await page.goto(URL, wait_until='domcontentloaded', timeout=30000)
                # Fixed pause after navigation before reading the body.
                await asyncio.sleep(5)

                text = await page.inner_text('body')
                # page.goto may return None; treat that as "no status"
                # instead of raising AttributeError.
                status = response.status if response is not None else None
            finally:
                # Close the browser even when navigation or extraction fails.
                await browser.close()

        print(f" 📊 Статус: {status}")
        print(f" 📝 Текст: {len(text)} символов")
        print(f" 📄 Превью: {text[:150]}")

        if status == 200 and len(text) > 100:
            print(" ✅ УСПЕХ!")
            return True, text

        print(" ❌ Не сработало")
        return False, text

    except Exception as e:
        # Top-level boundary of one experiment: report and let the next
        # method run instead of aborting the whole script.
        print(f" ❌ Ошибка: {str(e)[:100]}")
        return False, None
|
||
|
||
async def main():
    """Run every bypass method in sequence and print a summary.

    Each method is awaited one after another with a two-second pause between
    them. Afterwards a per-method verdict table is printed. If at least one
    method succeeded, the first 1000 characters of its text are shown;
    otherwise a list of remaining options is printed.
    """
    banner = "🥷"*40
    rule = "═"*80

    print(banner)
    print()
    print(" ТЕСТИРОВАНИЕ ОБХОДА ЗАЩИТЫ (HEADLESS MODE)")
    print()
    print(banner)
    print()

    strategies = (
        ("Stealth + Маскировка", test_method_1_stealth_advanced),
        ("Firefox", test_method_2_firefox_headless),
        ("Двухшаговая загрузка", test_method_3_two_step),
        ("WebKit (Safari)", test_method_4_webkit),
    )

    outcomes = {}
    for label, run in strategies:
        print()
        ok, body = await run()
        outcomes[label] = {'success': ok, 'text': body}
        print()
        # Short pause between attempts.
        await asyncio.sleep(2)

    # Summary table
    print(rule)
    print("📊 ИТОГОВЫЕ РЕЗУЛЬТАТЫ")
    print(rule)
    print()

    for label, outcome in outcomes.items():
        if outcome['success']:
            verdict = "✅ РАБОТАЕТ"
        else:
            verdict = "❌ НЕ РАБОТАЕТ"
        print(f" {label:30s} {verdict}")

    print()
    print(rule)
    print()

    # Keep only the methods that worked, in run order.
    winners = [(label, outcome) for label, outcome in outcomes.items() if outcome['success']]

    if winners:
        print("🎉 НАЙДЕН РАБОЧИЙ МЕТОД!")
        for label, outcome in winners:
            print(f"\n✅ {label} - УСПЕШНО!")
            print(f"\nКОНТЕНТ:\n{'-'*80}")
            print(outcome['text'][:1000])
            print('-'*80)
    else:
        # An empty entry prints a blank line, exactly like a bare print().
        advice = (
            "💡 ВСЕ МЕТОДЫ ВЕРНУЛИ 403",
            "",
            "Сайт mos-sud.ru имеет ОЧЕНЬ сильную защиту WAF.",
            "",
            "🔐 ОСТАВШИЕСЯ ВАРИАНТЫ:",
            "",
            " 1. 🌐 Residential прокси ($50-200/мес)",
            " - Выглядят как домашние пользователи",
            " - Обходят 99% защит",
            "",
            " 2. 🔐 VPN через российский сервер",
            " - Меняет IP на российский",
            " - Может помочь с геоблокировкой",
            "",
            " 3. 🍪 Экспорт cookies из реального браузера",
            " - Открыть сайт вручную",
            " - Экспортировать cookies",
            " - Использовать в парсере",
            "",
            " 4. 📧 Официальный API доступ",
            " - Запросить у суда API ключ",
            " - Для исследовательских целей",
            "",
        )
        for line in advice:
            print(line)

    print(rule)
|
||
|
||
# Script entry point: run the async test suite only when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
|
||
|
||
|
||
|
||
|