241 lines
8.9 KiB
Python
241 lines
8.9 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Перепроверка отелей с неясным результатом РКН
|
|||
|
|
С улучшенным распознаванием разных форматов
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import asyncio
import logging
import os
import re
from datetime import datetime
from urllib.parse import unquote

import psycopg2
from playwright.async_api import async_playwright
from psycopg2.extras import RealDictCursor
|
|||
|
|
|
|||
|
|
# Logging setup: each run writes to its own timestamped log file and mirrors
# the same records to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: the log messages contain Cyrillic text and emoji,
        # which a non-UTF-8 platform default encoding cannot represent.
        logging.FileHandler(
            f'rkn_recheck_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            encoding='utf-8',
        ),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Database connection settings, passed as keyword arguments to
# psycopg2.connect(). Every value can be overridden through an RKN_DB_*
# environment variable; the defaults are the values this script has always
# used, so existing deployments keep working unchanged.
# NOTE(review): the default password is a secret committed to source control.
# It should be rotated and supplied only via RKN_DB_PASSWORD.
DB_CONFIG = {
    'host': os.environ.get('RKN_DB_HOST', "147.45.189.234"),
    'port': int(os.environ.get('RKN_DB_PORT', 5432)),
    'database': os.environ.get('RKN_DB_NAME', "default_db"),
    'user': os.environ.get('RKN_DB_USER', "gen_user"),
    # The default is stored percent-encoded and decoded here.
    'password': os.environ.get('RKN_DB_PASSWORD', unquote("2~~9_%5EkVsU%3F2%5CS")),
}

# Pause, in seconds, inserted before every registry request.
REQUEST_DELAY = 2
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Registration-number patterns, tried in order against the page HTML.
# The first three entries reproduce the original lookup order; the last one
# covers the "10-0107355" format, which none of the others can match.
_REG_NUMBER_PATTERNS = (
    re.compile(r'(\d{2}-\d{2}-\d{6,7})'),          # 41-14-000746
    re.compile(r'(\d{2}-\d{4}-\d{6,7})'),          # 10-0107-355555
    re.compile(r'href="\?id=([^"]+)"'),            # from a link: ?id=41-14-000746
    re.compile(r'(?<!\d)(\d{2}-\d{7})(?!\d)'),     # 10-0107355
)

# Registration-date patterns, tried in order against the visible page text.
_REG_DATE_PATTERNS = (
    re.compile(r'Приказ[^0-9]*(\d{2}\.\d{2}\.\d{4})'),
    re.compile(r'(\d{2}\.\d{2}\.\d{4})'),          # any DD.MM.YYYY date
)


def _first_group(patterns, haystack):
    """Return group 1 of the first pattern that matches *haystack*, else None."""
    for pattern in patterns:
        match = pattern.search(haystack)
        if match:
            return match.group(1)
    return None


async def check_inn_improved(page, inn: str) -> dict:
    """Look up an INN in the RKN operators registry and classify the result.

    Args:
        page: Playwright page used to load the registry search URL.
        inn: Taxpayer number (INN) to search for.

    Returns:
        A dict that always has ``found`` and ``status``:

        * ``found=True, status='found'`` plus ``reg_number`` and ``reg_date``
          (either may be ``None`` when not recognised);
        * ``found=False, status='not_found'``;
        * ``found=None, status='unclear'`` plus ``message`` and ``html_saved``
          when a results table is present but nothing could be parsed (the
          page HTML is written to ``rkn_unclear_<inn>.html``);
        * ``found=False, status='error'`` plus ``message`` on a non-200
          response, a missing response, an empty INN, or any exception.

    The function never raises: every exception is logged and converted into
    an ``'error'`` result.
    """
    # Local import: the module only imports ``unquote`` at file level.
    from urllib.parse import quote

    try:
        # Normalise the INN so that None/blank values are reported as errors
        # instead of being sent to the registry as the literal text "None".
        inn = '' if inn is None else str(inn).strip()
        if not inn:
            return {'found': False, 'status': 'error', 'message': 'ИНН не указан'}

        # Percent-encode the value so unexpected characters cannot alter the query.
        url = (
            'https://pd.rkn.gov.ru/operators-registry/operators-list/'
            f'?act=search&inn={quote(inn, safe="")}'
        )

        logger.info(f" 🔍 Проверка ИНН: {inn}")

        # Throttle requests to the registry.
        await asyncio.sleep(REQUEST_DELAY)

        response = await page.goto(url, timeout=30000, wait_until='networkidle')

        # page.goto() may return None; treat that as an error rather than
        # failing on attribute access.
        if response is None:
            return {'found': False, 'status': 'error', 'message': 'Нет ответа от сервера'}
        if response.status != 200:
            return {'found': False, 'status': 'error', 'message': f'HTTP {response.status}'}

        # Short extra pause after the page has loaded.
        await asyncio.sleep(1)

        # Raw HTML is used for structural patterns, visible text for dates
        # and the "not found" marker.
        html = await page.content()
        text = await page.evaluate('() => document.body.innerText')

        # Check 1: explicit "not found" message (case-insensitive).
        if 'не найдено' in text.lower():
            logger.info(f" ❌ Не найден в реестре")
            return {'found': False, 'status': 'not_found'}

        # Check 2: registration number in any of the known formats.
        reg_number = _first_group(_REG_NUMBER_PATTERNS, html)

        # Check 3: registration date.
        reg_date = _first_group(_REG_DATE_PATTERNS, text)

        # Check 4: organisation name followed by this exact INN. The INN is
        # escaped, and a trailing digit is rejected so that a 10-digit INN
        # does not match the prefix of a longer one.
        org_pattern = (
            '(?:Общество|Индивидуальный предприниматель|Акционерное общество|ОБЩЕСТВО|ООО|ИП|АО)'
            f'[^<]*?ИНН:\\s*{re.escape(inn)}(?!\\d)'
        )
        org_match = re.search(org_pattern, html, re.IGNORECASE | re.DOTALL)

        if org_match or reg_number:
            logger.info(f" ✅ Найден: {reg_number or 'номер не распознан'} ({reg_date or 'дата не распознана'})")
            return {
                'found': True,
                'status': 'found',
                'reg_number': reg_number,
                'reg_date': reg_date
            }

        # Check 5: a results table exists but nothing above recognised it.
        if 'class="TblList"' in html or 'id="ResList' in html:
            logger.info(f" ⚠️ Таблица найдена, но данные не распознаны")

            # Keep the HTML so the page can be analysed by hand.
            saved_path = f'rkn_unclear_{inn}.html'
            with open(saved_path, 'w', encoding='utf-8') as f:
                f.write(html)

            return {
                'found': None,
                'status': 'unclear',
                'message': 'Таблица найдена, но данные не распознаны',
                'html_saved': saved_path
            }

        logger.info(f" ❌ Результаты не найдены")
        return {'found': False, 'status': 'not_found'}

    except Exception as e:
        # Boundary handler: the caller expects a result dict for every hotel,
        # so failures are reported as an 'error' result instead of raising.
        logger.error(f" ✗ Ошибка: {e}")
        return {'found': False, 'status': 'error', 'message': str(e)}
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def main():
    """Re-check every hotel of one region whose RKN status is 'unclear'.

    The region name is taken from the first command-line argument and
    defaults to 'Камчатский край'. For each matching row in ``hotel_main``
    the owner's INN is looked up with ``check_inn_improved`` and the row's
    ``rkn_registry_status``, ``rkn_registry_number``, ``rkn_registry_date``
    and ``rkn_checked_at`` columns are updated and committed immediately, so
    progress made before a failure is kept. A summary is logged at the end.
    """
    import sys

    region = sys.argv[1] if len(sys.argv) > 1 else 'Камчатский край'

    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)

    try:
        # Fetch the hotels whose previous check was inconclusive. The cursor
        # is context-managed so it is closed even if the query fails.
        with conn.cursor() as cur:
            cur.execute('''
                SELECT id, full_name, owner_inn, website_address
                FROM hotel_main
                WHERE region_name ILIKE %s
                AND rkn_registry_status = 'unclear'
                ORDER BY full_name
            ''', (f'%{region}%',))
            hotels = cur.fetchall()

        logger.info(f"\n{'='*70}")
        logger.info(f"🔄 ПЕРЕПРОВЕРКА НЕЯСНЫХ РЕЗУЛЬТАТОВ: {region}")
        logger.info(f"📊 Отелей для перепроверки: {len(hotels)}")
        logger.info(f"⏱️ Примерное время: {len(hotels) * REQUEST_DELAY / 60:.1f} минут")
        logger.info(f"{'='*70}\n")

        if not hotels:
            logger.info("✅ Нет отелей для перепроверки!")
            return

        # Per-outcome counters for the final summary.
        results = {
            'found': 0,
            'not_found': 0,
            'still_unclear': 0,
            'error': 0
        }

        # Open the browser.
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()

                await page.set_viewport_size({"width": 1920, "height": 1080})
                await page.set_extra_http_headers({
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })

                for i, hotel in enumerate(hotels, 1):
                    logger.info(f"\n[{i}/{len(hotels)}] {'='*50}")
                    logger.info(f"🏨 {hotel['full_name']}")
                    logger.info(f"🌐 {hotel['website_address']}")
                    logger.info(f"🔢 ИНН: {hotel['owner_inn']}")

                    # Run the check.
                    result = await check_inn_improved(page, hotel['owner_inn'])

                    # Persist the outcome right away (one commit per hotel).
                    # NOTE(review): reg_date is passed as a 'DD.MM.YYYY'
                    # string; confirm the column type accepts that format.
                    with conn.cursor() as cur:
                        cur.execute('''
                            UPDATE hotel_main
                            SET
                                rkn_registry_status = %s,
                                rkn_registry_number = %s,
                                rkn_registry_date = %s,
                                rkn_checked_at = %s
                            WHERE id = %s
                        ''', (
                            result['status'],
                            result.get('reg_number'),
                            result.get('reg_date'),
                            datetime.now(),
                            hotel['id']
                        ))
                    conn.commit()

                    # Update the statistics. 'found' is tri-state:
                    # True / False / None (None means still unclear).
                    if result['found'] is True:
                        results['found'] += 1
                    elif result['found'] is False:
                        if result['status'] == 'not_found':
                            results['not_found'] += 1
                        else:
                            results['error'] += 1
                    else:
                        results['still_unclear'] += 1
            finally:
                # Close the browser even when a check or a DB update raises.
                await browser.close()

        # Summary.
        logger.info(f"\n{'='*70}")
        logger.info("📊 ИТОГИ ПЕРЕПРОВЕРКИ:")
        logger.info(f" ✅ Теперь найдено: {results['found']}")
        logger.info(f" ❌ Не найдено: {results['not_found']}")
        logger.info(f" ❓ Все еще неясно: {results['still_unclear']}")
        logger.info(f" ⚠️ Ошибки: {results['error']}")
        logger.info(f"{'='*70}")

    finally:
        conn.close()


# Script entry point: run the async main() when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|