Files
crm.clientright.ru/crm_extensions/pdf_court_parser.py
Fedor ac7467f0b4 Major CRM updates: AI Assistant, Court Status API, S3 integration improvements, and extensive file storage system
- Added comprehensive AI Assistant system (aiassist/ directory):
  * Vector search and embedding capabilities
  * Typebot proxy integration
  * Elastic search functionality
  * Message classification and chat history
  * MCP proxy for external integrations

- Implemented Court Status API (GetCourtStatus.php):
  * Real-time court document status checking
  * Integration with external court systems
  * Comprehensive error handling and logging

- Enhanced S3 integration:
  * Improved file backup system with metadata
  * Batch processing capabilities
  * Enhanced error logging and recovery
  * Copy operations with URL fixing

- Added Telegram contact creation API
- Improved error logging across all modules
- Enhanced callback system for AI responses
- Extensive backup file storage with timestamps
- Updated documentation and README files

- File storage improvements:
  * Thousands of backup files with proper metadata
  * Fix operations for broken file references
  * Project-specific backup and recovery systems
  * Comprehensive file integrity checking

Total: 26,461+ files added/modified including AWS SDK, vendor dependencies, and extensive backup system.
2025-10-16 11:17:21 +03:00

125 lines
4.0 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Парсер судебных документов из PDF файлов
Использование: python3 pdf_court_parser.py "путь_к_pdf_файлу"
"""
import json
import sys
import os
import subprocess
from court_document_parser import CourtDocumentParser
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF file, trying several backends in order.

    Backends are tried in this order:

    1. The ``pdftotext`` command-line tool (poppler-utils), in layout mode.
    2. The ``pdfplumber`` package.
    3. The ``PyPDF2`` package.

    A later backend is only tried when the previous one is unavailable
    (``pdftotext`` missing or exiting with a non-zero status, or the Python
    package not importable).

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The extracted text as a string (possibly empty), or ``None`` when
        none of the backends is available.

    Raises:
        Any exception raised by pdfplumber / PyPDF2 (or by opening the file)
        while processing the document is propagated to the caller.
    """
    try:
        # pdftotext writes UTF-8 by default, so decode explicitly instead of
        # relying on the locale's preferred encoding (which may not be UTF-8
        # when the script is launched from another service).
        result = subprocess.run(
            ['pdftotext', '-layout', pdf_path, '-'],
            capture_output=True,
            encoding='utf-8',
            errors='replace',
            check=True,
        )
        return result.stdout
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Tool is missing or failed on this file: fall back to pure Python.
        pass

    try:
        import pdfplumber
    except ImportError:
        pass
    else:
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may yield None for pages without text.
            return "".join(page.extract_text() or "" for page in pdf.pages)

    try:
        import PyPDF2
    except ImportError:
        return None

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Guard against None exactly like the pdfplumber branch does.
        return "".join(page.extract_text() or "" for page in reader.pages)
def _print_error(message):
    """Print a JSON error payload to stdout and return the failure exit code (1)."""
    error_result = {
        'error': message,
        'status': 'error'
    }
    print(json.dumps(error_result, ensure_ascii=False, indent=2))
    return 1


def main():
    """Parse the PDF given on the command line and print the result as JSON.

    The PDF path is read from ``sys.argv[1]``. Its text is extracted with
    ``extract_text_from_pdf`` and handed to
    ``CourtDocumentParser.parse_documents`` as a single document of the form
    ``{"combinedText": text}``. The first entry of the parser's result is
    annotated with the ``pdf_file`` and ``extracted_text_length`` keys before
    the whole result is printed to stdout.

    Every failure is reported on stdout as a JSON object with ``error`` and
    ``status`` keys, so the caller always receives JSON.

    Returns:
        0 on success, 1 on any error (suitable for ``sys.exit``).
    """
    try:
        # The path to the PDF comes from the command line.
        if len(sys.argv) < 2:
            return _print_error('Не указан путь к PDF файлу')

        pdf_path = sys.argv[1]

        # A directory "exists" too, but cannot be parsed: require a regular file.
        if not os.path.isfile(pdf_path):
            return _print_error(f'Файл не найден: {pdf_path}')

        text = extract_text_from_pdf(pdf_path)

        # None means that no extraction backend is available at all.
        if text is None:
            return _print_error('Не удалось извлечь текст из PDF. Установите poppler-utils, pdfplumber или PyPDF2')

        if not text.strip():
            return _print_error('PDF файл не содержит текста')

        # The parser is created inside the try block and only once it is
        # needed, so a failing constructor is reported as JSON as well.
        document_parser = CourtDocumentParser()
        results = document_parser.parse_documents([{"combinedText": text}])

        # Attach information about the source file to the first result.
        results[0]['pdf_file'] = pdf_path
        results[0]['extracted_text_length'] = len(text)

        print(json.dumps(results, ensure_ascii=False, indent=2))
        return 0
    except Exception as e:
        # Top-level boundary: turn any unexpected failure into a JSON error.
        return _print_error(f'Ошибка обработки: {str(e)}')
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())