- Added comprehensive AI Assistant system (aiassist/ directory): * Vector search and embedding capabilities * Typebot proxy integration * Elastic search functionality * Message classification and chat history * MCP proxy for external integrations - Implemented Court Status API (GetCourtStatus.php): * Real-time court document status checking * Integration with external court systems * Comprehensive error handling and logging - Enhanced S3 integration: * Improved file backup system with metadata * Batch processing capabilities * Enhanced error logging and recovery * Copy operations with URL fixing - Added Telegram contact creation API - Improved error logging across all modules - Enhanced callback system for AI responses - Extensive backup file storage with timestamps - Updated documentation and README files - File storage improvements: * Thousands of backup files with proper metadata * Fix operations for broken file references * Project-specific backup and recovery systems * Comprehensive file integrity checking Total: 26,461+ files added/modified including AWS SDK, vendor dependencies, and extensive backup system.
125 lines
4.0 KiB
Python
Executable File
125 lines
4.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Парсер судебных документов из PDF файлов
|
||
Использование: python3 pdf_court_parser.py "путь_к_pdf_файлу"
|
||
"""
|
||
|
||
import json
|
||
import sys
|
||
import os
|
||
import subprocess
|
||
from court_document_parser import CourtDocumentParser
|
||
|
||
|
||
def _run_pdftotext(pdf_path):
    """Extract text with the ``pdftotext`` command-line tool.

    Returns the tool's stdout as a string, or None when the executable is
    missing or exits with a non-zero status.
    """
    try:
        completed = subprocess.run(
            # Ask for UTF-8 explicitly and decode it as UTF-8, so the result
            # does not depend on the tool's default or on the process locale.
            ['pdftotext', '-layout', '-enc', 'UTF-8', pdf_path, '-'],
            capture_output=True,
            text=True,
            encoding='utf-8',
            check=True,
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        return None
    return completed.stdout


def extract_text_from_pdf(pdf_path):
    """Extract the text content of a PDF file.

    Backends are tried in order and the first usable one wins:

    1. the ``pdftotext`` executable (poppler-utils),
    2. the ``pdfplumber`` package,
    3. the ``PyPDF2`` package.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The extracted text as a single string, or None when ``pdftotext``
        could not be used and neither Python package can be imported.

    Raises:
        Any non-import error raised by ``pdfplumber`` or ``PyPDF2`` while
        reading the file is propagated to the caller.
    """
    text = _run_pdftotext(pdf_path)
    if text is not None:
        return text

    # Fallback 1: pdfplumber. Only the import is guarded, so real parsing
    # errors are not mistaken for "library not installed".
    try:
        import pdfplumber
    except ImportError:
        pdfplumber = None
    if pdfplumber is not None:
        with pdfplumber.open(pdf_path) as pdf:
            return "".join(page.extract_text() or "" for page in pdf.pages)

    # Fallback 2: PyPDF2.
    try:
        import PyPDF2
    except ImportError:
        return None
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Same "or ''" guard as the pdfplumber branch, so a page that yields
        # no text can never break the concatenation.
        return "".join(page.extract_text() or "" for page in reader.pages)
|
||
|
||
|
||
def _emit_error(message):
    """Print a JSON error payload to stdout and return the failure exit code 1."""
    payload = {
        'error': message,
        'status': 'error'
    }
    print(json.dumps(payload, ensure_ascii=False, indent=2))
    return 1


def main():
    """Parse the PDF named on the command line and print the result as JSON.

    Reads the PDF path from ``sys.argv[1]``, extracts its text with
    ``extract_text_from_pdf``, and passes it to
    ``CourtDocumentParser.parse_documents`` as ``{"combinedText": text}``.
    The first parse result gets two extra keys, ``pdf_file`` and
    ``extracted_text_length``, and the results are printed to stdout as JSON.

    Every failure is reported on stdout as a JSON object with ``error`` and
    ``status`` keys.

    Returns:
        0 on success, 1 on any error.
    """
    try:
        # The PDF path is the first command-line argument.
        if len(sys.argv) < 2:
            return _emit_error('Не указан путь к PDF файлу')

        pdf_path = sys.argv[1]

        # isfile (not exists): a directory is not a readable PDF either.
        if not os.path.isfile(pdf_path):
            return _emit_error(f'Файл не найден: {pdf_path}')

        text = extract_text_from_pdf(pdf_path)

        # None means no extraction backend was available.
        if text is None:
            return _emit_error('Не удалось извлечь текст из PDF. Установите poppler-utils, pdfplumber или PyPDF2')

        if not text.strip():
            return _emit_error('PDF файл не содержит текста')

        # Build the parser inside the try block and only once it is needed,
        # so a constructor failure is also reported as a JSON error.
        document_parser = CourtDocumentParser()
        results = document_parser.parse_documents([{"combinedText": text}])

        # Attach file information to the first result.
        # NOTE(review): assumes parse_documents returns one dict per input
        # document - confirm against CourtDocumentParser.
        results[0]['pdf_file'] = pdf_path
        results[0]['extracted_text_length'] = len(text)

        print(json.dumps(results, ensure_ascii=False, indent=2))
        return 0

    except Exception as e:
        # Top-level boundary: turn any unexpected failure into a JSON error.
        return _emit_error(f'Ошибка обработки: {e}')
|
||
|
||
|
||
if __name__ == '__main__':
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|