- Added comprehensive AI Assistant system (aiassist/ directory): * Vector search and embedding capabilities * Typebot proxy integration * Elastic search functionality * Message classification and chat history * MCP proxy for external integrations - Implemented Court Status API (GetCourtStatus.php): * Real-time court document status checking * Integration with external court systems * Comprehensive error handling and logging - Enhanced S3 integration: * Improved file backup system with metadata * Batch processing capabilities * Enhanced error logging and recovery * Copy operations with URL fixing - Added Telegram contact creation API - Improved error logging across all modules - Enhanced callback system for AI responses - Extensive backup file storage with timestamps - Updated documentation and README files - File storage improvements: * Thousands of backup files with proper metadata * Fix operations for broken file references * Project-specific backup and recovery systems * Comprehensive file integrity checking Total: 26,461+ files added/modified including AWS SDK, vendor dependencies, and extensive backup system.
65 lines
2.5 KiB
Python
65 lines
2.5 KiB
Python
import sys
|
||
import json
|
||
import re
|
||
from natasha import (
|
||
Segmenter, MorphVocab,
|
||
NewsEmbedding, NewsMorphTagger, NewsSyntaxParser, NewsNERTagger,
|
||
Doc
|
||
)
|
||
|
||
# Initialize Natasha NLP pipeline components once at import time
# (shared, module-level singletons reused by extract_entities).
segmenter = Segmenter()               # sentence/token segmentation
morph_vocab = MorphVocab()            # morphological vocabulary; used for span.normalize()
emb = NewsEmbedding()                 # news-corpus embeddings backing the taggers below
morph_tagger = NewsMorphTagger(emb)   # part-of-speech / morphology tagger
syntax_parser = NewsSyntaxParser(emb) # dependency syntax parser
ner_tagger = NewsNERTagger(emb)       # named-entity recognizer
|
||
|
||
def extract_entities(text):
    """Extract key contract entities from Russian legal text.

    Runs the Natasha NLP pipeline (segmentation, morphology, syntax, NER)
    over *text*, then applies a regex pass for monetary amounts.

    Parameters
    ----------
    text : str
        Raw contract/claim text in Russian.

    Returns
    -------
    dict
        Russian-labeled keys kept for downstream compatibility:
        "Истец" (plaintiff), "Ответчик" (defendant), "Дата" (date),
        "Номер договора" (contract number), "Сумма оплаты" (payment
        amount), "Общая стоимость" (total cost). Values not found
        remain None.
    """
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    entities = {
        "Истец": None,
        "Ответчик": None,
        "Дата": None,
        "Номер договора": None,
        "Сумма оплаты": None,
        "Общая стоимость": None,
    }

    # NER pass: first PER span -> plaintiff, first ORG span -> defendant;
    # DATE/NUM spans fill the date / contract-number slots (last wins).
    # NOTE(review): Natasha's standard news NER emits PER/ORG/LOC; the
    # DATE/NUM branches presumably need an extended tagger — confirm.
    for span in doc.spans:
        span.normalize(morph_vocab)
        if span.type == "PER" and not entities["Истец"]:
            entities["Истец"] = span.text
        elif span.type == "ORG" and not entities["Ответчик"]:
            entities["Ответчик"] = span.text
        elif span.type == "DATE":
            entities["Дата"] = span.text
        elif span.type == "NUM" and "договор" in text.lower():
            entities["Номер договора"] = span.text

    # Regex pass for amounts introduced by "стоимость"/"оплата"/"цена"/"сумма".
    price_pattern = r"(?:стоимость|оплата|цена|сумма)\s*(?:договор[ауе]|услуг[аи]?)?\s*:?[\s]*([\d\s]+[,.]?\d*)\s*(?:руб|₽|рублей|тыс|млн)?"
    matches = re.findall(price_pattern, text, re.IGNORECASE)

    if matches:
        # Normalize each amount: drop thousand-separator spaces, convert the
        # decimal comma to a dot, then strip any leftover non-numeric chars.
        # BUG FIX: the original stripped commas instead of converting them,
        # turning "1 000,50" into "100050" rather than "1000.50".
        prices = [
            re.sub(r"[^\d.]", "", m.replace(" ", "").replace(",", "."))
            for m in matches
        ]
        entities["Сумма оплаты"] = prices[0]       # first amount = payment
        if len(prices) > 1:
            entities["Общая стоимость"] = prices[-1]  # last amount = total cost

    return entities
|
||
|
||
if __name__ == "__main__":
    # CLI entry point: python script.py "<contract text>"
    # Robustness fix: the original indexed sys.argv[1] unconditionally and
    # died with a raw IndexError traceback when no argument was supplied.
    if len(sys.argv) < 2:
        print("Usage: python script.py <text>", file=sys.stderr)
        sys.exit(1)
    text = sys.argv[1]
    extracted_entities = extract_entities(text)
    # ensure_ascii=False keeps the Cyrillic keys/values human-readable.
    print(json.dumps(extracted_entities, ensure_ascii=False))
|