Files
crm.clientright.ru/natasha_extractor.py
Fedor ac7467f0b4 Major CRM updates: AI Assistant, Court Status API, S3 integration improvements, and extensive file storage system
- Added comprehensive AI Assistant system (aiassist/ directory):
  * Vector search and embedding capabilities
  * Typebot proxy integration
  * Elastic search functionality
  * Message classification and chat history
  * MCP proxy for external integrations

- Implemented Court Status API (GetCourtStatus.php):
  * Real-time court document status checking
  * Integration with external court systems
  * Comprehensive error handling and logging

- Enhanced S3 integration:
  * Improved file backup system with metadata
  * Batch processing capabilities
  * Enhanced error logging and recovery
  * Copy operations with URL fixing

- Added Telegram contact creation API
- Improved error logging across all modules
- Enhanced callback system for AI responses
- Extensive backup file storage with timestamps
- Updated documentation and README files

- File storage improvements:
  * Thousands of backup files with proper metadata
  * Fix operations for broken file references
  * Project-specific backup and recovery systems
  * Comprehensive file integrity checking

Total: 26,461+ files added/modified including AWS SDK, vendor dependencies, and extensive backup system.
2025-10-16 11:17:21 +03:00

65 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import sys
import json
import re
from natasha import (
Segmenter, MorphVocab,
NewsEmbedding, NewsMorphTagger, NewsSyntaxParser, NewsNERTagger,
Doc
)
# Initialize the Natasha NLP pipeline components (module-level singletons,
# shared by extract_entities below).
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
def extract_entities(text):
    """Extract contract-related entities from Russian legal text.

    Runs the Natasha pipeline (segmentation, morphology, syntax, NER) over
    *text* and combines NER spans with a regex-based money extractor.

    Args:
        text: Source document text (Russian).

    Returns:
        dict with the fixed keys "Истец", "Ответчик", "Дата",
        "Номер договора", "Сумма оплаты", "Общая стоимость";
        values are strings or None when nothing was found.
    """
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    entities = {
        "Истец": None,
        "Ответчик": None,
        "Дата": None,
        "Номер договора": None,
        "Сумма оплаты": None,
        "Общая стоимость": None
    }

    # Map NER spans to slots: first person -> plaintiff, first org -> defendant.
    for span in doc.spans:
        # NOTE(review): normalize() fills span.normal, but the raw span.text is
        # used below — switch to span.normal if lemmatized output is desired.
        span.normalize(morph_vocab)
        if span.type == "PER" and not entities["Истец"]:
            entities["Истец"] = span.text
        elif span.type == "ORG" and not entities["Ответчик"]:
            entities["Ответчик"] = span.text
        # NOTE(review): NewsNERTagger emits PER/LOC/ORG only, so the DATE and
        # NUM branches are likely dead — confirm against the natasha version
        # in use, or add dedicated date/number extractors.
        elif span.type == "DATE":
            entities["Дата"] = span.text
        elif span.type == "NUM" and "договор" in text.lower():
            entities["Номер договора"] = span.text

    # Extract monetary amounts (cost / payment phrases followed by a number).
    price_pattern = r"(?:стоимость|оплата|цена|сумма)\s*(?:договор[ауе]|услуг[аи]?)?\s*:?[\s]*([\d\s]+[,.]?\d*)\s*(?:руб|₽|рублей|тыс|млн)?"
    matches = re.findall(price_pattern, text, re.IGNORECASE)
    if matches:
        # Normalize amounts: drop thousand-separator spaces, convert the
        # decimal comma to a dot, then strip any leftover non-numeric chars.
        # (Bug fix: previously the comma was deleted outright, so "1,5"
        # became "15".)
        prices = [
            re.sub(r"[^\d.]", "", m.replace(" ", "").replace(",", "."))
            for m in matches
        ]
        if len(prices) == 1:
            entities["Сумма оплаты"] = prices[0]
        elif len(prices) > 1:
            entities["Сумма оплаты"] = prices[0]    # first amount = payment
            entities["Общая стоимость"] = prices[-1]  # last amount = total cost
    return entities
if __name__ == "__main__":
    # CLI entry point: the first argument is the text to analyse;
    # the result is printed as UTF-8 JSON on stdout.
    result = extract_entities(sys.argv[1])
    print(json.dumps(result, ensure_ascii=False))