63 lines
2.6 KiB
Python
63 lines
2.6 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
import sys
|
|||
|
|
import json
|
|||
|
|
from deeppavlov import build_model, configs
|
|||
|
|
|
|||
|
|
def group_entities(tokens, labels):
    """Merge BIO-tagged tokens into whole entities.

    ``tokens`` and ``labels`` are walked in lockstep; if their lengths
    differ, the surplus items of the longer one are ignored (``zip``
    semantics). Tokens that belong to one entity are joined with single
    spaces.

    A ``B-<type>`` tag always opens a new entity. An ``I-<type>`` tag extends
    the open entity when the type matches; otherwise (no open entity, or a
    different type) it opens a new entity instead of being discarded, so a
    tag sequence that lacks its leading ``B-`` does not lose tokens. Any other
    label (for example ``"O"``) closes the open entity.

    Args:
        tokens: Sequence of token strings.
        labels: Sequence of tag strings, e.g. ``"B-PER"``, ``"I-ORG"``, ``"O"``.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples in order of
        appearance. ``entity_type`` is everything after the first hyphen of
        the tag, e.g. ``"PER"`` or ``"ORG"``.
    """
    entities = []
    current_tokens = []
    current_type = None

    for token, label in zip(tokens, labels):
        # Split only on the first hyphen so the full type name is preserved.
        prefix, separator, entity_type = label.partition("-")
        is_entity_tag = bool(separator) and prefix in ("B", "I")

        if (
            is_entity_tag
            and prefix == "I"
            and current_tokens
            and entity_type == current_type
        ):
            current_tokens.append(token)
            continue

        # Anything else ends the entity that was being collected.
        if current_tokens:
            entities.append((" ".join(current_tokens), current_type))

        if is_entity_tag:
            # "B-" tag, or an "I-" tag with nothing matching to continue.
            current_tokens = [token]
            current_type = entity_type
        else:
            current_tokens = []
            current_type = None

    if current_tokens:
        entities.append((" ".join(current_tokens), current_type))
    return entities
|
|||
|
|
|
|||
|
|
_NER_MODEL = None


def _get_ner_model():
    """Return the DeepPavlov NER model, building it only on first use."""
    global _NER_MODEL
    if _NER_MODEL is None:
        _NER_MODEL = build_model(configs.ner.ner_rus_bert, download=True)
    return _NER_MODEL


def extract_entities_dp(text):
    """Extract the plaintiff, defendant and dispute summary from ``text``.

    Runs the DeepPavlov ``ner_rus_bert`` model on the text, groups the tagged
    tokens with ``group_entities`` and picks the first ``PER`` entity as the
    plaintiff and the first ``ORG`` entity as the defendant. The dispute
    summary is the first 100 characters of the text.

    Args:
        text: Input text. Empty or ``None`` yields placeholders for every
            field without invoking the model.

    Returns:
        A dict with the keys ``"истец"`` (plaintiff), ``"ответчик"``
        (defendant) and ``"суть_спора"`` (dispute summary). A field that
        cannot be determined holds ``"Не определено"``.
    """
    undefined = "Не определено"
    if not text:
        # Nothing to analyse: skip building/running the model altogether.
        return {
            "истец": undefined,
            "ответчик": undefined,
            "суть_спора": undefined,
        }

    # The model is built once and reused across calls.
    result = _get_ner_model()([text])

    # Expected result format:
    # [
    #   [ [ "Иванов", "Иван", "Иванович", "заключил", "договор", "с", "ООО", "Рога", "и", "Копыта" ] ],
    #   [ [ "B-PER", "I-PER", "I-PER", "O", "O", "O", "B-ORG", "I-ORG", "I-ORG", "I-ORG" ] ]
    # ]
    tokens, labels = [], []
    if result and len(result) > 1 and result[0] and result[1]:
        tokens, labels = result[0][0], result[1][0]
    grouped = group_entities(tokens, labels)

    # The first PER entity is the plaintiff, the first ORG entity the defendant.
    plaintiff = next(
        (entity for entity, kind in grouped if kind == "PER"), undefined
    )
    defendant = next(
        (entity for entity, kind in grouped if kind == "ORG"), undefined
    )

    return {
        "истец": plaintiff,
        "ответчик": defendant,
        "суть_спора": text[:100],
    }
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
    # Command-line entry point: all arguments together form the input text.
    cli_words = sys.argv[1:]

    if not cli_words:
        # No text supplied: emit a placeholder record and exit successfully.
        placeholder_record = dict.fromkeys(
            ("истец", "ответчик", "суть_спора"), "Не определено"
        )
        print(json.dumps(placeholder_record, ensure_ascii=False))
        sys.exit(0)

    extracted = extract_entities_dp(" ".join(cli_words))
    # Pretty-print the result, keeping Cyrillic characters unescaped.
    print(json.dumps(extracted, ensure_ascii=False, indent=2))
|