2025-10-16 10:52:09 +03:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
"""
|
|
|
|
|
|
FastAPI service for Natasha NER (Named Entity Recognition).
Extracts organizations, addresses, and names from text.
Intended for use from n8n via HTTP Request.
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import os
import re
import secrets
from typing import List, Optional

import uvicorn
from fastapi import FastAPI, HTTPException, Header, Depends
from fastapi.security import APIKeyHeader
from pydantic import BaseModel

# Natasha components used for NER
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    Doc
)
|
|
|
|
|
|
|
|
|
|
|
|
# ASGI application object; the route decorators below register handlers on it.
# The version matches the "version" value reported by the root ("/") endpoint,
# so the OpenAPI metadata and the service's self-description agree.
app = FastAPI(
    title="Natasha NER API",
    description="Извлечение сущностей из русского текста",
    version="1.1.0",
)
|
|
|
|
|
|
|
|
|
|
|
|
# 🔐 API key that protects the extraction endpoints.
# The key can be supplied through the NATASHA_API_KEY environment variable; the
# literal is kept only as a backward-compatible fallback (it is also used when
# the variable is set to an empty string, so the key can never become empty).
# NOTE(review): a secret committed to source control should be rotated, and the
# fallback removed once every deployment sets the environment variable.
API_KEY = os.environ.get("NATASHA_API_KEY") or "CH2BAYBYGYDDSWpaEd_CvJrH04DoVSGtZi_mah2nXbw"

# auto_error=False: a request without the header is not rejected here; the
# missing value is handled (as None) by verify_api_key.
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
|
|
|
|
|
|
|
|
|
|
|
|
def verify_api_key(api_key: Optional[str] = Depends(api_key_header)) -> bool:
    """Check the API key sent with the request.

    Args:
        api_key: Value resolved by the ``api_key_header`` dependency
            (the ``X-API-Key`` header); ``None`` is treated as "no key sent".

    Returns:
        ``True`` when the supplied key matches ``API_KEY``.

    Raises:
        HTTPException: 401 when the key is missing or does not match.
    """
    # Constant-time comparison, so response timing does not reveal how much of
    # a guessed key is correct. Both sides are encoded to bytes because
    # compare_digest raises TypeError for str values with non-ASCII characters,
    # and the header value is caller-controlled.
    key_matches = api_key is not None and secrets.compare_digest(
        api_key.encode("utf-8"), API_KEY.encode("utf-8")
    )
    if not key_matches:
        raise HTTPException(
            status_code=401,
            detail="Неверный или отсутствующий API ключ. Используйте заголовок X-API-Key"
        )
    return True
|
|
|
|
|
|
|
|
|
|
|
|
# Natasha components are created once at import time and reused by the
# request handlers below.
print("🔧 Инициализация Natasha...")
# Only report that protection is enabled; no part of the key is written to the
# log, since startup output is commonly collected and shared.
print("🔐 API защищён ключом (значение ключа в лог не выводится)")
segmenter = Segmenter()
morph_vocab = MorphVocab()
# A single embedding instance is passed to all three taggers.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
print("✅ Natasha готова!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NERRequest(BaseModel):
    # Request body shared by the /extract and /extract_simple endpoints.

    # Raw text to analyse.
    text: str

    # Text length limit, for performance: the handlers slice `text` to this
    # many characters before running the pipeline.
    # NOTE(review): the value is client-supplied and not range-checked; a
    # negative value makes that slice drop characters from the end instead of
    # capping the length.
    max_length: int = 5000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Entity(BaseModel):
    # One recognised entity, as returned in NERResponse.entities.

    # Entity label: ORG, PER, LOC
    type: str

    # Entity text, copied from the NER span.
    text: str

    # Span boundaries, copied from the span's `start` and `stop` attributes.
    start: int
    end: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NERResponse(BaseModel):
    # Response body of the /extract endpoint.

    # ORG - unique organization names
    organizations: List[str]

    # PER - unique person names
    persons: List[str]

    # LOC - unique locations / addresses
    locations: List[str]

    # Every recognised entity with its position (duplicates included)
    entities: List[Entity]

    # Number of items in `entities`
    total_entities: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/")
async def root():
    """Return a static description of the service and its endpoints."""
    # Human-readable summary of each route, keyed by path.
    endpoint_help = {
        "/extract": "POST - извлечь сущности из текста (требует API ключ)",
        "/extract_simple": "POST - упрощённое извлечение (требует API ключ)",
        "/health": "GET - проверка здоровья сервиса (без ключа)"
    }
    service_info = {
        "service": "Natasha NER API",
        "version": "1.1.0",
        "description": "Извлечение сущностей из русского текста",
        "security": "Требуется API ключ в заголовке X-API-Key",
    }
    service_info["endpoints"] = endpoint_help
    return service_info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/health")
async def health():
    """Health check: return a fixed status payload (no API key dependency)."""
    return dict(status="healthy", natasha="ready")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.post("/extract", response_model=NERResponse)
async def extract_entities(request: NERRequest, authenticated: bool = Depends(verify_api_key)):
    """Extract entities from text (requires an API key).

    Args:
        request: Text to analyse and the length limit applied to it.
        authenticated: Result of the ``verify_api_key`` dependency; not read
            here, it is declared so the key check runs before the handler.

    Returns:
        NERResponse with:
        - organizations: unique organization names, in order of first appearance
        - persons: unique person names, in order of first appearance
        - locations: unique locations/addresses, in order of first appearance
        - entities: every entity with its position
        - total_entities: number of items in ``entities``

    Raises:
        HTTPException: 500 when any step of the processing fails.
    """
    try:
        # Limit the text length for performance.
        text = request.text[:request.max_length]

        # Run the Natasha pipeline.
        # NOTE(review): this handler only reads doc.spans; the morph/syntax
        # results are not used here - confirm whether NER needs them before
        # dropping those two steps.
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        doc.parse_syntax(syntax_parser)
        doc.tag_ner(ner_tagger)

        # Span texts grouped by entity label; spans with any other label are
        # still reported in `entities` but not in the per-type lists.
        texts_by_type = {'ORG': [], 'PER': [], 'LOC': []}
        entities = []

        for span in doc.spans:
            entities.append(
                Entity(
                    type=span.type,
                    text=span.text,
                    start=span.start,
                    end=span.stop
                )
            )
            bucket = texts_by_type.get(span.type)
            if bucket is not None:
                bucket.append(span.text)

        # dict.fromkeys de-duplicates while keeping first-occurrence order, so
        # the response is deterministic (set ordering of strings is not stable
        # between processes).
        return NERResponse(
            organizations=list(dict.fromkeys(texts_by_type['ORG'])),
            persons=list(dict.fromkeys(texts_by_type['PER'])),
            locations=list(dict.fromkeys(texts_by_type['LOC'])),
            entities=entities,
            total_entities=len(entities)
        )

    except Exception as e:
        # Boundary handler: report the failure as HTTP 500 and keep the
        # original exception chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=f"Ошибка NER: {str(e)}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Legal-form abbreviations (lower-case). An ORG span is kept only when one of
# its words is in this set.
_ORG_KEYWORDS = frozenset([
    'ип', 'ооо', 'оао', 'зао', 'ао', 'пао', 'нао', 'ндо',
    'гуп', 'муп', 'фгуп', 'гбу', 'мбу',
])
# Splits an entity into words so abbreviations are matched as whole words
# rather than as substrings of longer words.
_WORD_RE = re.compile(r'\w+')
# ORG spans matching any of these look like addresses / phone numbers.
_ORG_REJECT_PATTERNS = (
    re.compile(r'^\d+'),  # starts with digits (addresses)
    re.compile(r'\+\d'),  # contains a phone number
)
# Span starts with a 1-2 letter Cyrillic word (short words, e.g. prepositions).
_SHORT_LEADING_WORD_RE = re.compile(r'^[А-Яа-я]{1,2}\s')
# Junk words filtered out of the LOC results.
_IGNORED_LOC_WORDS = frozenset(['нужен', 'нужна', 'нужно', 'требуется'])


def _is_probable_org(entity_text):
    """Return True when an ORG span looks like a real legal entity."""
    words = _WORD_RE.findall(entity_text.lower())

    # Check 1: one of the words is a legal-form abbreviation.
    if not any(word in _ORG_KEYWORDS for word in words):
        return False

    # Check 2: the span does not look like an address or a phone number.
    if any(pattern.search(entity_text) for pattern in _ORG_REJECT_PATTERNS):
        return False

    # Check 3: a short leading word means the span starts with a preposition,
    # unless that word is itself a legal form ("ИП ...", "АО ..."), which must
    # not be thrown away.
    if _SHORT_LEADING_WORD_RE.search(entity_text) and words[0] not in _ORG_KEYWORDS:
        return False

    return True


@app.post("/extract_simple")
async def extract_simple(request: NERRequest, authenticated: bool = Depends(verify_api_key)):
    """Simplified extraction - entity lists only (requires an API key).

    Convenient for use in n8n. Applies filtering of false positives:
    organizations must contain a legal-form abbreviation and must not look like
    an address, phone number or preposition-led phrase; junk "locations" are
    dropped.

    Args:
        request: Text to analyse and the length limit applied to it.
        authenticated: Result of the ``verify_api_key`` dependency; not read
            here, it is declared so the key check runs before the handler.

    Returns:
        dict with the unique ``organizations``, ``persons`` and ``locations``
        (each in order of first appearance), the ``has_*`` flags for each list,
        and ``total`` - the combined length of the three lists.

    Raises:
        HTTPException: 500 when any step of the processing fails.
    """
    try:
        # Limit the text length for performance.
        text = request.text[:request.max_length]

        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        doc.parse_syntax(syntax_parser)
        doc.tag_ner(ner_tagger)

        organizations = []
        persons = []
        locations = []

        for span in doc.spans:
            entity_text = span.text.strip()

            if span.type == 'ORG':
                if _is_probable_org(entity_text):
                    organizations.append(entity_text)

            elif span.type == 'PER':
                persons.append(entity_text)

            elif span.type == 'LOC':
                # Filter out junk "locations".
                if entity_text.lower() not in _IGNORED_LOC_WORDS and len(entity_text) > 2:
                    locations.append(entity_text)

        # Unique values, in order of first appearance (deterministic, unlike
        # going through a set).
        organizations = list(dict.fromkeys(organizations))
        persons = list(dict.fromkeys(persons))
        locations = list(dict.fromkeys(locations))

        return {
            "organizations": organizations,
            "persons": persons,
            "locations": locations,
            "has_organizations": len(organizations) > 0,
            "has_persons": len(persons) > 0,
            "has_locations": len(locations) > 0,
            "total": len(organizations) + len(persons) + len(locations)
        }

    except Exception as e:
        # Boundary handler: report the failure as HTTP 500 and keep the
        # original exception chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=f"Ошибка NER: {str(e)}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Direct-run entry point: serve the app with uvicorn on all interfaces,
    # port 8004.
    print("🚀 Запуск Natasha NER API на порту 8004...")
    uvicorn.run(app=app, host="0.0.0.0", port=8004)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-10-27 22:49:42 +03:00
|
|
|
|
|