295 lines
9.8 KiB
Python
295 lines
9.8 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
API for semantic search over embeddings.
Integration with the web interface.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import json
import os
from typing import Dict, List, Optional
from urllib.parse import unquote

import psycopg2
import requests
from fastapi import FastAPI, HTTPException
from psycopg2.extras import RealDictCursor
from pydantic import BaseModel
|
|||
|
|
|
|||
|
|
# Application instance that the route decorators in this module register on.
app = FastAPI(
    version="1.0.0",
    title="Semantic Search API",
    description="API для семантического поиска по эмбеддингам отелей",
)
|
|||
|
|
|
|||
|
|
# Configuration
#
# Every value can be overridden through an environment variable; the literal
# fallbacks keep the module working unchanged when nothing is set.
#
# SECURITY: the fallback password and API key are secrets hard-coded in this
# file. TODO: rotate them and supply them only through the environment.
DB_CONFIG = {
    'host': os.environ.get("SEARCH_DB_HOST", "147.45.189.234"),
    'port': int(os.environ.get("SEARCH_DB_PORT", "5432")),
    'database': os.environ.get("SEARCH_DB_NAME", "default_db"),
    'user': os.environ.get("SEARCH_DB_USER", "gen_user"),
    # The fallback is stored URL-encoded and decoded here; a value taken from
    # the environment is used as-is (it must NOT be URL-encoded).
    'password': os.environ.get("SEARCH_DB_PASSWORD", unquote("2~~9_%5EkVsU%3F2%5CS")),
}

# Endpoint and API key of the embedding service called by generate_query_embedding().
BGE_API_URL = os.environ.get("BGE_API_URL", "http://147.45.146.17:8002/embed")
BGE_API_KEY = os.environ.get(
    "BGE_API_KEY",
    "22564b177aa73b6ac0b8642d7773350ff4c01d4983f028beff15ea247f09fa89",
)
|
|||
|
|
|
|||
|
|
# Request body accepted by POST /search.
class SearchRequest(BaseModel):
    # Free-text search query that gets embedded.
    query: str

    # Optional exact-match filters on the chunk metadata.
    region: Optional[str] = None
    hotel_id: Optional[str] = None

    # Maximum number of rows to return.
    limit: int = 10

    # NOTE(review): declared here, but the /search handler in this file does
    # not read it — confirm the intended semantics before relying on it.
    min_distance: float = 0.3
|
|||
|
|
|
|||
|
|
# One item of the POST /search response.
class SearchResult(BaseModel):
    # Values taken from the chunk's metadata.
    hotel_name: str
    region_name: str
    url: str

    # Leading part of the chunk text (the query truncates it to 300 characters).
    text: str

    # Vector distance between the query embedding and the chunk embedding.
    distance: float

    # Human-readable relevance label derived from the distance.
    relevance: str
|
|||
|
|
|
|||
|
|
def get_db_connection():
    """Open a new database connection using ``DB_CONFIG``.

    Cursors created from the connection use ``RealDictCursor``, so rows are
    addressed by column name. The caller is responsible for closing it.
    """
    return psycopg2.connect(cursor_factory=RealDictCursor, **DB_CONFIG)
|
|||
|
|
|
|||
|
|
def generate_query_embedding(query: str):
    """Request an embedding for a search query from the BGE embedding service.

    Args:
        query: Search text to embed.

    Returns:
        The first entry of the ``embeddings`` array in the service response.

    Raises:
        HTTPException: Status 500 when the request fails, the service answers
            with a non-200 status, or the response holds no usable embedding.
    """
    headers = {
        "X-API-Key": BGE_API_KEY,
        "Content-Type": "application/json",
    }

    try:
        response = requests.post(
            BGE_API_URL, json={"text": query}, headers=headers, timeout=30
        )
        if response.status_code != 200:
            raise HTTPException(
                status_code=500,
                detail=f"BGE API error: {response.status_code}",
            )
        embeddings = response.json().get('embeddings')
    except HTTPException:
        # Already carries the intended message; do not re-wrap it below.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Embedding generation error: {str(e)}",
        ) from e

    # Fail here with a clear message instead of passing an empty vector on to
    # the SQL query.
    if not embeddings or not embeddings[0]:
        raise HTTPException(
            status_code=500,
            detail="Embedding generation error: empty embedding in BGE API response",
        )
    return embeddings[0]
|
|||
|
|
|
|||
|
|
@app.get("/")
async def root():
    """Return basic information about the API and its endpoints."""
    endpoints = [
        "POST /search - Семантический поиск",
        "GET /regions - Список регионов",
        "GET /hotels - Список отелей",
        "GET /stats - Статистика",
    ]
    info = {
        "name": "Semantic Search API",
        "version": "1.0.0",
        "description": "API для семантического поиска по эмбеддингам отелей",
    }
    info["endpoints"] = endpoints
    return info
|
|||
|
|
|
|||
|
|
def _relevance_label(distance: float) -> str:
    """Map a vector distance to the relevance label returned to the client."""
    if distance < 0.9:
        return "🟢 Высокая"
    if distance < 1.0:
        return "🟡 Средняя"
    return "🔴 Низкая"


@app.post("/search", response_model=List[SearchResult])
async def semantic_search(request: SearchRequest):
    """Semantic search over the embeddings in ``hotel_website_chunks``.

    Embeds ``request.query``, then returns the ``request.limit`` nearest chunks
    ordered by the ``<->`` distance, optionally filtered by ``request.region``
    and ``request.hotel_id`` (exact match on the metadata values).

    Raises:
        HTTPException: Status 500 on embedding or database errors.
    """
    # NOTE(review): request.min_distance is accepted but never applied here —
    # confirm the intended semantics before wiring it into the query.
    # NOTE(review): the embedding request and the DB calls are synchronous
    # inside an ``async def`` handler.
    try:
        # Build the embedding for the query text.
        query_embedding = generate_query_embedding(request.query)
        embedding_str = json.dumps(query_embedding)

        # The WHERE clause is assembled from fixed fragments only; user values
        # are always passed as bound parameters.
        where_conditions = ["embedding IS NOT NULL"]
        filter_params = []

        if request.region:
            where_conditions.append("metadata->>'region_name' = %s")
            filter_params.append(request.region)

        if request.hotel_id:
            where_conditions.append("metadata->>'hotel_id' = %s")
            filter_params.append(request.hotel_id)

        where_clause = " AND ".join(where_conditions)

        query = f"""
            SELECT
                metadata->>'hotel_name' as hotel_name,
                metadata->>'region_name' as region_name,
                metadata->>'url' as url,
                LEFT(text, 300) as text,
                embedding <-> %s::vector as distance
            FROM hotel_website_chunks
            WHERE {where_clause}
            ORDER BY embedding <-> %s::vector
            LIMIT %s;
        """

        # Parameter order follows placeholder order: SELECT distance, WHERE
        # filters, ORDER BY distance, LIMIT.
        params = [embedding_str, *filter_params, embedding_str, request.limit]

        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(query, params)
                rows = cur.fetchall()
        finally:
            # Always release the connection, including when the query fails.
            conn.close()

        return [
            SearchResult(
                hotel_name=row['hotel_name'] or "Неизвестный отель",
                region_name=row['region_name'] or "Неизвестный регион",
                url=row['url'] or "",
                text=row['text'] or "",
                distance=float(row['distance']),
                relevance=_relevance_label(row['distance']),
            )
            for row in rows
        ]

    except HTTPException:
        # Keep the status and detail produced by generate_query_embedding().
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|||
|
|
|
|||
|
|
@app.get("/regions")
async def get_regions():
    """List regions with their hotel and chunk counts.

    Reads ``hotel_website_chunks``, skipping rows without a region name, and
    returns ``{"regions": [...]}`` ordered by chunk count, largest first.

    Raises:
        HTTPException: Status 500 on any database error.
    """
    try:
        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    SELECT
                        metadata->>'region_name' as region_name,
                        COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
                        COUNT(*) as chunks_count
                    FROM hotel_website_chunks
                    WHERE metadata->>'region_name' IS NOT NULL
                    GROUP BY metadata->>'region_name'
                    ORDER BY chunks_count DESC;
                """)
                rows = cur.fetchall()
        finally:
            # Always release the connection, including when the query fails.
            conn.close()

        regions = [
            {
                "region_name": row['region_name'],
                "hotels_count": row['hotels_count'],
                "chunks_count": row['chunks_count'],
            }
            for row in rows
        ]
        return {"regions": regions}

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|||
|
|
|
|||
|
|
@app.get("/hotels")
async def get_hotels(region: Optional[str] = None):
    """List hotels with their chunk counts.

    Args:
        region: When given (and non-empty), only hotels whose metadata
            ``region_name`` equals this value are returned.

    Returns:
        ``{"hotels": [...]}`` ordered by chunk count, largest first.

    Raises:
        HTTPException: Status 500 on any database error.
    """
    # One query for both cases; only the optional filter differs. The region
    # value is always passed as a bound parameter.
    where_clause = "WHERE metadata->>'region_name' = %s" if region else ""
    params = (region,) if region else None

    # GROUP BY already yields one row per (hotel_id, hotel_name, region_name),
    # so no DISTINCT is needed.
    query = f"""
        SELECT
            metadata->>'hotel_id' as hotel_id,
            metadata->>'hotel_name' as hotel_name,
            metadata->>'region_name' as region_name,
            COUNT(*) as chunks_count
        FROM hotel_website_chunks
        {where_clause}
        GROUP BY metadata->>'hotel_id', metadata->>'hotel_name', metadata->>'region_name'
        ORDER BY chunks_count DESC;
    """

    try:
        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(query, params)
                rows = cur.fetchall()
        finally:
            # Always release the connection, including when the query fails.
            conn.close()

        hotels = [
            {
                "hotel_id": row['hotel_id'],
                "hotel_name": row['hotel_name'],
                "region_name": row['region_name'],
                "chunks_count": row['chunks_count'],
            }
            for row in rows
        ]
        return {"hotels": hotels}

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|||
|
|
|
|||
|
|
@app.get("/stats")
async def get_stats():
    """Return overall and per-region statistics for ``hotel_website_chunks``.

    The response holds the total chunk, hotel and region counts, the average
    chunk text length (0 when there is none), and a per-region breakdown
    ordered by chunk count, largest first.

    Raises:
        HTTPException: Status 500 on any database error.
    """
    try:
        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                # Overall statistics
                cur.execute("""
                    SELECT
                        COUNT(*) as total_chunks,
                        COUNT(DISTINCT metadata->>'hotel_id') as total_hotels,
                        COUNT(DISTINCT metadata->>'region_name') as total_regions,
                        AVG(LENGTH(text)) as avg_chunk_length
                    FROM hotel_website_chunks;
                """)
                stats = cur.fetchone()

                # Per-region statistics
                cur.execute("""
                    SELECT
                        metadata->>'region_name' as region_name,
                        COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
                        COUNT(*) as chunks_count
                    FROM hotel_website_chunks
                    WHERE metadata->>'region_name' IS NOT NULL
                    GROUP BY metadata->>'region_name'
                    ORDER BY chunks_count DESC;
                """)
                region_rows = cur.fetchall()
        finally:
            # Always release the connection, including when a query fails.
            conn.close()

        regions_stats = [
            {
                "region_name": row['region_name'],
                "hotels_count": row['hotels_count'],
                "chunks_count": row['chunks_count'],
            }
            for row in region_rows
        ]

        avg_length = stats['avg_chunk_length']
        return {
            "total_chunks": stats['total_chunks'],
            "total_hotels": stats['total_hotels'],
            "total_regions": stats['total_regions'],
            # AVG() is NULL on an empty table; report 0 in that case.
            "avg_chunk_length": float(avg_length) if avg_length else 0,
            "regions": regions_stats,
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Serve the app with uvicorn when the module is run as a script.
    import uvicorn

    uvicorn.run(app, port=8001, host="0.0.0.0")
|