Files
hotels/semantic_search_api.py

295 lines
9.8 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
API для семантического поиска по эмбеддингам
Интеграция с веб-интерфейсом
"""
import json
import os
from typing import List, Dict, Optional
from urllib.parse import unquote

import psycopg2
import requests
from fastapi import FastAPI, HTTPException
from psycopg2.extras import RealDictCursor
from pydantic import BaseModel
# Application instance; the route decorators in this module register their
# handlers on it.
app = FastAPI(
    title="Semantic Search API",
    description="API для семантического поиска по эмбеддингам отелей",
    version="1.0.0",
)
# Configuration.
# Every setting can be overridden through a SEMANTIC_SEARCH_* environment
# variable so a deployment does not have to edit this file. The literals are
# fallbacks that keep the behaviour of an unconfigured process unchanged.
# NOTE(review): the fallback credentials are still committed to source —
# rotate them and drop the fallbacks once env-based deployment is confirmed.
DB_CONFIG = {
    'host': os.environ.get("SEMANTIC_SEARCH_DB_HOST", "147.45.189.234"),
    'port': int(os.environ.get("SEMANTIC_SEARCH_DB_PORT", "5432")),
    'database': os.environ.get("SEMANTIC_SEARCH_DB_NAME", "default_db"),
    'user': os.environ.get("SEMANTIC_SEARCH_DB_USER", "gen_user"),
    # The fallback is stored percent-encoded, hence unquote(); a password
    # supplied through the environment is used verbatim.
    'password': (
        os.environ.get("SEMANTIC_SEARCH_DB_PASSWORD")
        or unquote("2~~9_%5EkVsU%3F2%5CS")
    ),
}
BGE_API_URL = os.environ.get(
    "SEMANTIC_SEARCH_BGE_API_URL",
    "http://147.45.146.17:8002/embed",
)
BGE_API_KEY = os.environ.get(
    "SEMANTIC_SEARCH_BGE_API_KEY",
    "22564b177aa73b6ac0b8642d7773350ff4c01d4983f028beff15ea247f09fa89",
)
class SearchRequest(BaseModel):
    """Request body accepted by ``POST /search``."""

    # Free-text query; an embedding is generated from it before searching.
    query: str
    # Optional exact-match filter on metadata->>'region_name'.
    region: Optional[str] = None
    # Optional exact-match filter on metadata->>'hotel_id'.
    hotel_id: Optional[str] = None
    # Maximum number of rows to return (passed to SQL LIMIT).
    limit: int = 10
    # NOTE(review): no handler visible in this file reads this field —
    # confirm the intended semantics before relying on it.
    min_distance: float = 0.3
class SearchResult(BaseModel):
    """One matching text chunk returned by ``POST /search``."""

    # Hotel and region names taken from the chunk's metadata.
    hotel_name: str
    region_name: str
    # Value of metadata->>'url' for the chunk.
    url: str
    # Chunk text, truncated by the query to its first 300 characters.
    text: str
    # Vector distance between the query embedding and the chunk embedding.
    distance: float
    # Human-readable relevance label derived from ``distance``.
    relevance: str
def get_db_connection():
    """Open a new PostgreSQL connection using the settings in ``DB_CONFIG``.

    The connection is created with ``RealDictCursor`` as its cursor factory,
    so rows fetched through its cursors can be read by column name. A new
    connection is opened on every call; the caller is expected to close it.
    """
    connect_kwargs = dict(DB_CONFIG, cursor_factory=RealDictCursor)
    return psycopg2.connect(**connect_kwargs)
def generate_query_embedding(query: str):
    """Request an embedding for *query* from the BGE embedding service.

    Sends ``{"text": query}`` to ``BGE_API_URL`` (authenticated with
    ``BGE_API_KEY``, 30 s timeout) and returns the first entry of the
    ``embeddings`` array in the JSON response.

    Args:
        query: Search text to embed.

    Returns:
        The first element of the response's ``embeddings`` list.

    Raises:
        HTTPException: status 500 when the request fails, the service
            answers with a non-200 status, or the response contains no
            usable embedding.
    """
    headers = {
        "X-API-Key": BGE_API_KEY,
        "Content-Type": "application/json",
    }
    payload = {"text": query}
    try:
        response = requests.post(BGE_API_URL, json=payload, headers=headers, timeout=30)
        if response.status_code != 200:
            raise HTTPException(status_code=500, detail=f"BGE API error: {response.status_code}")
        result = response.json()
    except HTTPException:
        # Already the error we want to surface; without this clause the
        # generic handler below would wrap it a second time.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Embedding generation error: {str(e)}") from e

    embeddings = result.get('embeddings') if isinstance(result, dict) else None
    # Fail here with a clear message instead of handing an empty vector to
    # the caller, where it would only surface as an obscure downstream error.
    if not isinstance(embeddings, list) or not embeddings or not embeddings[0]:
        raise HTTPException(
            status_code=500,
            detail="Embedding generation error: BGE API returned no embedding",
        )
    return embeddings[0]
@app.get("/")
async def root():
    """Return a static description of the API and its endpoints."""
    endpoint_summaries = [
        "POST /search - Семантический поиск",
        "GET /regions - Список регионов",
        "GET /hotels - Список отелей",
        "GET /stats - Статистика",
    ]
    api_info = {
        "name": "Semantic Search API",
        "version": "1.0.0",
        "description": "API для семантического поиска по эмбеддингам отелей",
    }
    api_info["endpoints"] = endpoint_summaries
    return api_info
def _relevance_label(distance: float) -> str:
    """Map a vector distance to the relevance label shown to the user."""
    if distance < 0.9:
        return "🟢 Высокая"
    if distance < 1.0:
        return "🟡 Средняя"
    return "🔴 Низкая"


@app.post("/search", response_model=List[SearchResult])
async def semantic_search(request: SearchRequest):
    """Run a semantic search over ``hotel_website_chunks``.

    The query text is embedded, then chunks are ordered by the distance
    (``<->``) between their stored embedding and the query embedding.
    ``request.region`` and ``request.hotel_id``, when set, add exact-match
    filters on the chunk metadata.

    NOTE(review): ``request.min_distance`` is accepted but not applied here;
    its intended semantics are not evident from this file — TODO confirm.

    Args:
        request: Search parameters (query text, optional filters, limit).

    Returns:
        Up to ``request.limit`` ``SearchResult`` items, closest first.

    Raises:
        HTTPException: status 500 on embedding or database failure.
    """
    try:
        # Embed the query and serialise it in the textual form cast to
        # ``vector`` inside the SQL statement.
        query_embedding = generate_query_embedding(request.query)
        embedding_str = json.dumps(query_embedding)

        # Build the WHERE clause. Only fixed SQL fragments are interpolated
        # into the statement; every user-supplied value goes through a
        # placeholder.
        where_conditions = ["embedding IS NOT NULL"]
        filter_params = []
        if request.region:
            where_conditions.append("metadata->>'region_name' = %s")
            filter_params.append(request.region)
        if request.hotel_id:
            where_conditions.append("metadata->>'hotel_id' = %s")
            filter_params.append(request.hotel_id)
        where_clause = " AND ".join(where_conditions)

        query = f"""
            SELECT
                metadata->>'hotel_name' as hotel_name,
                metadata->>'region_name' as region_name,
                metadata->>'url' as url,
                LEFT(text, 300) as text,
                embedding <-> %s::vector as distance
            FROM hotel_website_chunks
            WHERE {where_clause}
            ORDER BY embedding <-> %s::vector
            LIMIT %s;
        """
        # Placeholder order: SELECT distance, WHERE filters, ORDER BY, LIMIT.
        params = [embedding_str, *filter_params, embedding_str, request.limit]

        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(query, params)
                rows = cur.fetchall()
        finally:
            # Always release the connection, including when the query fails.
            conn.close()

        return [
            SearchResult(
                hotel_name=row['hotel_name'] or "Неизвестный отель",
                region_name=row['region_name'] or "Неизвестный регион",
                url=row['url'] or "",
                text=row['text'] or "",
                distance=float(row['distance']),
                relevance=_relevance_label(row['distance']),
            )
            for row in rows
        ]
    except HTTPException:
        # Errors raised by the embedding step already carry status and
        # detail; pass them through instead of re-wrapping.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/regions")
async def get_regions():
    """List regions with their distinct-hotel and chunk counts.

    Rows whose metadata has no ``region_name`` are excluded. Regions are
    ordered by chunk count, largest first.

    Returns:
        ``{"regions": [{"region_name", "hotels_count", "chunks_count"}, ...]}``

    Raises:
        HTTPException: status 500 on database failure.
    """
    try:
        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    SELECT
                        metadata->>'region_name' as region_name,
                        COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
                        COUNT(*) as chunks_count
                    FROM hotel_website_chunks
                    WHERE metadata->>'region_name' IS NOT NULL
                    GROUP BY metadata->>'region_name'
                    ORDER BY chunks_count DESC;
                """)
                rows = cur.fetchall()
        finally:
            # Always release the connection, including when the query fails.
            conn.close()

        regions = [
            {
                "region_name": row['region_name'],
                "hotels_count": row['hotels_count'],
                "chunks_count": row['chunks_count'],
            }
            for row in rows
        ]
        return {"regions": regions}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/hotels")
async def get_hotels(region: Optional[str] = None):
    """List hotels with their chunk counts, optionally limited to one region.

    Args:
        region: When non-empty, only hotels whose metadata ``region_name``
            equals this value are returned; otherwise all hotels are listed.

    Returns:
        ``{"hotels": [{"hotel_id", "hotel_name", "region_name",
        "chunks_count"}, ...]}`` ordered by chunk count, largest first.

    Raises:
        HTTPException: status 500 on database failure.
    """
    try:
        # One statement serves both cases; only this fixed fragment is
        # interpolated, the region value itself goes through a placeholder.
        if region:
            where_clause = "WHERE metadata->>'region_name' = %s"
            params = (region,)
        else:
            where_clause = ""
            params = None
        query = f"""
            SELECT DISTINCT
                metadata->>'hotel_id' as hotel_id,
                metadata->>'hotel_name' as hotel_name,
                metadata->>'region_name' as region_name,
                COUNT(*) as chunks_count
            FROM hotel_website_chunks
            {where_clause}
            GROUP BY metadata->>'hotel_id', metadata->>'hotel_name', metadata->>'region_name'
            ORDER BY chunks_count DESC;
        """

        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(query, params)
                rows = cur.fetchall()
        finally:
            # Always release the connection, including when the query fails.
            conn.close()

        hotels = [
            {
                "hotel_id": row['hotel_id'],
                "hotel_name": row['hotel_name'],
                "region_name": row['region_name'],
                "chunks_count": row['chunks_count'],
            }
            for row in rows
        ]
        return {"hotels": hotels}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/stats")
async def get_stats():
    """Return overall and per-region statistics for ``hotel_website_chunks``.

    Returns:
        A dict with ``total_chunks``, ``total_hotels``, ``total_regions``,
        ``avg_chunk_length`` (0 when the average is NULL or zero) and
        ``regions``, a per-region list of hotel and chunk counts ordered by
        chunk count, largest first.

    Raises:
        HTTPException: status 500 on database failure.
    """
    try:
        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                # Overall totals.
                cur.execute("""
                    SELECT
                        COUNT(*) as total_chunks,
                        COUNT(DISTINCT metadata->>'hotel_id') as total_hotels,
                        COUNT(DISTINCT metadata->>'region_name') as total_regions,
                        AVG(LENGTH(text)) as avg_chunk_length
                    FROM hotel_website_chunks;
                """)
                stats = cur.fetchone()

                # Per-region breakdown.
                cur.execute("""
                    SELECT
                        metadata->>'region_name' as region_name,
                        COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
                        COUNT(*) as chunks_count
                    FROM hotel_website_chunks
                    WHERE metadata->>'region_name' IS NOT NULL
                    GROUP BY metadata->>'region_name'
                    ORDER BY chunks_count DESC;
                """)
                region_rows = cur.fetchall()
        finally:
            # Always release the connection, including when a query fails.
            conn.close()

        regions_stats = [
            {
                "region_name": row['region_name'],
                "hotels_count": row['hotels_count'],
                "chunks_count": row['chunks_count'],
            }
            for row in region_rows
        ]
        avg_length = stats['avg_chunk_length']
        return {
            "total_chunks": stats['total_chunks'],
            "total_hotels": stats['total_hotels'],
            "total_regions": stats['total_regions'],
            # AVG() yields NULL on an empty table; report 0 in that case.
            "avg_chunk_length": float(avg_length) if avg_length else 0,
            "regions": regions_stats,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
if __name__ == "__main__":
    # When executed as a script, serve the app with uvicorn on all
    # interfaces, port 8001. The import is local so that merely importing
    # this module does not pull uvicorn in.
    import uvicorn

    uvicorn.run(app, port=8001, host="0.0.0.0")