Files
hotels/semantic_search_api.py
Фёдор 0cf3297290 Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py
- Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py
- РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py
- Отчёты: create_orel_horizontal_report.py
- Обработка: process_all_hotels_embeddings.py
- Документация: README.md, DB_SCHEMA_REFERENCE.md
2025-10-16 10:52:09 +03:00

295 lines
9.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
API for semantic search over hotel embeddings.
Integrates with the web interface.
"""
import json
import os
from typing import List, Dict, Optional
from urllib.parse import unquote

import psycopg2
import requests
from fastapi import FastAPI, HTTPException
from psycopg2.extras import RealDictCursor
from pydantic import BaseModel
# ASGI application object; the route decorators in this module register
# their handlers on it. The keyword arguments are the API's title,
# description and version metadata.
app = FastAPI(
    version="1.0.0",
    title="Semantic Search API",
    description="API для семантического поиска по эмбеддингам отелей",
)
# Configuration.
#
# Every setting can be overridden through an environment variable so that
# credentials do not have to live in the source tree. The literals are kept
# only as fallbacks, so an environment without these variables behaves
# exactly as before.
# NOTE(review): the fallback password and API key are secrets committed to
# the repository - rotate them and remove the defaults once the environment
# variables are provisioned.
DB_CONFIG = {
    'host': os.environ.get("HOTELS_DB_HOST", "147.45.189.234"),
    # Environment values are strings; the connection parameter is an int.
    'port': int(os.environ.get("HOTELS_DB_PORT", "5432")),
    'database': os.environ.get("HOTELS_DB_NAME", "default_db"),
    'user': os.environ.get("HOTELS_DB_USER", "gen_user"),
    # The fallback is stored percent-encoded and decoded here; a value
    # supplied through the environment is used verbatim (no decoding).
    'password': os.environ.get(
        "HOTELS_DB_PASSWORD", unquote("2~~9_%5EkVsU%3F2%5CS")
    ),
}

# Endpoint and API key of the embedding service used for search queries.
BGE_API_URL = os.environ.get("BGE_API_URL", "http://147.45.146.17:8002/embed")
BGE_API_KEY = os.environ.get(
    "BGE_API_KEY",
    "22564b177aa73b6ac0b8642d7773350ff4c01d4983f028beff15ea247f09fa89",
)
class SearchRequest(BaseModel):
    """Request body accepted by ``POST /search``."""

    # Free-text query; the handler converts it to an embedding before searching.
    query: str
    # Optional exact-match filter on the chunk metadata key 'region_name'.
    region: Optional[str] = None
    # Optional exact-match filter on the chunk metadata key 'hotel_id'.
    hotel_id: Optional[str] = None
    # Maximum number of rows to return (passed to the SQL LIMIT clause).
    limit: int = 10
    # NOTE(review): the /search handler as written never reads this field -
    # confirm whether a distance cut-off was intended.
    min_distance: float = 0.3
class SearchResult(BaseModel):
    """One matching text chunk returned by ``POST /search``."""

    # Values taken from the chunk's metadata; the handler substitutes
    # placeholder strings when a key is missing.
    hotel_name: str
    region_name: str
    url: str
    # Chunk text, truncated by the query to its first 300 characters.
    text: str
    # Vector distance between the query embedding and the chunk embedding
    # (smaller means closer).
    distance: float
    # Human-readable relevance label derived from ``distance``.
    relevance: str
def get_db_connection():
    """Open a new PostgreSQL connection using the settings in ``DB_CONFIG``.

    Cursors created from the returned connection use ``RealDictCursor``, so
    rows are addressed by column name. Closing the connection is the
    caller's responsibility.
    """
    connection = psycopg2.connect(cursor_factory=RealDictCursor, **DB_CONFIG)
    return connection
def generate_query_embedding(query: str):
    """Request an embedding for *query* from the BGE embedding service.

    Args:
        query: Search text to embed.

    Returns:
        The first element of the ``embeddings`` field of the service's JSON
        response (presumably a list of floats - the values themselves are
        not validated here).

    Raises:
        HTTPException: Always with status 500 - when the request fails, the
            service answers with a non-200 status, the response body cannot
            be interpreted, or the returned embedding is empty.
    """
    headers = {
        "X-API-Key": BGE_API_KEY,
        "Content-Type": "application/json",
    }
    try:
        response = requests.post(
            BGE_API_URL, json={"text": query}, headers=headers, timeout=30
        )
        if response.status_code != 200:
            raise HTTPException(
                status_code=500,
                detail=f"BGE API error: {response.status_code}",
            )

        embedding = response.json().get('embeddings', [[]])[0]
        if not embedding:
            # An empty vector cannot be searched with; report it here instead
            # of letting it surface later as an opaque database error.
            raise HTTPException(
                status_code=500,
                detail="Embedding generation error: empty embedding returned",
            )
        return embedding
    except HTTPException:
        # Already a well-formed API error: re-raise it untouched so the
        # generic handler below does not wrap it a second time and mangle
        # the detail message.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Embedding generation error: {str(e)}",
        ) from e
@app.get("/")
async def root():
    """Describe the service: its name, version and available endpoints."""
    endpoint_summaries = [
        "POST /search - Семантический поиск",
        "GET /regions - Список регионов",
        "GET /hotels - Список отелей",
        "GET /stats - Статистика",
    ]
    info = {
        "name": "Semantic Search API",
        "version": "1.0.0",
        "description": "API для семантического поиска по эмбеддингам отелей",
        "endpoints": endpoint_summaries,
    }
    return info
def _relevance_label(distance: float) -> str:
    """Map a vector distance to the relevance label returned to the client."""
    if distance < 0.9:
        return "🟢 Высокая"
    if distance < 1.0:
        return "🟡 Средняя"
    return "🔴 Низкая"


@app.post("/search", response_model=List[SearchResult])
async def semantic_search(request: SearchRequest):
    """Return the chunks whose embeddings are closest to the query.

    The query text is embedded via ``generate_query_embedding`` and compared
    against ``hotel_website_chunks.embedding`` with the ``<->`` distance
    operator; rows are ordered by ascending distance and capped at
    ``request.limit``. ``request.region`` and ``request.hotel_id``, when
    set, add exact-match filters on the chunk metadata.

    Args:
        request: Search parameters from the request body.

    Returns:
        A list of ``SearchResult`` objects, nearest first.

    Raises:
        HTTPException: Status 500 when embedding generation or the database
            query fails.

    NOTE(review): this coroutine performs blocking HTTP and database calls;
    confirm that blocking inside ``async def`` is acceptable for the
    expected load.
    """
    try:
        # Embed the query and serialise it as a JSON array literal, which is
        # then cast to ``vector`` inside the SQL statement.
        query_embedding = generate_query_embedding(request.query)
        embedding_str = json.dumps(query_embedding)

        # Only fixed SQL fragments are interpolated into the statement; all
        # user-supplied values travel as bound parameters.
        where_conditions = ["embedding IS NOT NULL"]
        filter_params = []
        if request.region:
            where_conditions.append("metadata->>'region_name' = %s")
            filter_params.append(request.region)
        if request.hotel_id:
            where_conditions.append("metadata->>'hotel_id' = %s")
            filter_params.append(request.hotel_id)
        where_clause = " AND ".join(where_conditions)

        query = f"""
            SELECT
                metadata->>'hotel_name' as hotel_name,
                metadata->>'region_name' as region_name,
                metadata->>'url' as url,
                LEFT(text, 300) as text,
                embedding <-> %s::vector as distance
            FROM hotel_website_chunks
            WHERE {where_clause}
            ORDER BY embedding <-> %s::vector
            LIMIT %s;
        """
        # Placeholder order: SELECT distance, WHERE filters, ORDER BY, LIMIT.
        params = [embedding_str, *filter_params, embedding_str, request.limit]

        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(query, params)
                rows = cur.fetchall()
        finally:
            # Close the connection even when the query fails; previously an
            # exception here leaked both the cursor and the connection.
            conn.close()

        return [
            SearchResult(
                hotel_name=row['hotel_name'] or "Неизвестный отель",
                region_name=row['region_name'] or "Неизвестный регион",
                url=row['url'] or "",
                text=row['text'] or "",
                distance=float(row['distance']),
                relevance=_relevance_label(row['distance']),
            )
            for row in rows
        ]
    except HTTPException:
        # Keep errors that are already HTTP-shaped (e.g. from the embedding
        # step) intact instead of re-wrapping their message.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/regions")
async def get_regions():
    """List regions present in ``hotel_website_chunks`` with their counts.

    Returns:
        ``{"regions": [...]}`` where each item holds ``region_name``,
        ``hotels_count`` (distinct hotel ids) and ``chunks_count``, ordered
        by ``chunks_count`` descending. Rows without a region name are
        excluded.

    Raises:
        HTTPException: Status 500 when the database query fails.
    """
    try:
        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    SELECT
                        metadata->>'region_name' as region_name,
                        COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
                        COUNT(*) as chunks_count
                    FROM hotel_website_chunks
                    WHERE metadata->>'region_name' IS NOT NULL
                    GROUP BY metadata->>'region_name'
                    ORDER BY chunks_count DESC;
                """)
                rows = cur.fetchall()
        finally:
            # Always release the connection, including when the query raises.
            conn.close()

        regions = [
            {
                "region_name": row['region_name'],
                "hotels_count": row['hotels_count'],
                "chunks_count": row['chunks_count'],
            }
            for row in rows
        ]
        return {"regions": regions}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/hotels")
async def get_hotels(region: Optional[str] = None):
    """List hotels present in ``hotel_website_chunks`` with their chunk counts.

    Args:
        region: When given (and non-empty), only hotels whose metadata
            ``region_name`` equals this value are returned.

    Returns:
        ``{"hotels": [...]}`` where each item holds ``hotel_id``,
        ``hotel_name``, ``region_name`` and ``chunks_count``, ordered by
        ``chunks_count`` descending.

    Raises:
        HTTPException: Status 500 when the database query fails.
    """
    # The filtered and unfiltered statements differ only in the WHERE clause,
    # so a single statement is built. Only this fixed fragment is
    # interpolated; the region value itself is passed as a bound parameter.
    if region:
        where_sql = "WHERE metadata->>'region_name' = %s"
        params = (region,)
    else:
        where_sql = ""
        params = None

    query = f"""
        SELECT DISTINCT
            metadata->>'hotel_id' as hotel_id,
            metadata->>'hotel_name' as hotel_name,
            metadata->>'region_name' as region_name,
            COUNT(*) as chunks_count
        FROM hotel_website_chunks
        {where_sql}
        GROUP BY metadata->>'hotel_id', metadata->>'hotel_name', metadata->>'region_name'
        ORDER BY chunks_count DESC;
    """

    try:
        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(query, params)
                rows = cur.fetchall()
        finally:
            # Always release the connection, including when the query raises.
            conn.close()

        hotels = [
            {
                "hotel_id": row['hotel_id'],
                "hotel_name": row['hotel_name'],
                "region_name": row['region_name'],
                "chunks_count": row['chunks_count'],
            }
            for row in rows
        ]
        return {"hotels": hotels}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/stats")
async def get_stats():
    """Return overall and per-region statistics for ``hotel_website_chunks``.

    Returns:
        A dict with ``total_chunks``, ``total_hotels``, ``total_regions``,
        ``avg_chunk_length`` (0 when no average is available) and
        ``regions`` - a list of per-region ``region_name`` /
        ``hotels_count`` / ``chunks_count`` entries ordered by
        ``chunks_count`` descending.

    Raises:
        HTTPException: Status 500 when a database query fails.
    """
    try:
        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                # Overall totals.
                cur.execute("""
                    SELECT
                        COUNT(*) as total_chunks,
                        COUNT(DISTINCT metadata->>'hotel_id') as total_hotels,
                        COUNT(DISTINCT metadata->>'region_name') as total_regions,
                        AVG(LENGTH(text)) as avg_chunk_length
                    FROM hotel_website_chunks;
                """)
                stats = cur.fetchone()

                # Per-region breakdown.
                cur.execute("""
                    SELECT
                        metadata->>'region_name' as region_name,
                        COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
                        COUNT(*) as chunks_count
                    FROM hotel_website_chunks
                    WHERE metadata->>'region_name' IS NOT NULL
                    GROUP BY metadata->>'region_name'
                    ORDER BY chunks_count DESC;
                """)
                region_rows = cur.fetchall()
        finally:
            # Always release the connection, including when a query raises.
            conn.close()

        regions_stats = [
            {
                "region_name": row['region_name'],
                "hotels_count": row['hotels_count'],
                "chunks_count": row['chunks_count'],
            }
            for row in region_rows
        ]
        avg_length = stats['avg_chunk_length']
        return {
            "total_chunks": stats['total_chunks'],
            "total_hotels": stats['total_hotels'],
            "total_regions": stats['total_regions'],
            # The average is falsy (e.g. NULL) when there is nothing to
            # average; report 0 in that case.
            "avg_chunk_length": float(avg_length) if avg_length else 0,
            "regions": regions_stats,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
if __name__ == "__main__":
    # Run as a script: serve the app with uvicorn on all interfaces, port 8001.
    # uvicorn is imported here so that importing this module does not need it.
    from uvicorn import run as serve

    serve(app, port=8001, host="0.0.0.0")