Проект аудита отелей: основные скрипты и документация

- Краулеры: smart_crawler.py, regional_crawler.py
- Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py
- РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py
- Отчёты: create_orel_horizontal_report.py
- Обработка: process_all_hotels_embeddings.py
- Документация: README.md, DB_SCHEMA_REFERENCE.md
This commit is contained in:
294
semantic_search_api.py
Normal file
294
semantic_search_api.py
Normal file
@@ -0,0 +1,294 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
API для семантического поиска по эмбеддингам
|
||||
Интеграция с веб-интерфейсом
|
||||
"""
|
||||
|
||||
import json
import os
from typing import List, Dict, Optional
from urllib.parse import unquote

import psycopg2
import requests
from fastapi import FastAPI, HTTPException
from psycopg2.extras import RealDictCursor
from pydantic import BaseModel
|
||||
|
||||
# Application instance; the route decorators further down register their
# handlers on it, and the __main__ guard passes it to uvicorn.
app = FastAPI(
    title="Semantic Search API",
    description="API для семантического поиска по эмбеддингам отелей",
    version="1.0.0",
)
|
||||
|
||||
# Configuration.
#
# NOTE(review): the literal values below are credentials and endpoints that
# are committed to the repository. They are kept only as backward-compatible
# fallbacks: every setting can now be overridden through an environment
# variable. Once deployments set those variables, rotate the secrets and
# delete the fallbacks.
DB_CONFIG = {
    'host': os.environ.get("SEMANTIC_SEARCH_DB_HOST", "147.45.189.234"),
    'port': int(os.environ.get("SEMANTIC_SEARCH_DB_PORT", "5432")),
    'database': os.environ.get("SEMANTIC_SEARCH_DB_NAME", "default_db"),
    'user': os.environ.get("SEMANTIC_SEARCH_DB_USER", "gen_user"),
    # The fallback is stored percent-encoded, hence unquote(); a password
    # supplied through the environment is used verbatim.
    'password': (
        os.environ.get("SEMANTIC_SEARCH_DB_PASSWORD")
        or unquote("2~~9_%5EkVsU%3F2%5CS")
    ),
}

# Embedding service endpoint and the key sent in its X-API-Key header.
BGE_API_URL = os.environ.get("BGE_API_URL", "http://147.45.146.17:8002/embed")
BGE_API_KEY = os.environ.get(
    "BGE_API_KEY",
    "22564b177aa73b6ac0b8642d7773350ff4c01d4983f028beff15ea247f09fa89",
)
|
||||
|
||||
class SearchRequest(BaseModel):
    # Request body of POST /search.
    # (Documented with comments rather than a docstring so the generated
    # schema stays exactly as before.)

    # Free-text query; it is forwarded to the embedding service unchanged.
    query: str

    # Optional exact-match filter on metadata->>'region_name'.
    region: Optional[str] = None

    # Optional exact-match filter on metadata->>'hotel_id'.
    hotel_id: Optional[str] = None

    # Value bound to the SQL LIMIT of the search query.
    limit: int = 10

    # NOTE(review): accepted, but no handler visible in this file reads it —
    # confirm whether a distance threshold was meant to be applied.
    min_distance: float = 0.3
|
||||
|
||||
class SearchResult(BaseModel):
    # One row of the POST /search response.
    # (Documented with comments rather than a docstring so the generated
    # schema stays exactly as before.)

    # metadata->>'hotel_name' of the matched chunk (placeholder text if NULL).
    hotel_name: str

    # metadata->>'region_name' of the matched chunk (placeholder text if NULL).
    region_name: str

    # metadata->>'url' of the matched chunk, or "" if NULL.
    url: str

    # First 300 characters of the chunk text, or "" if NULL.
    text: str

    # Value of `embedding <-> query_vector` for this chunk.
    distance: float

    # Human-readable label derived from `distance` by the search handler.
    relevance: str
|
||||
|
||||
def get_db_connection():
    """Open a new database connection using ``DB_CONFIG``.

    Cursors created from the returned connection use ``RealDictCursor``, so
    rows are addressed by column name. The caller is responsible for closing
    the connection.
    """
    connect_kwargs = dict(DB_CONFIG)
    connect_kwargs["cursor_factory"] = RealDictCursor
    return psycopg2.connect(**connect_kwargs)
|
||||
|
||||
def generate_query_embedding(query: str) -> List[float]:
    """Request an embedding vector for a search query from the BGE service.

    Posts ``{"text": query}`` to ``BGE_API_URL`` (30 second timeout) with the
    API key in the ``X-API-Key`` header and returns the first element of the
    ``embeddings`` array of the JSON response.

    Args:
        query: Text to embed.

    Returns:
        The embedding for ``query``.

    Raises:
        HTTPException: Status 500 if the service answers with a non-200
            status, returns no embedding, or the request/decoding fails.
    """
    headers = {
        "X-API-Key": BGE_API_KEY,
        "Content-Type": "application/json",
    }
    payload = {"text": query}

    try:
        response = requests.post(BGE_API_URL, json=payload, headers=headers, timeout=30)

        if response.status_code != 200:
            raise HTTPException(status_code=500, detail=f"BGE API error: {response.status_code}")

        embeddings = response.json().get('embeddings')
        # A missing or empty result would otherwise surface later as an
        # obscure SQL error when the empty vector is cast in the query.
        if not embeddings or not embeddings[0]:
            raise HTTPException(status_code=500, detail="BGE API returned no embedding")

        return embeddings[0]

    except HTTPException:
        # Already a well-formed API error; do not re-wrap it below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Embedding generation error: {str(e)}") from e
|
||||
|
||||
@app.get("/")
async def root():
    """Return the API name, version, description and list of endpoints."""
    available_endpoints = [
        "POST /search - Семантический поиск",
        "GET /regions - Список регионов",
        "GET /hotels - Список отелей",
        "GET /stats - Статистика",
    ]
    api_info = {
        "name": "Semantic Search API",
        "version": "1.0.0",
        "description": "API для семантического поиска по эмбеддингам отелей",
    }
    api_info["endpoints"] = available_endpoints
    return api_info
|
||||
|
||||
@app.post("/search", response_model=List[SearchResult])
async def semantic_search(request: SearchRequest):
    """Search the stored chunk embeddings for the closest matches to a query.

    The query text is embedded with ``generate_query_embedding`` and compared
    to ``hotel_website_chunks.embedding`` using the ``<->`` operator; rows are
    returned in ascending distance order.

    Args:
        request: Search parameters. ``region`` and ``hotel_id`` are optional
            exact-match filters on the chunk metadata; ``limit`` is bound to
            the SQL ``LIMIT``.

    Returns:
        A list of ``SearchResult`` items, closest first.

    Raises:
        HTTPException: Status 500 on any failure. An ``HTTPException`` raised
            while generating the embedding is propagated unchanged.
    """
    # NOTE(review): request.min_distance is never read here — confirm whether
    # a distance threshold was intended.
    # NOTE(review): the embedding request and the database calls are
    # synchronous and run inside an ``async def`` handler.
    try:
        # Embed the query; the vector is passed to SQL as a JSON array string.
        query_embedding = generate_query_embedding(request.query)
        embedding_str = json.dumps(query_embedding)

        # Build the WHERE clause from fixed condition strings only; every
        # user-supplied value travels as a bound parameter.
        where_conditions = ["embedding IS NOT NULL"]
        filter_params = []

        if request.region:
            where_conditions.append("metadata->>'region_name' = %s")
            filter_params.append(request.region)

        if request.hotel_id:
            where_conditions.append("metadata->>'hotel_id' = %s")
            filter_params.append(request.hotel_id)

        where_clause = " AND ".join(where_conditions)

        query = f"""
            SELECT
                metadata->>'hotel_name' as hotel_name,
                metadata->>'region_name' as region_name,
                metadata->>'url' as url,
                LEFT(text, 300) as text,
                embedding <-> %s::vector as distance
            FROM hotel_website_chunks
            WHERE {where_clause}
            ORDER BY embedding <-> %s::vector
            LIMIT %s;
        """

        # Placeholder order: SELECT vector, WHERE filters, ORDER BY vector, LIMIT.
        params = [embedding_str] + filter_params + [embedding_str, request.limit]

        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(query, params)
                rows = cur.fetchall()
        finally:
            # Close the connection even when the query fails.
            conn.close()

        results = []
        for row in rows:
            distance = row['distance']
            if distance < 0.9:
                relevance = "🟢 Высокая"
            elif distance < 1.0:
                relevance = "🟡 Средняя"
            else:
                relevance = "🔴 Низкая"

            results.append(SearchResult(
                hotel_name=row['hotel_name'] or "Неизвестный отель",
                region_name=row['region_name'] or "Неизвестный регион",
                url=row['url'] or "",
                text=row['text'] or "",
                distance=float(distance),
                relevance=relevance,
            ))

        return results

    except HTTPException:
        # Keep the status and detail produced by the embedding step.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||
|
||||
@app.get("/regions")
async def get_regions():
    """List regions found in the stored chunks with hotel and chunk counts.

    Rows of ``hotel_website_chunks`` whose metadata has a non-NULL
    ``region_name`` are grouped by region and ordered by chunk count,
    largest first.

    Returns:
        ``{"regions": [...]}`` where each item has ``region_name``,
        ``hotels_count`` and ``chunks_count``.

    Raises:
        HTTPException: Status 500 if the query fails.
    """
    try:
        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    SELECT
                        metadata->>'region_name' as region_name,
                        COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
                        COUNT(*) as chunks_count
                    FROM hotel_website_chunks
                    WHERE metadata->>'region_name' IS NOT NULL
                    GROUP BY metadata->>'region_name'
                    ORDER BY chunks_count DESC;
                """)
                rows = cur.fetchall()
        finally:
            # Close the connection even when the query fails.
            conn.close()

        regions = [
            {
                "region_name": row['region_name'],
                "hotels_count": row['hotels_count'],
                "chunks_count": row['chunks_count'],
            }
            for row in rows
        ]
        return {"regions": regions}

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||
|
||||
@app.get("/hotels")
async def get_hotels(region: Optional[str] = None):
    """List hotels found in the stored chunks, with their chunk counts.

    Args:
        region: When given (and non-empty), only hotels whose metadata
            ``region_name`` equals this value are returned.

    Returns:
        ``{"hotels": [...]}`` where each item has ``hotel_id``,
        ``hotel_name``, ``region_name`` and ``chunks_count``, ordered by
        chunk count, largest first.

    Raises:
        HTTPException: Status 500 if the query fails.
    """
    try:
        # The optional filter is a fixed SQL fragment; the region value itself
        # is always passed as a bound parameter.
        if region:
            where_clause = "WHERE metadata->>'region_name' = %s"
            params = (region,)
        else:
            where_clause = ""
            params = None

        query = f"""
            SELECT DISTINCT
                metadata->>'hotel_id' as hotel_id,
                metadata->>'hotel_name' as hotel_name,
                metadata->>'region_name' as region_name,
                COUNT(*) as chunks_count
            FROM hotel_website_chunks
            {where_clause}
            GROUP BY metadata->>'hotel_id', metadata->>'hotel_name', metadata->>'region_name'
            ORDER BY chunks_count DESC;
        """

        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(query, params)
                rows = cur.fetchall()
        finally:
            # Close the connection even when the query fails.
            conn.close()

        hotels = [
            {
                "hotel_id": row['hotel_id'],
                "hotel_name": row['hotel_name'],
                "region_name": row['region_name'],
                "chunks_count": row['chunks_count'],
            }
            for row in rows
        ]
        return {"hotels": hotels}

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||
|
||||
@app.get("/stats")
async def get_stats():
    """Return overall and per-region statistics for the stored chunks.

    Returns:
        A dict with ``total_chunks``, ``total_hotels``, ``total_regions``,
        ``avg_chunk_length`` (0 when the average is NULL or zero) and
        ``regions``: a per-region list of ``region_name``, ``hotels_count``
        and ``chunks_count`` ordered by chunk count, largest first.

    Raises:
        HTTPException: Status 500 if a query fails.
    """
    try:
        conn = get_db_connection()
        try:
            with conn.cursor() as cur:
                # Overall totals.
                cur.execute("""
                    SELECT
                        COUNT(*) as total_chunks,
                        COUNT(DISTINCT metadata->>'hotel_id') as total_hotels,
                        COUNT(DISTINCT metadata->>'region_name') as total_regions,
                        AVG(LENGTH(text)) as avg_chunk_length
                    FROM hotel_website_chunks;
                """)
                stats = cur.fetchone()

                # Per-region breakdown.
                cur.execute("""
                    SELECT
                        metadata->>'region_name' as region_name,
                        COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
                        COUNT(*) as chunks_count
                    FROM hotel_website_chunks
                    WHERE metadata->>'region_name' IS NOT NULL
                    GROUP BY metadata->>'region_name'
                    ORDER BY chunks_count DESC;
                """)
                region_rows = cur.fetchall()
        finally:
            # Close the connection even when a query fails.
            conn.close()

        regions_stats = [
            {
                "region_name": row['region_name'],
                "hotels_count": row['hotels_count'],
                "chunks_count": row['chunks_count'],
            }
            for row in region_rows
        ]

        return {
            "total_chunks": stats['total_chunks'],
            "total_hotels": stats['total_hotels'],
            "total_regions": stats['total_regions'],
            "avg_chunk_length": float(stats['avg_chunk_length']) if stats['avg_chunk_length'] else 0,
            "regions": regions_stats,
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||
|
||||
if __name__ == "__main__":
    # uvicorn is imported only when the module is executed as a script.
    import uvicorn

    # Serve the app on all interfaces, port 8001.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8001,
    )
|
||||
Reference in New Issue
Block a user