Files
hotels/search_hotel_content.py

108 lines
3.8 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Прямой semantic search по данным отелей в Neo4j
Использует vector similarity для поиска релевантных чанков
"""
import os
import sys
import requests
from neo4j import GraphDatabase
from urllib.parse import unquote
# Connection settings.
# Each value is read from an environment variable so that credentials do not
# have to be stored in the source file.

# Neo4j
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# The fallback is the previous hard-coded value, kept so an existing local
# setup keeps working; set NEO4J_PASSWORD for any shared database.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "supersecret")

# OpenAI, used to generate the query embedding.
# SECURITY: an API key used to be hard-coded on this line. It is now taken
# from the environment only (no fallback); the key that was committed should
# be treated as leaked and revoked.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_API_BASE = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
# A dedicated variable name is used (instead of the conventional HTTP_PROXY
# environment variable) so a system-wide proxy setting is not picked up by
# accident; the default is the previously hard-coded proxy.
HTTP_PROXY = os.environ.get("OPENAI_HTTP_PROXY", "http://195.133.66.13:3128")
def generate_embedding(text: str) -> list:
    """Return the OpenAI embedding vector for *text*.

    Posts *text* to the ``/embeddings`` endpoint under ``OPENAI_API_BASE``
    using the ``text-embedding-3-small`` model, routing the request through
    ``HTTP_PROXY`` with a 30 second timeout.

    Args:
        text: The text to embed. Must be non-empty.

    Returns:
        The ``embedding`` value of the first entry in the response's
        ``data`` list.

    Raises:
        ValueError: If *text* is empty.
        RuntimeError: If the API responds with a status other than 200.
    """
    # Fail fast: an empty input would only cost a network round trip to be
    # rejected by the API.
    if not text:
        raise ValueError("text must be a non-empty string")

    response = requests.post(
        f"{OPENAI_API_BASE}/embeddings",
        headers={
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "model": "text-embedding-3-small",
            "input": text,
        },
        proxies={"http": HTTP_PROXY, "https": HTTP_PROXY},
        timeout=30,
    )

    if response.status_code != 200:
        # Include (a bounded part of) the response body so the actual cause
        # (bad key, quota, malformed input, ...) is visible to the caller.
        raise RuntimeError(
            f"OpenAI API error: {response.status_code}: {response.text[:500]}"
        )

    return response.json()["data"][0]["embedding"]
def search_hotel_content(query: str, group_id: str = "hotel_spb", limit: int = 5,
                         min_similarity: float = 0.3):
    """Search ``Episode`` nodes in Neo4j by cosine similarity to *query*.

    The query text is embedded with ``generate_embedding``; the cosine
    similarity against each episode's stored ``embedding`` is computed inside
    the Cypher query. Progress and the matches are printed to stdout.

    Args:
        query: Free-text search query.
        group_id: Only episodes whose ``group_id`` equals this value are
            considered.
        limit: Maximum number of records to return.
        min_similarity: Episodes must score strictly above this cosine
            similarity to be returned. Defaults to the previously hard-coded
            threshold of 0.3.

    Returns:
        A list of result records exposing ``name``, ``content`` and
        ``similarity``, ordered by descending similarity.
    """
    print(f"🔍 Запрос: {query}")
    print(f"📊 Group ID: {group_id}")
    print(f"🎯 Limit: {limit}\n")

    # Embed the query text.
    print("⚙️ Генерирую эмбеддинг запроса...")
    query_embedding = generate_embedding(query)
    print(f"✓ Эмбеддинг: {len(query_embedding)} размерность\n")

    # Connect to Neo4j. try/finally guarantees the driver is closed even when
    # the query raises.
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    try:
        with driver.session() as session:
            # Vector similarity search.
            print("🔎 Поиск похожих эпизодов...\n")
            # The size equality check skips embeddings whose dimension differs
            # from the query's: a shorter stored vector would otherwise yield
            # a partial dot product and a meaningless score.
            result = session.run("""
                MATCH (e:Episode)
                WHERE e.group_id = $group_id
                  AND e.embedding IS NOT NULL
                  AND size(e.embedding) > 0
                  AND size(e.embedding) = size($query_embedding)
                WITH e,
                     reduce(dot = 0.0, i IN range(0, size(e.embedding)-1) |
                         dot + e.embedding[i] * $query_embedding[i]
                     ) / (
                         sqrt(reduce(sum = 0.0, x IN e.embedding | sum + x * x)) *
                         sqrt(reduce(sum = 0.0, x IN $query_embedding | sum + x * x))
                     ) AS similarity
                WHERE similarity > $min_similarity
                RETURN e.name AS name,
                       e.content AS content,
                       similarity
                ORDER BY similarity DESC
                LIMIT $limit
            """, {
                "group_id": group_id,
                "query_embedding": query_embedding,
                "limit": limit,
                "min_similarity": min_similarity,
            })
            # Materialize while the session is still open.
            results = list(result)
    finally:
        driver.close()

    if results:
        print(f"✅ Найдено {len(results)} релевантных результатов:\n")
        print("=" * 70)
        for idx, record in enumerate(results, 1):
            # A node may have no content property; treat it as empty text
            # instead of failing on None[:300].
            content = record["content"] or ""
            print(f"\n{idx}. SCORE: {record['similarity']:.3f}")
            print(f" Name: {record['name']}")
            print(f" Content:\n {content[:300]}...")
            print()
    else:
        print("❌ Ничего не найдено")

    return results
if __name__ == "__main__":
    # Script entry point: the first command-line argument, when given, is the
    # search query; otherwise a built-in default query is used.
    cli_args = sys.argv[1:]
    if cli_args:
        query = cli_args[0]
    else:
        query = "адрес отеля и контакты"
    results = search_hotel_content(query)