#!/usr/bin/env python3
"""Direct semantic search over hotel data stored in Neo4j.

Embeds the query text with the OpenAI embeddings API, then ranks
``Episode`` nodes by cosine similarity between their stored ``embedding``
property and the query embedding.

Configuration is read from the environment:

* ``OPENAI_API_KEY`` (required) - API key used for the embeddings request.
* ``NEO4J_URI``, ``NEO4J_USER``, ``NEO4J_PASSWORD`` (optional) - override
  the local-development defaults below.
"""

import os
import sys
from typing import List
from urllib.parse import unquote  # noqa: F401  (kept: file-level import)

import requests
from neo4j import GraphDatabase

# Neo4j connection settings; the defaults target a local development instance.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "supersecret")

# OpenAI settings used to build the query embedding.
# SECURITY: the key must never be committed to source control. Any key that
# was previously hard-coded in this file must be treated as compromised and
# rotated.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_API_BASE = "https://api.openai.com/v1"
HTTP_PROXY = "http://195.133.66.13:3128"


def generate_embedding(text: str) -> List[float]:
    """Return the embedding vector for *text*.

    Sends a POST request to ``{OPENAI_API_BASE}/embeddings`` for the
    ``text-embedding-3-small`` model, routed through ``HTTP_PROXY`` with a
    30 second timeout.

    Args:
        text: The text to embed.

    Returns:
        The ``embedding`` field of the first item in the response ``data``.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not configured or the API
            responds with a status code other than 200.
    """
    if not OPENAI_API_KEY:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")

    response = requests.post(
        f"{OPENAI_API_BASE}/embeddings",
        headers={
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "model": "text-embedding-3-small",
            "input": text,
        },
        proxies={"http": HTTP_PROXY, "https": HTTP_PROXY},
        timeout=30,
    )

    if response.status_code != 200:
        raise RuntimeError(f"OpenAI API error: {response.status_code}")
    return response.json()["data"][0]["embedding"]


def search_hotel_content(query: str, group_id: str = "hotel_spb", limit: int = 5) -> list:
    """Search hotel content by vector similarity and print the matches.

    The query is embedded via :func:`generate_embedding`; the Cypher
    statement then computes, for every ``Episode`` of the given group that
    has a non-empty ``embedding``, the cosine similarity (dot product divided
    by the product of the two vector norms) against the query embedding.
    Only episodes with similarity above 0.3 are returned, best first.

    Args:
        query: Free-text search query.
        group_id: Value matched against ``Episode.group_id``.
        limit: Maximum number of records to return.

    Returns:
        A list of result records exposing ``name``, ``content`` and
        ``similarity``; empty when nothing passes the threshold.

    Raises:
        RuntimeError: Propagated from :func:`generate_embedding`.
    """
    print(f"🔍 Запрос: {query}")
    print(f"📊 Group ID: {group_id}")
    print(f"🎯 Limit: {limit}\n")

    # Build the query embedding.
    print("⚙️ Генерирую эмбеддинг запроса...")
    query_embedding = generate_embedding(query)
    print(f"✓ Эмбеддинг: {len(query_embedding)} размерность\n")

    # Connect to Neo4j. The driver is closed in ``finally`` so a failing
    # query or a formatting error does not leak the connection pool.
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    try:
        with driver.session() as session:
            # Vector similarity search.
            print("🔎 Поиск похожих эпизодов...\n")

            result = session.run(
                """
                MATCH (e:Episode)
                WHERE e.group_id = $group_id
                  AND e.embedding IS NOT NULL
                  AND size(e.embedding) > 0
                WITH e,
                     reduce(dot = 0.0, i IN range(0, size(e.embedding)-1) |
                         dot + e.embedding[i] * $query_embedding[i]
                     ) / (
                         sqrt(reduce(sum = 0.0, x IN e.embedding | sum + x * x)) *
                         sqrt(reduce(sum = 0.0, x IN $query_embedding | sum + x * x))
                     ) AS similarity
                WHERE similarity > 0.3
                RETURN e.name AS name, e.content AS content, similarity
                ORDER BY similarity DESC
                LIMIT $limit
                """,
                {
                    "group_id": group_id,
                    "query_embedding": query_embedding,
                    "limit": limit,
                },
            )

            # Materialize while the session is still open.
            results = list(result)

            if results:
                print(f"✅ Найдено {len(results)} релевантных результатов:\n")
                print("=" * 70)

                for idx, record in enumerate(results, 1):
                    # An episode may have no content; treat null as empty
                    # instead of failing on ``None[:300]``.
                    preview = (record["content"] or "")[:300]
                    print(f"\n{idx}. SCORE: {record['similarity']:.3f}")
                    print(f" Name: {record['name']}")
                    print(f" Content:\n {preview}...")
                    print()
            else:
                print("❌ Ничего не найдено")
    finally:
        driver.close()

    return results


if __name__ == "__main__":
    query = sys.argv[1] if len(sys.argv) > 1 else "адрес отеля и контакты"
    results = search_hotel_content(query)