- Краулеры: smart_crawler.py, regional_crawler.py - Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py - РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py - Отчёты: create_orel_horizontal_report.py - Обработка: process_all_hotels_embeddings.py - Документация: README.md, DB_SCHEMA_REFERENCE.md
108 lines
3.8 KiB
Python
108 lines
3.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Прямой semantic search по данным отелей в Neo4j
|
|
Использует vector similarity для поиска релевантных чанков
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import requests
|
|
from neo4j import GraphDatabase
|
|
from urllib.parse import unquote
|
|
|
|
# Neo4j connection settings. Each value can be overridden through an
# environment variable of the same name; the fallbacks target a local
# development instance.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "supersecret")

# OpenAI settings used to generate the query embedding.
# SECURITY: the API key must never be committed to source control. It is read
# from the OPENAI_API_KEY environment variable only; the key that used to be
# hard-coded here should be treated as leaked and revoked.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_API_BASE = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
# Proxy through which OpenAI requests are routed.
HTTP_PROXY = os.environ.get("OPENAI_HTTP_PROXY", "http://195.133.66.13:3128")
|
|
|
|
def generate_embedding(text: str):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Sends a POST to ``{OPENAI_API_BASE}/embeddings`` asking for the
    ``text-embedding-3-small`` model, routed through ``HTTP_PROXY`` with a
    30-second timeout.

    Args:
        text: The text to embed.

    Returns:
        The ``embedding`` value of the first item in the response's ``data``
        list.

    Raises:
        RuntimeError: If the API responds with a non-200 status. The message
            carries the status code and the start of the response body so
            the failure can be diagnosed.
        requests.RequestException: On network errors or timeout (propagated
            from ``requests.post``).
    """
    response = requests.post(
        f"{OPENAI_API_BASE}/embeddings",
        headers={
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "model": "text-embedding-3-small",
            "input": text,
        },
        proxies={"http": HTTP_PROXY, "https": HTTP_PROXY},
        timeout=30,
    )

    if response.status_code != 200:
        # Keep the original message prefix, but append a bounded slice of the
        # body: the API explains the failure (bad key, quota, bad input) there.
        detail = response.text[:500]
        raise RuntimeError(
            f"OpenAI API error: {response.status_code}: {detail}"
        )

    return response.json()["data"][0]["embedding"]
|
|
|
|
def search_hotel_content(query: str, group_id: str = "hotel_spb", limit: int = 5):
    """Search hotel content in Neo4j by cosine similarity to ``query``.

    Generates an embedding for ``query`` via ``generate_embedding``, then runs
    a Cypher query over ``Episode`` nodes of the given group that have a
    non-empty ``embedding``. The cosine similarity between each node's
    embedding and the query embedding is computed inside Cypher; nodes
    scoring above 0.3 are returned, best first. Progress and results are
    printed to stdout.

    Args:
        query: Free-text search query.
        group_id: Value matched against ``Episode.group_id``.
        limit: Maximum number of records to return.

    Returns:
        A list of result records exposing ``name``, ``content`` and
        ``similarity``; empty when nothing passes the threshold.

    Raises:
        Whatever ``generate_embedding`` or the Neo4j driver raises; the
        driver is closed in every case.
    """
    print(f"🔍 Запрос: {query}")
    print(f"📊 Group ID: {group_id}")
    print(f"🎯 Limit: {limit}\n")

    # Build the query embedding.
    print("⚙️ Генерирую эмбеддинг запроса...")
    query_embedding = generate_embedding(query)
    print(f"✓ Эмбеддинг: {len(query_embedding)} размерность\n")

    # Connect to Neo4j. try/finally guarantees the driver is released even
    # when the session or the query fails.
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    try:
        with driver.session() as session:
            # Vector similarity search.
            print("🔎 Поиск похожих эпизодов...\n")

            result = session.run("""
                MATCH (e:Episode)
                WHERE e.group_id = $group_id
                  AND e.embedding IS NOT NULL
                  AND size(e.embedding) > 0
                WITH e,
                     reduce(dot = 0.0, i IN range(0, size(e.embedding)-1) |
                       dot + e.embedding[i] * $query_embedding[i]
                     ) / (
                       sqrt(reduce(sum = 0.0, x IN e.embedding | sum + x * x)) *
                       sqrt(reduce(sum = 0.0, x IN $query_embedding | sum + x * x))
                     ) AS similarity
                WHERE similarity > 0.3
                RETURN e.name AS name,
                       e.content AS content,
                       similarity
                ORDER BY similarity DESC
                LIMIT $limit
            """, {
                "group_id": group_id,
                "query_embedding": query_embedding,
                "limit": limit,
            })

            # Materialize while the session is still open.
            results = list(result)
    finally:
        driver.close()

    if results:
        print(f"✅ Найдено {len(results)} релевантных результатов:\n")
        print("=" * 70)

        for idx, record in enumerate(results, 1):
            # e.content may be null in the graph; slicing None would raise.
            content = record["content"] or ""
            print(f"\n{idx}. SCORE: {record['similarity']:.3f}")
            print(f" Name: {record['name']}")
            print(f" Content:\n {content[:300]}...")
            print()
    else:
        print("❌ Ничего не найдено")

    return results
|
|
|
|
if __name__ == "__main__":
    # Take the search query from the first command-line argument; fall back
    # to the built-in default query when none is given.
    cli_args = sys.argv[1:]
    if cli_args:
        query = cli_args[0]
    else:
        query = "адрес отеля и контакты"
    results = search_hotel_content(query)
|
|
|