Проект аудита отелей: основные скрипты и документация
- Краулеры: smart_crawler.py, regional_crawler.py - Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py - РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py - Отчёты: create_orel_horizontal_report.py - Обработка: process_all_hotels_embeddings.py - Документация: README.md, DB_SCHEMA_REFERENCE.md
This commit is contained in:
107
search_hotel_content.py
Normal file
107
search_hotel_content.py
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Прямой semantic search по данным отелей в Neo4j
|
||||
Использует vector similarity для поиска релевантных чанков
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import requests
|
||||
from neo4j import GraphDatabase
|
||||
from urllib.parse import unquote
|
||||
|
||||
# ---------------------------------------------------------------------------
# Connection settings.
#
# Every value is read from an environment variable of the same name so that
# credentials do not have to be committed to the repository. Non-secret
# values keep their previous hard-coded defaults.
# ---------------------------------------------------------------------------

# Neo4j
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# Local-development default only; override NEO4J_PASSWORD anywhere else.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "supersecret")

# OpenAI (used to generate the query embedding)
# SECURITY: an API key used to be hard-coded on this line and was committed
# to version control. That key must be treated as leaked and revoked; supply
# a fresh one through the OPENAI_API_KEY environment variable.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_API_BASE = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
# Proxy used for the OpenAI HTTP call.
HTTP_PROXY = os.environ.get("HTTP_PROXY", "http://195.133.66.13:3128")
|
||||
|
||||
def generate_embedding(text: str):
    """Return the embedding vector for *text* from the OpenAI embeddings API.

    Sends a POST request to ``{OPENAI_API_BASE}/embeddings`` for the
    ``text-embedding-3-small`` model, authenticated with ``OPENAI_API_KEY``
    and routed through ``HTTP_PROXY`` when one is configured.

    Args:
        text: The text to embed. Must contain at least one non-whitespace
            character.

    Returns:
        The ``data[0]["embedding"]`` value of the JSON response (a list of
        floats according to the OpenAI API).

    Raises:
        ValueError: If *text* is empty or whitespace-only.
        RuntimeError: If no API key is configured, or the API answers with a
            non-200 status. The message carries the status code and an
            excerpt of the response body.
        requests.RequestException: On transport-level failures (timeouts,
            proxy or connection errors); these propagate unchanged.
    """
    # Reject input the API would refuse anyway, before spending a round trip.
    if not text or not text.strip():
        raise ValueError("Cannot generate an embedding for empty text")
    if not OPENAI_API_KEY:
        raise RuntimeError("OpenAI API error: OPENAI_API_KEY is not configured")

    # Only force a proxy when one is actually configured.
    proxies = {"http": HTTP_PROXY, "https": HTTP_PROXY} if HTTP_PROXY else None

    response = requests.post(
        f"{OPENAI_API_BASE}/embeddings",
        headers={
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "model": "text-embedding-3-small",
            "input": text,
        },
        proxies=proxies,
        timeout=30,
    )

    if response.status_code != 200:
        # Keep the API's own explanation; the status code alone rarely says
        # whether the problem is the key, the quota or the payload.
        raise RuntimeError(
            f"OpenAI API error: {response.status_code}: {response.text[:500]}"
        )

    return response.json()["data"][0]["embedding"]
|
||||
|
||||
# Cosine similarity between each Episode embedding and the query embedding,
# computed inside Cypher over every Episode of the requested group.
_EPISODE_SIMILARITY_QUERY = """
MATCH (e:Episode)
WHERE e.group_id = $group_id
  AND e.embedding IS NOT NULL
  AND size(e.embedding) > 0
WITH e,
     reduce(dot = 0.0, i IN range(0, size(e.embedding)-1) |
         dot + e.embedding[i] * $query_embedding[i]
     ) / (
         sqrt(reduce(sum = 0.0, x IN e.embedding | sum + x * x)) *
         sqrt(reduce(sum = 0.0, x IN $query_embedding | sum + x * x))
     ) AS similarity
WHERE similarity > $min_similarity
RETURN e.name AS name,
       e.content AS content,
       similarity
ORDER BY similarity DESC
LIMIT $limit
"""


def search_hotel_content(
    query: str,
    group_id: str = "hotel_spb",
    limit: int = 5,
    *,
    min_similarity: float = 0.3,
):
    """Search hotel content in Neo4j by vector similarity and print the hits.

    Embeds *query* with ``generate_embedding``, then ranks ``Episode`` nodes
    whose ``group_id`` matches by cosine similarity between their
    ``embedding`` property and the query embedding. Progress and results are
    printed to stdout.

    Args:
        query: Free-text search query.
        group_id: Value matched against ``Episode.group_id``.
        limit: Maximum number of records to return.
        min_similarity: Records must score strictly above this cosine
            similarity to be returned. Defaults to the previous hard-coded
            threshold of 0.3.

    Returns:
        A list of neo4j records with the keys ``name``, ``content`` and
        ``similarity``, ordered by descending similarity. The list is empty
        when nothing scores above the threshold.

    Raises:
        Whatever ``generate_embedding`` or the Neo4j driver raise; nothing
        is caught here.
    """
    print(f"🔍 Запрос: {query}")
    print(f"📊 Group ID: {group_id}")
    print(f"🎯 Limit: {limit}\n")

    # Embed the query first so no database connection is opened if this fails.
    print("⚙️ Генерирую эмбеддинг запроса...")
    query_embedding = generate_embedding(query)
    print(f"✓ Эмбеддинг: {len(query_embedding)} размерность\n")

    # Context managers guarantee the driver and session are closed even when
    # the query raises; previously the driver leaked on any error.
    with GraphDatabase.driver(
        NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)
    ) as driver, driver.session() as session:
        print("🔎 Поиск похожих эпизодов...\n")
        result = session.run(
            _EPISODE_SIMILARITY_QUERY,
            {
                "group_id": group_id,
                "query_embedding": query_embedding,
                "limit": limit,
                "min_similarity": min_similarity,
            },
        )
        # Materialise inside the session: the result cannot be consumed once
        # the session is closed.
        results = list(result)

    if not results:
        print("❌ Ничего не найдено")
        return results

    print(f"✅ Найдено {len(results)} релевантных результатов:\n")
    print("=" * 70)

    for idx, record in enumerate(results, 1):
        # An Episode may have no content property; slicing None would raise.
        content = record["content"] or ""
        print(f"\n{idx}. SCORE: {record['similarity']:.3f}")
        print(f" Name: {record['name']}")
        print(f" Content:\n {content[:300]}...")
        print()

    return results
|
||||
|
||||
if __name__ == "__main__":
    # The first command-line argument is the search query; without one, fall
    # back to the built-in default query.
    cli_args = sys.argv[1:]
    if cli_args:
        query = cli_args[0]
    else:
        query = "адрес отеля и контакты"
    results = search_hotel_content(query)
|
||||
|
||||
Reference in New Issue
Block a user