🚀 Full project sync: Hotels RAG & Audit System
✨ Major Features: - Complete RAG system for hotel website analysis - Hybrid audit with BGE-M3 embeddings + Natasha NER - Universal horizontal Excel reports with dashboards - Multi-region processing (SPb, Orel, Chukotka, Kamchatka) 📊 Completed Regions: - Орловская область: 100% (36/36) - Чукотский АО: 100% (4/4) - г. Санкт-Петербург: 93% (893/960) - Камчатский край: 87% (89/102) 🔧 Infrastructure: - PostgreSQL with pgvector extension - BGE-M3 embeddings API - Browserless for web scraping - N8N workflows for automation - S3/Nextcloud file storage 📝 Documentation: - Complete DB schemas - API documentation - Setup guides - Status reports
This commit is contained in:
153
db_schema.sql
Normal file
153
db_schema.sql
Normal file
@@ -0,0 +1,153 @@
|
||||
-- Схема базы данных для хранения информации об отелях
|
||||
-- Префикс: hotel_
|
||||
|
||||
-- Основная информация об отелях
|
||||
CREATE TABLE IF NOT EXISTS hotel_main (
|
||||
id UUID PRIMARY KEY,
|
||||
full_name TEXT,
|
||||
short_name TEXT,
|
||||
status_id INTEGER,
|
||||
status_name TEXT,
|
||||
category_id INTEGER,
|
||||
category_name TEXT,
|
||||
region_id INTEGER,
|
||||
region_name TEXT,
|
||||
hotel_type_id INTEGER,
|
||||
hotel_type_name TEXT,
|
||||
register_record TEXT,
|
||||
register_record_date DATE,
|
||||
owner_full_name TEXT,
|
||||
owner_ogrn TEXT,
|
||||
owner_inn TEXT,
|
||||
phone TEXT,
|
||||
email TEXT,
|
||||
website_address TEXT,
|
||||
addresses JSONB,
|
||||
photo_ids TEXT[],
|
||||
has_seasonal BOOLEAN,
|
||||
activation_datetime TIMESTAMP,
|
||||
updated TIMESTAMP,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- Дополнительная информация о владельце
|
||||
CREATE TABLE IF NOT EXISTS hotel_additional_info (
|
||||
hotel_id UUID PRIMARY KEY REFERENCES hotel_main(id),
|
||||
owner_ogrn TEXT,
|
||||
owner_inn TEXT,
|
||||
owner_kpp TEXT,
|
||||
owner_short_name TEXT,
|
||||
owner_phone TEXT,
|
||||
owner_email TEXT,
|
||||
resort_full_name TEXT,
|
||||
owner_address_name TEXT,
|
||||
owner_legal_type_id INTEGER,
|
||||
phone TEXT,
|
||||
email TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- Санаторная информация (для санаториев)
|
||||
CREATE TABLE IF NOT EXISTS hotel_sanatorium (
|
||||
hotel_id UUID PRIMARY KEY REFERENCES hotel_main(id),
|
||||
oid TEXT,
|
||||
full_name TEXT,
|
||||
short_name TEXT,
|
||||
ogrn TEXT,
|
||||
inn TEXT,
|
||||
legal_address TEXT,
|
||||
actual_address TEXT,
|
||||
phone TEXT,
|
||||
email TEXT,
|
||||
web_site TEXT,
|
||||
medical_license TEXT,
|
||||
farm_license TEXT,
|
||||
terrenkur BOOLEAN,
|
||||
resort_name TEXT,
|
||||
has_water_supply BOOLEAN,
|
||||
has_heating BOOLEAN,
|
||||
has_sewage BOOLEAN,
|
||||
has_air_conditioning BOOLEAN,
|
||||
has_elevator BOOLEAN,
|
||||
has_telephone BOOLEAN,
|
||||
has_internet BOOLEAN,
|
||||
has_mobility_lift BOOLEAN,
|
||||
has_gym BOOLEAN,
|
||||
has_conference_room BOOLEAN,
|
||||
swimming_pool_info JSONB,
|
||||
plage_info JSONB,
|
||||
land_document_info JSONB,
|
||||
rooms_info JSONB,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- Услуги отелей (из drawer)
|
||||
CREATE TABLE IF NOT EXISTS hotel_services (
|
||||
id SERIAL PRIMARY KEY,
|
||||
hotel_id UUID REFERENCES hotel_main(id),
|
||||
service_category_id INTEGER,
|
||||
service_category_name TEXT,
|
||||
service_id INTEGER,
|
||||
service_name TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
UNIQUE(hotel_id, service_id)
|
||||
);
|
||||
|
||||
-- Информация о номерах
|
||||
CREATE TABLE IF NOT EXISTS hotel_rooms (
|
||||
id SERIAL PRIMARY KEY,
|
||||
hotel_id UUID REFERENCES hotel_main(id),
|
||||
room_category_id INTEGER,
|
||||
room_category_name TEXT,
|
||||
apartment_count INTEGER,
|
||||
number_seats INTEGER,
|
||||
equipment_list JSONB,
|
||||
family_room_count INTEGER,
|
||||
disability_room_count INTEGER,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- Сырые JSON для backup (опционально)
|
||||
CREATE TABLE IF NOT EXISTS hotel_raw_json (
|
||||
hotel_id UUID PRIMARY KEY REFERENCES hotel_main(id),
|
||||
main_data JSONB,
|
||||
additional_info JSONB,
|
||||
sanatorium_data JSONB,
|
||||
drawer_data JSONB,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- Прогресс парсинга
|
||||
CREATE TABLE IF NOT EXISTS hotel_parsing_progress (
|
||||
id SERIAL PRIMARY KEY,
|
||||
page_number INTEGER,
|
||||
total_pages INTEGER,
|
||||
processed_count INTEGER,
|
||||
total_count INTEGER,
|
||||
status TEXT, -- 'in_progress', 'completed', 'failed'
|
||||
error_message TEXT,
|
||||
started_at TIMESTAMP,
|
||||
completed_at TIMESTAMP,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- Индексы для быстрого поиска
|
||||
CREATE INDEX IF NOT EXISTS idx_hotel_main_region ON hotel_main(region_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_hotel_main_status ON hotel_main(status_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_hotel_main_category ON hotel_main(category_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_hotel_main_type ON hotel_main(hotel_type_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_hotel_main_full_name ON hotel_main(full_name);
|
||||
CREATE INDEX IF NOT EXISTS idx_hotel_services_hotel_id ON hotel_services(hotel_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_hotel_rooms_hotel_id ON hotel_rooms(hotel_id);
|
||||
|
||||
-- Полнотекстовый поиск по названию и адресу
|
||||
CREATE INDEX IF NOT EXISTS idx_hotel_main_fulltext ON hotel_main
|
||||
USING gin(to_tsvector('russian', coalesce(full_name, '') || ' ' || coalesce(short_name, '')));
|
||||
|
||||
COMMENT ON TABLE hotel_main IS 'Основная информация об отелях из tourism.fsa.gov.ru';
|
||||
COMMENT ON TABLE hotel_parsing_progress IS 'Контрольные точки для возобновления парсинга';
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user