🚀 Full project sync: Hotels RAG & Audit System

Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
This commit is contained in:
Фёдор
2025-10-27 22:49:42 +03:00
parent 0cf3297290
commit 684fada337
94 changed files with 14891 additions and 911 deletions

60
website_schema.sql Normal file
View File

@@ -0,0 +1,60 @@
-- Schema for storing raw data crawled from hotel websites.
--
-- hotel_website_raw: raw HTML of crawled pages, at most one row per
-- (hotel_id, url) pair.
-- NOTE(review): hotel_id is nullable and UNIQUE treats NULLs as distinct, so
-- rows without a hotel_id are not deduplicated by url -- confirm this is intended.
CREATE TABLE IF NOT EXISTS hotel_website_raw (
    id               SERIAL PRIMARY KEY,
    hotel_id         UUID,
    url              TEXT NOT NULL,
    page_title       TEXT,
    html             TEXT,       -- raw HTML
    status_code      INTEGER,
    response_time_ms INTEGER,
    depth            INTEGER,    -- 0 = main page, 1 = internal link
    crawled_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Constraint names are spelled out to match the names PostgreSQL assigns
    -- by default to the equivalent inline column constraints.
    CONSTRAINT hotel_website_raw_hotel_id_fkey
        FOREIGN KEY (hotel_id) REFERENCES hotel_main (id),
    CONSTRAINT hotel_website_raw_hotel_id_url_key
        UNIQUE (hotel_id, url)
);
-- hotel_website_meta: per-hotel metadata about a website crawl
-- (hotel_id is the primary key, so there is at most one row per hotel).
CREATE TABLE IF NOT EXISTS hotel_website_meta (
    hotel_id             UUID PRIMARY KEY,
    domain               TEXT,
    main_url             TEXT,
    pages_crawled        INTEGER DEFAULT 0,
    pages_failed         INTEGER DEFAULT 0,
    total_size_bytes     BIGINT DEFAULT 0,
    internal_links_found INTEGER,
    -- Expected values: 'in_progress', 'completed', 'failed'.
    -- Nothing in this DDL enforces the list; it is a convention for writers.
    crawl_status         TEXT,
    crawl_started_at     TIMESTAMP,
    crawl_finished_at    TIMESTAMP,
    error_message        TEXT,
    created_at           TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Only defaulted on insert; no trigger in this file refreshes it on update.
    updated_at           TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Name matches the default PostgreSQL gives the inline REFERENCES form.
    CONSTRAINT hotel_website_meta_hotel_id_fkey
        FOREIGN KEY (hotel_id) REFERENCES hotel_main (id)
);
-- hotel_website_processed: cleaned page text (after cleanup, ready for
-- vectorization), linked back to the raw page it was derived from.
CREATE TABLE IF NOT EXISTS hotel_website_processed (
    id             SERIAL PRIMARY KEY,
    raw_page_id    INTEGER,
    hotel_id       UUID,
    url            TEXT,
    cleaned_text   TEXT,     -- cleaned text
    extracted_data JSONB,    -- phones, emails, INN, OGRN, etc.
    has_forms      BOOLEAN,
    has_booking    BOOLEAN,
    text_length    INTEGER,
    processed_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Names match the defaults PostgreSQL gives the inline REFERENCES form.
    CONSTRAINT hotel_website_processed_raw_page_id_fkey
        FOREIGN KEY (raw_page_id) REFERENCES hotel_website_raw (id),
    CONSTRAINT hotel_website_processed_hotel_id_fkey
        FOREIGN KEY (hotel_id) REFERENCES hotel_main (id)
);
-- Secondary indexes for the crawl tables.
-- NOTE(review): UNIQUE(hotel_id, url) on hotel_website_raw already creates a
-- b-tree index whose leading column is hotel_id, so idx_website_raw_hotel_id
-- may be redundant; it is kept so existing deployments stay unchanged.
CREATE INDEX IF NOT EXISTS idx_website_raw_hotel_id
    ON hotel_website_raw (hotel_id);
CREATE INDEX IF NOT EXISTS idx_website_raw_url
    ON hotel_website_raw (url);
CREATE INDEX IF NOT EXISTS idx_website_meta_status
    ON hotel_website_meta (crawl_status);
CREATE INDEX IF NOT EXISTS idx_website_processed_hotel_id
    ON hotel_website_processed (hotel_id);
-- PostgreSQL does not automatically index the referencing side of a foreign
-- key. Without this index, joins from raw pages to processed rows and
-- deletes/updates on hotel_website_raw must scan hotel_website_processed.
CREATE INDEX IF NOT EXISTS idx_website_processed_raw_page_id
    ON hotel_website_processed (raw_page_id);
-- Table descriptions stored in the database catalog.
COMMENT ON TABLE hotel_website_raw
    IS 'Сырой HTML со страниц сайтов отелей (исходники)';
COMMENT ON TABLE hotel_website_meta
    IS 'Метаинформация о краулинге сайтов';
COMMENT ON TABLE hotel_website_processed
    IS 'Обработанный текст для векторизации';