🚀 Full project sync: Hotels RAG & Audit System

✨ Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (Saint Petersburg, Oryol Oblast, Chukotka, Kamchatka)

📊 Region Progress:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- n8n workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
This commit is contained in:
Фёдор
2025-10-27 22:49:42 +03:00
parent 0cf3297290
commit 684fada337
94 changed files with 14891 additions and 911 deletions

962
db_schema_hotels.json Normal file
View File

@@ -0,0 +1,962 @@
{
"hotel_additional_info": [
{
"name": "hotel_id",
"type": "uuid",
"nullable": "NO",
"default": null
},
{
"name": "owner_ogrn",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "owner_inn",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "owner_kpp",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "owner_short_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "owner_phone",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "owner_email",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "resort_full_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "owner_address_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "owner_legal_type_id",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "phone",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "email",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "created_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
}
],
"hotel_audit_results": [
{
"name": "id",
"type": "integer",
"nullable": "NO",
"default": "nextval('hotel_audit_results_id_seq'::regclass)"
},
{
"name": "hotel_id",
"type": "uuid",
"nullable": "YES",
"default": null
},
{
"name": "region_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "hotel_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "website",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "has_website",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "criteria_results",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "total_score",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "max_score",
"type": "integer",
"nullable": "YES",
"default": "20"
},
{
"name": "score_percentage",
"type": "double precision",
"nullable": "YES",
"default": null
},
{
"name": "audit_date",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
},
{
"name": "audit_version",
"type": "text",
"nullable": "YES",
"default": null
}
],
"hotel_main": [
{
"name": "id",
"type": "uuid",
"nullable": "NO",
"default": null
},
{
"name": "full_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "short_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "status_id",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "status_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "category_id",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "category_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "region_id",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "region_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "hotel_type_id",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "hotel_type_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "register_record",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "register_record_date",
"type": "date",
"nullable": "YES",
"default": null
},
{
"name": "owner_full_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "owner_ogrn",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "owner_inn",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "phone",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "email",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "website_address",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "addresses",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "photo_ids",
"type": "ARRAY",
"nullable": "YES",
"default": null
},
{
"name": "has_seasonal",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "activation_datetime",
"type": "timestamp without time zone",
"nullable": "YES",
"default": null
},
{
"name": "updated",
"type": "timestamp without time zone",
"nullable": "YES",
"default": null
},
{
"name": "created_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
},
{
"name": "updated_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
},
{
"name": "website_status",
"type": "character varying",
"nullable": "YES",
"default": "'not_checked'::character varying"
},
{
"name": "rkn_registry_status",
"type": "character varying",
"nullable": "YES",
"default": null
},
{
"name": "rkn_registry_number",
"type": "character varying",
"nullable": "YES",
"default": null
},
{
"name": "rkn_registry_date",
"type": "character varying",
"nullable": "YES",
"default": null
},
{
"name": "rkn_checked_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": null
}
],
"hotel_parsing_progress": [
{
"name": "id",
"type": "integer",
"nullable": "NO",
"default": "nextval('hotel_parsing_progress_id_seq'::regclass)"
},
{
"name": "page_number",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "total_pages",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "processed_count",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "total_count",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "status",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "error_message",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "started_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": null
},
{
"name": "completed_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": null
},
{
"name": "created_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
}
],
"hotel_raw_json": [
{
"name": "hotel_id",
"type": "uuid",
"nullable": "NO",
"default": null
},
{
"name": "main_data",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "additional_info",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "sanatorium_data",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "drawer_data",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "created_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
}
],
"hotel_rooms": [
{
"name": "id",
"type": "integer",
"nullable": "NO",
"default": "nextval('hotel_rooms_id_seq'::regclass)"
},
{
"name": "hotel_id",
"type": "uuid",
"nullable": "YES",
"default": null
},
{
"name": "room_category_id",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "room_category_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "apartment_count",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "number_seats",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "equipment_list",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "family_room_count",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "disability_room_count",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "created_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
}
],
"hotel_sanatorium": [
{
"name": "hotel_id",
"type": "uuid",
"nullable": "NO",
"default": null
},
{
"name": "oid",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "full_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "short_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "ogrn",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "inn",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "legal_address",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "actual_address",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "phone",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "email",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "web_site",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "medical_license",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "farm_license",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "terrenkur",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "resort_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "has_water_supply",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "has_heating",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "has_sewage",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "has_air_conditioning",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "has_elevator",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "has_telephone",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "has_internet",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "has_mobility_lift",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "has_gym",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "has_conference_room",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "swimming_pool_info",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "plage_info",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "land_document_info",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "rooms_info",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "created_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
}
],
"hotel_services": [
{
"name": "id",
"type": "integer",
"nullable": "NO",
"default": "nextval('hotel_services_id_seq'::regclass)"
},
{
"name": "hotel_id",
"type": "uuid",
"nullable": "YES",
"default": null
},
{
"name": "service_category_id",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "service_category_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "service_id",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "service_name",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "created_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
}
],
"hotel_website_chunks": [
{
"name": "id",
"type": "uuid",
"nullable": "NO",
"default": "gen_random_uuid()"
},
{
"name": "text",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "metadata",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "embedding",
"type": "USER-DEFINED",
"nullable": "YES",
"default": null
}
],
"hotel_website_meta": [
{
"name": "hotel_id",
"type": "uuid",
"nullable": "NO",
"default": null
},
{
"name": "domain",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "main_url",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "pages_crawled",
"type": "integer",
"nullable": "YES",
"default": "0"
},
{
"name": "pages_failed",
"type": "integer",
"nullable": "YES",
"default": "0"
},
{
"name": "total_size_bytes",
"type": "bigint",
"nullable": "YES",
"default": "0"
},
{
"name": "internal_links_found",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "crawl_status",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "crawl_started_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": null
},
{
"name": "crawl_finished_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": null
},
{
"name": "error_message",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "created_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
},
{
"name": "updated_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
}
],
"hotel_website_processed": [
{
"name": "id",
"type": "integer",
"nullable": "NO",
"default": "nextval('hotel_website_processed_id_seq'::regclass)"
},
{
"name": "raw_page_id",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "hotel_id",
"type": "uuid",
"nullable": "YES",
"default": null
},
{
"name": "url",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "cleaned_text",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "extracted_data",
"type": "jsonb",
"nullable": "YES",
"default": null
},
{
"name": "has_forms",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "has_booking",
"type": "boolean",
"nullable": "YES",
"default": null
},
{
"name": "text_length",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "processed_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
}
],
"hotel_website_raw": [
{
"name": "id",
"type": "integer",
"nullable": "NO",
"default": "nextval('hotel_website_raw_id_seq'::regclass)"
},
{
"name": "hotel_id",
"type": "uuid",
"nullable": "YES",
"default": null
},
{
"name": "url",
"type": "text",
"nullable": "NO",
"default": null
},
{
"name": "page_title",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "html",
"type": "text",
"nullable": "YES",
"default": null
},
{
"name": "status_code",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "response_time_ms",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "depth",
"type": "integer",
"nullable": "YES",
"default": null
},
{
"name": "crawled_at",
"type": "timestamp without time zone",
"nullable": "YES",
"default": "CURRENT_TIMESTAMP"
},
{
"name": "last_modified",
"type": "timestamp without time zone",
"nullable": "YES",
"default": null
}
]
}