Save all currently accumulated repository changes as a backup snapshot for Gitea so no local work is lost.
106 lines
2.5 KiB
JavaScript
106 lines
2.5 KiB
JavaScript
// =====================
// Read the OCR result
// =====================
// JSON payload of the current input item (presumably the output of an upstream
// OCR step in an n8n workflow — confirm against the workflow definition).
const ocrResult = $input.item.json;

// =====================
// pages_data (normalized)
// =====================
// Anything that is not an array is treated as "no pages".
const pagesData = Array.isArray(ocrResult.pages_data) ? ocrResult.pages_data : [];

// =====================
// Collect text from the pages
// =====================
// Non-blank page texts, joined with an empty line between pages.
// Null entries and entries whose `ocr_text` is not a string contribute nothing
// instead of throwing.
let fullText = pagesData
  .map((page) => (typeof page?.ocr_text === 'string' ? page.ocr_text : ''))
  .filter((pageText) => pageText.trim() !== '')
  .join('\n\n');

// Fallback: some OCR engines return the final text in the `text` field.
// It is used whenever the pages yielded no text at all — including the case
// where pages_data has entries but none of them carries any text.
if (fullText === '' && ocrResult.text && String(ocrResult.text).trim() !== '') {
  fullText = String(ocrResult.text);
}

// =====================
// OCR text clean-up
// =====================
/**
 * Flattens raw OCR output into a single, tidy line of text.
 *
 * Processing order:
 *   1. every decorative symbol listed in the character class becomes a space;
 *   2. a run of two or more of `.` `,` `;` `:` is reduced to the first
 *      character of that run;
 *   3. every run of whitespace (newlines included) becomes one space;
 *   4. leading and trailing whitespace is removed.
 *
 * No separate per-line trimming or blank-line removal is needed: step 3 already
 * absorbs newlines and padding, so the result never contains a line break.
 *
 * @param {*} text Raw text; falsy values (`''`, `null`, `undefined`, `0`) yield `''`.
 *   Other non-string values are converted with `String()`.
 * @returns {string} The cleaned, single-line text.
 */
function cleanOCRText(text) {
  if (!text) return '';

  return String(text)
    // NOTE(review): the class contains the straight quotes " and ', so
    // apostrophes inside words are replaced by a space as well — confirm intent.
    .replace(/[©•»®™✓→←¤…›«""―''◆◇■□●○◎☆★☑☐⚫️×]/g, ' ')
    .replace(/[.,;:]{2,}/g, (run) => run[0])
    .replace(/\s+/g, ' ')
    .trim();
}

const cleanedText = cleanOCRText(fullText);

// =====================
// NSFW aggregation (from images_data)
// =====================
// Anything that is not an array is treated as "no images".
const imagesData = Array.isArray(ocrResult.images_data) ? ocrResult.images_data : [];

// Entries explicitly flagged with `nsfw === true`; any other value (truthy
// strings, 1, missing) does not count. Null entries are skipped.
const nsfwPagesCount = imagesData.filter((img) => img?.nsfw === true).length;

// The document is NSFW as soon as a single entry is flagged.
const documentNSFW = nsfwPagesCount > 0;

// Sum of all finite numeric scores. Values that convert to NaN or ±Infinity are
// ignored so that one malformed entry cannot turn the average into NaN/Infinity.
const nsfwScoreSum = imagesData.reduce((sum, img) => {
  const score = Number(img?.nsfw_score);
  return Number.isFinite(score) ? sum + score : sum;
}, 0);

// Page count: the first usable (non-zero, non-NaN) value among the reported
// `pages`, the number of image entries, and the number of page entries.
const pagesCount =
  Number(ocrResult.pages) ||
  imagesData.length ||
  pagesData.length ||
  0;

// NOTE(review): the sum is taken over images_data entries while the divisor is
// pagesCount, which may come from `pages`. If the two differ, this is not the
// mean of the summed scores — confirm that dividing by page count is intended.
const nsfwScoreAvg =
  pagesCount > 0
    ? nsfwScoreSum / pagesCount
    : 0;

// =====================
|
||
// pages_data → JSON для PostgreSQL
|
||
// =====================
|
||
const pagesDataJson = JSON.stringify(pagesData);
|
||
|
||
// =====================
|
||
// RETURN
|
||
// =====================
|
||
return [{
|
||
json: {
|
||
...ocrResult,
|
||
|
||
// тексты
|
||
original_text: fullText,
|
||
cleaned_text: cleanedText,
|
||
|
||
// страницы
|
||
page_count: pagesCount,
|
||
pages_data: pagesData,
|
||
pages_data_json: pagesDataJson,
|
||
|
||
// NSFW (ГЛАВНОЕ)
|
||
document_nsfw: documentNSFW,
|
||
nsfw_pages_count: nsfwPagesCount,
|
||
nsfw_score_avg: Number(nsfwScoreAvg.toFixed(6)),
|
||
}
|
||
}]; |