2026-02-23 00:10:26 +01:00
|
|
|
import { sanitizeHtml, stripInvisibleUnicode } from "./web-fetch-visibility.js";
|
|
|
|
|
|
2026-01-18 01:42:40 +00:00
|
|
|
/**
 * Output flavour requested from the extraction helpers in this module:
 * `"markdown"` keeps the markdown-style rendering, `"text"` asks for plain text.
 */
export type ExtractMode = "markdown" | "text";
|
|
|
|
|
|
2026-02-16 01:19:04 +01:00
|
|
|
/**
 * Upper bound on sanitized HTML length (in UTF-16 code units) for which
 * `extractReadableContent` still attempts Readability parsing; longer
 * documents go straight to the regex-based fallback.
 */
const READABILITY_MAX_HTML_CHARS = 1_000_000;
|
|
|
|
|
/**
 * Estimated open-element depth above which `extractReadableContent` skips
 * Readability parsing and uses the regex-based fallback instead. The estimate
 * comes from `exceedsEstimatedHtmlNestingDepth`, not from a real HTML parser.
 */
const READABILITY_MAX_ESTIMATED_NESTING_DEPTH = 3_000;
|
|
|
|
|
|
2026-02-14 01:29:40 +00:00
|
|
|
/** The two lazily imported entry points needed for Readability-based extraction. */
type ReadabilityDeps = {
  Readability: typeof import("@mozilla/readability").Readability;
  parseHTML: typeof import("linkedom").parseHTML;
};

/**
 * Shared dynamic-import promise. `undefined` until the first load is requested,
 * and reset to `undefined` again whenever a load attempt rejects.
 */
let readabilityDepsPromise: Promise<ReadabilityDeps> | undefined;

/**
 * Resolves the Readability constructor and linkedom's `parseHTML`, importing
 * both modules on first use and reusing the same promise afterwards.
 *
 * @returns The `Readability` class and the `parseHTML` function.
 * @throws Re-throws the import error; the cached promise is cleared first so a
 *   later call starts a fresh import instead of replaying the failure.
 */
async function loadReadabilityDeps(): Promise<ReadabilityDeps> {
  if (readabilityDepsPromise === undefined) {
    // Kick off both imports in parallel; the closure starts running immediately.
    readabilityDepsPromise = (async (): Promise<ReadabilityDeps> => {
      const [readability, linkedom] = await Promise.all([
        import("@mozilla/readability"),
        import("linkedom"),
      ]);
      return {
        Readability: readability.Readability,
        parseHTML: linkedom.parseHTML,
      };
    })();
  }

  try {
    return await readabilityDepsPromise;
  } catch (error) {
    // Drop the rejected promise so the next caller retries the import.
    readabilityDepsPromise = undefined;
    throw error;
  }
}
|
|
|
|
|
|
2026-01-18 01:42:40 +00:00
|
|
|
/**
 * Decodes the small set of HTML character references this module cares about:
 * `&nbsp;`, `&amp;`, `&quot;`, `&lt;`, `&gt;` (case-insensitive) and numeric
 * references in decimal (`&#39;`) or hexadecimal (`&#x27;`) form.
 *
 * Decoding happens in a single pass, so text produced by one replacement is
 * never decoded a second time (`&amp;lt;` becomes `&lt;`, not `<`).
 *
 * @param value Text that may contain character references.
 * @returns The text with recognised references replaced. Unrecognised
 *   references and numeric references outside the Unicode range are left as-is.
 */
function decodeEntities(value: string): string {
  const namedEntities: Record<string, string> = {
    nbsp: " ",
    amp: "&",
    quot: '"',
    lt: "<",
    gt: ">",
  };

  return value.replace(
    /&(?:(nbsp|amp|quot|lt|gt)|#x([0-9a-f]+)|#(\d+));/gi,
    (match: string, name?: string, hex?: string, dec?: string): string => {
      if (name !== undefined) {
        return namedEntities[name.toLowerCase()] ?? match;
      }
      const codePoint =
        hex !== undefined ? Number.parseInt(hex, 16) : Number.parseInt(dec ?? "", 10);
      // String.fromCodePoint throws a RangeError outside 0..0x10FFFF, so guard first.
      if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
        return match;
      }
      // fromCodePoint (unlike fromCharCode) handles astral code points such as emoji.
      return String.fromCodePoint(codePoint);
    },
  );
}
|
|
|
|
|
|
|
|
|
|
/**
 * Removes every `<...>` span from the input and then decodes the character
 * references that remain in the text.
 *
 * @param value HTML fragment.
 * @returns The fragment without tags, passed through `decodeEntities`.
 */
function stripTags(value: string): string {
  const withoutTags = value.replace(/<[^>]+>/g, "");
  return decodeEntities(withoutTags);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Tidies whitespace in extracted text.
 *
 * Steps, in order: line endings are normalised to `\n` (both `\r\n` and a lone
 * `\r` count as one line break), trailing spaces/tabs before a newline are
 * removed, runs of three or more newlines collapse to one blank line, runs of
 * two or more spaces/tabs collapse to a single space, and the result is trimmed.
 *
 * @param value Text to normalise.
 * @returns The normalised text.
 */
function normalizeWhitespace(value: string): string {
  return value
    // A lone CR is a line break too; deleting it would glue two lines together.
    .replace(/\r\n?/g, "\n")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .replace(/[ \t]{2,}/g, " ")
    .trim();
}
|
|
|
|
|
|
2026-01-18 19:24:16 -05:00
|
|
|
/**
 * Regex-based, best-effort conversion of an HTML document to markdown-like text.
 * This is not an HTML parser.
 *
 * Processing order:
 * 1. The first `<title>` element is captured as the title.
 * 2. `<script>`, `<style>` and `<noscript>` elements are dropped with their contents.
 * 3. `<a href>` becomes `[label](href)`, or the bare href when the label is empty.
 * 4. `<h1>`..`<h6>` become `#`-prefixed lines; `<li>` becomes a `- ` bullet.
 * 5. `<br>`/`<hr>` (with or without attributes) and the closing tags of common
 *    block elements become line breaks.
 * 6. Remaining tags are stripped, entities decoded, whitespace normalised.
 *
 * @param html HTML source.
 * @returns The rendered text plus the title, or `title: undefined` when no
 *   `<title>` element was found.
 */
export function htmlToMarkdown(html: string): { text: string; title?: string } {
  const titleBody = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1];
  const title = titleBody !== undefined ? normalizeWhitespace(stripTags(titleBody)) : undefined;

  let text = html
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, "");

  text = text.replace(
    /<a\s+[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi,
    (_match: string, href: string, body: string): string => {
      const label = normalizeWhitespace(stripTags(body));
      if (!label) {
        return href;
      }
      return `[${label}](${href})`;
    },
  );

  text = text.replace(
    /<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi,
    (_match: string, level: string, body: string): string => {
      const prefix = "#".repeat(Math.max(1, Math.min(6, Number.parseInt(level, 10))));
      const label = normalizeWhitespace(stripTags(body));
      return `\n${prefix} ${label}\n`;
    },
  );

  text = text.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_match: string, body: string): string => {
    const label = normalizeWhitespace(stripTags(body));
    return label ? `\n- ${label}` : "";
  });

  text = text
    // `\b[^>]*` also matches tags carrying attributes, e.g. <br class="x">; without
    // it those tags would be stripped later without leaving a line break.
    .replace(/<(?:br|hr)\b[^>]*>/gi, "\n")
    // Closing tags may legally contain whitespace before ">".
    .replace(/<\/(?:p|div|section|article|header|footer|table|tr|ul|ol)\s*>/gi, "\n");

  text = stripTags(text);
  text = normalizeWhitespace(text);
  return { text, title };
}
|
|
|
|
|
|
|
|
|
|
/**
 * Best-effort reduction of markdown to plain text.
 *
 * Images are removed, links are replaced by their label, fenced code blocks
 * and inline code keep their content but lose the backticks, and heading
 * markers and bullet/ordered list markers at the start of a line are removed.
 * The result is passed through `normalizeWhitespace`.
 *
 * @param markdown Markdown source.
 * @returns Plain text.
 */
export function markdownToText(markdown: string): string {
  let text = markdown;
  // Images first, so the link rule below does not turn them into "!alt".
  text = text.replace(/!\[[^\]]*]\([^)]+\)/g, "");
  text = text.replace(/\[([^\]]+)]\([^)]+\)/g, "$1");
  text = text.replace(/```[\s\S]*?```/g, (block) =>
    block.replace(/```[^\n]*\n?/g, "").replace(/```/g, ""),
  );
  text = text.replace(/`([^`]+)`/g, "$1");
  text = text.replace(/^#{1,6}\s+/gm, "");
  // Leading indentation is matched with [ \t]* rather than \s*: \s also matches
  // newlines, which made the pattern swallow the blank line before a list item.
  text = text.replace(/^[ \t]*[-*+]\s+/gm, "");
  text = text.replace(/^[ \t]*\d+\.\s+/gm, "");
  return normalizeWhitespace(text);
}
|
|
|
|
|
|
2026-01-18 02:19:35 +00:00
|
|
|
/**
 * Cuts `value` down to at most `maxChars` UTF-16 code units.
 *
 * @param value Text to truncate.
 * @param maxChars Maximum length of the returned text. Values that are
 *   negative or NaN are treated as 0.
 * @returns The (possibly shortened) text and whether anything was cut. The cut
 *   never lands between the two halves of a surrogate pair; in that case the
 *   result is one code unit shorter than `maxChars`.
 */
export function truncateText(
  value: string,
  maxChars: number,
): { text: string; truncated: boolean } {
  if (value.length <= maxChars) {
    return { text: value, truncated: false };
  }

  // A negative limit passed straight to slice() would count from the end of the string.
  let end = maxChars > 0 ? Math.floor(maxChars) : 0;

  if (end > 0 && end < value.length) {
    const last = value.charCodeAt(end - 1);
    const following = value.charCodeAt(end);
    const splitsPair =
      last >= 0xd800 && last <= 0xdbff && following >= 0xdc00 && following <= 0xdfff;
    if (splitsPair) {
      // Drop the dangling high surrogate rather than emit half a character.
      end -= 1;
    }
  }

  return { text: value.slice(0, end), truncated: true };
}
|
|
|
|
|
|
2026-02-16 01:19:04 +01:00
|
|
|
/** Elements that never take a closing tag and therefore never add nesting depth. */
const HTML_VOID_TAGS: ReadonlySet<string> = new Set([
  "area",
  "base",
  "br",
  "col",
  "embed",
  "hr",
  "img",
  "input",
  "link",
  "meta",
  "param",
  "source",
  "track",
  "wbr",
]);

/**
 * Cheap heuristic to skip Readability+DOM parsing on pathological HTML
 * (deep nesting => stack/memory blowups). Not an HTML parser; tuned to catch
 * attacker-controlled "<div><div>..." cases.
 *
 * The scan walks the string once, looking only at `<`: opening tags raise a
 * running depth, closing tags lower it (never below zero). `<!...>` and
 * `<?...>` constructs, void elements and tags that end in `/>` are ignored.
 *
 * NOTE(review): HTML parsers ignore the trailing slash on non-void HTML
 * elements, so a chain of `<div/>` may still nest in the real parser while
 * being skipped here — confirm whether that gap matters for the DOM library in use.
 *
 * @param html HTML source to inspect.
 * @param maxDepth Depth limit.
 * @returns `true` as soon as the running depth exceeds `maxDepth`, else `false`.
 */
function exceedsEstimatedHtmlNestingDepth(html: string, maxDepth: number): boolean {
  let depth = 0;
  const len = html.length;

  for (let i = 0; i < len; i++) {
    if (html.charCodeAt(i) !== 60) {
      continue; // not '<'
    }
    const next = html.charCodeAt(i + 1);
    if (next === 33 || next === 63) {
      continue; // <! ...> or <? ...>
    }

    let j = i + 1;
    let closing = false;
    if (html.charCodeAt(j) === 47) {
      // '/' right after '<' marks a closing tag.
      closing = true;
      j += 1;
    }

    // Skip whitespace/control characters before the tag name.
    while (j < len && html.charCodeAt(j) <= 32) {
      j += 1;
    }

    const nameStart = j;
    while (j < len) {
      const c = html.charCodeAt(j);
      const isNameChar =
        (c >= 65 && c <= 90) || // A-Z
        (c >= 97 && c <= 122) || // a-z
        (c >= 48 && c <= 57) || // 0-9
        c === 58 || // :
        c === 45; // -
      if (!isNameChar) {
        break;
      }
      j += 1;
    }

    const tagName = html.slice(nameStart, j).toLowerCase();
    if (!tagName) {
      continue; // a bare '<' with no name after it
    }

    if (closing) {
      depth = Math.max(0, depth - 1);
      continue;
    }

    if (HTML_VOID_TAGS.has(tagName)) {
      continue;
    }

    // Best-effort self-closing detection: scan a short window for "/>".
    let selfClosing = false;
    for (let k = j; k < len && k < j + 200; k++) {
      const c = html.charCodeAt(k);
      if (c === 62) {
        if (html.charCodeAt(k - 1) === 47) {
          selfClosing = true;
        }
        break;
      }
    }
    if (selfClosing) {
      continue;
    }

    depth += 1;
    if (depth > maxDepth) {
      return true;
    }
  }

  return false;
}
|
|
|
|
|
|
2026-01-18 01:42:40 +00:00
|
|
|
/**
 * Extracts the main content of an HTML page as markdown or plain text.
 *
 * The HTML is first passed through `sanitizeHtml`. Readability (on a linkedom
 * document) is then tried, and the regex-based `htmlToMarkdown` rendering of
 * the sanitized HTML is used as a fallback whenever:
 * - the sanitized HTML is longer than `READABILITY_MAX_HTML_CHARS`,
 * - its estimated nesting depth exceeds `READABILITY_MAX_ESTIMATED_NESTING_DEPTH`,
 * - Readability yields no content (or, in text mode, no text), or
 * - anything in the Readability path throws.
 *
 * Every returned text is passed through `stripInvisibleUnicode`.
 *
 * @param params.html Raw HTML of the page.
 * @param params.url Page URL; assigned to the document's `baseURI` on a best-effort basis.
 * @param params.extractMode `"markdown"` or `"text"`.
 * @returns The extracted text and, when available, a title.
 */
export async function extractReadableContent(params: {
  html: string;
  url: string;
  extractMode: ExtractMode;
}): Promise<{ text: string; title?: string } | null> {
  const cleanHtml = await sanitizeHtml(params.html);

  // Regex-only rendering of the sanitized HTML; needs no DOM.
  const fallback = (): { text: string; title?: string } => {
    const rendered = htmlToMarkdown(cleanHtml);
    if (params.extractMode !== "text") {
      return { text: stripInvisibleUnicode(rendered.text), title: rendered.title };
    }
    const fromMarkdown = stripInvisibleUnicode(markdownToText(rendered.text));
    // If the markdown route produced nothing, fall back to a plain tag strip.
    const text = fromMarkdown || stripInvisibleUnicode(normalizeWhitespace(stripTags(cleanHtml)));
    return { text, title: rendered.title };
  };

  const tooLarge = cleanHtml.length > READABILITY_MAX_HTML_CHARS;
  if (
    tooLarge ||
    exceedsEstimatedHtmlNestingDepth(cleanHtml, READABILITY_MAX_ESTIMATED_NESTING_DEPTH)
  ) {
    return fallback();
  }

  try {
    const { Readability, parseHTML } = await loadReadabilityDeps();
    const { document } = parseHTML(cleanHtml);
    try {
      (document as { baseURI?: string }).baseURI = params.url;
    } catch {
      // Best-effort base URI for relative links.
    }

    const parsed = new Readability(document, { charThreshold: 0 }).parse();
    if (!parsed?.content) {
      return fallback();
    }

    const title = parsed.title || undefined;
    if (params.extractMode === "text") {
      const text = stripInvisibleUnicode(normalizeWhitespace(parsed.textContent ?? ""));
      if (!text) {
        return fallback();
      }
      return { text, title };
    }

    const rendered = htmlToMarkdown(parsed.content);
    return { text: stripInvisibleUnicode(rendered.text), title: title ?? rendered.title };
  } catch {
    return fallback();
  }
}
|