import { sanitizeHtml, stripInvisibleUnicode } from "./web-fetch-visibility.js";

/** Output format selector: either `"markdown"` or `"text"`. */
export type ExtractMode = "markdown" | "text";

// Size/depth limits for the Readability path. They are only declared in this
// part of the file; NOTE(review): presumably checked before parsing — confirm
// against the code that reads them.
const READABILITY_MAX_HTML_CHARS = 1_000_000;
const READABILITY_MAX_ESTIMATED_NESTING_DEPTH = 3_000;

/**
 * Cached in-flight/settled promise for the lazily imported dependencies.
 * `undefined` until the first load, and reset to `undefined` after a failed
 * load so a later call can retry.
 */
let readabilityDepsPromise: ReturnType<typeof loadReadabilityDeps> | undefined;

/**
 * Lazily imports `@mozilla/readability` and `linkedom` (both in parallel) and
 * memoizes the result in `readabilityDepsPromise`.
 *
 * @returns The `Readability` constructor and linkedom's `parseHTML` function.
 * @throws Re-throws the import error after clearing the cache, so a failed
 *   import is not memoized.
 */
async function loadReadabilityDeps(): Promise<{
  Readability: typeof import("@mozilla/readability").Readability;
  parseHTML: typeof import("linkedom").parseHTML;
}> {
  if (!readabilityDepsPromise) {
    readabilityDepsPromise = Promise.all([import("@mozilla/readability"), import("linkedom")]).then(
      ([readability, linkedom]) => ({
        Readability: readability.Readability,
        parseHTML: linkedom.parseHTML,
      }),
    );
  }
  try {
    return await readabilityDepsPromise;
  } catch (error) {
    // Drop the rejected promise so the next call starts a fresh import.
    readabilityDepsPromise = undefined;
    throw error;
  }
}

/**
 * Named entities understood by `decodeEntities`, keyed by lower-case name.
 * A `Map` is used instead of an object literal so names such as
 * `constructor` cannot resolve to inherited properties.
 */
const NAMED_ENTITY_CHARS: ReadonlyMap<string, string> = new Map([
  ["nbsp", " "], // decoded to a regular space
  ["amp", "&"],
  ["quot", '"'],
  ["apos", "'"],
  ["lt", "<"],
  ["gt", ">"],
]);

/** Matches `&#xHEX;`, `&#DEC;` and `&name;` character references. */
const ENTITY_PATTERN = /&(?:#x([0-9a-f]+)|#(\d+)|([a-z][a-z0-9]*));/gi;

/** Largest valid Unicode code point; `String.fromCodePoint` throws above it. */
const MAX_CODE_POINT = 0x10ffff;

/**
 * Decodes a small set of HTML character references in `value`.
 *
 * Handles the named entities in `NAMED_ENTITY_CHARS` (case-insensitively) and
 * hexadecimal/decimal numeric references. Everything is decoded in a single
 * pass, so the output of one replacement is never decoded again
 * (`&amp;lt;` becomes `&lt;`, not `<`).
 *
 * @param value Text that may contain character references.
 * @returns The text with recognised references replaced. Unknown names and
 *   numeric references outside the Unicode range are left untouched.
 */
function decodeEntities(value: string): string {
  return value.replace(
    ENTITY_PATTERN,
    (match: string, hex?: string, dec?: string, name?: string): string => {
      if (name !== undefined) {
        return NAMED_ENTITY_CHARS.get(name.toLowerCase()) ?? match;
      }
      const codePoint =
        hex !== undefined ? Number.parseInt(hex, 16) : Number.parseInt(dec ?? "", 10);
      // Guard the range: fromCodePoint throws a RangeError for invalid input,
      // and untrusted HTML must not be able to crash the decoder.
      if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > MAX_CODE_POINT) {
        return match;
      }
      // fromCodePoint (unlike fromCharCode) handles code points above U+FFFF.
      return String.fromCodePoint(codePoint);
    },
  );
}

/**
 * Removes anything that looks like an HTML tag (`<` … `>`) from `value`, then
 * decodes character references in what remains.
 */
function stripTags(value: string): string {
  return decodeEntities(value.replace(/<[^>]+>/g, ""));
}

/**
 * Normalizes whitespace: drops carriage returns, strips spaces/tabs before a
 * newline, collapses 3+ consecutive newlines to a blank line, collapses runs
 * of 2+ spaces/tabs to a single space, and trims both ends.
 */
function normalizeWhitespace(value: string): string {
  return value
    .replace(/\r/g, "")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .replace(/[ \t]{2,}/g, " ")
    .trim();
}

export function htmlToMarkdown(html: string): { text: string; title?: string } {
  const titleMatch = html.match(/