import * as cheerio from 'cheerio';
import { ExtractedLink, ParserInput, ParserPlugin } from './types.js';

/**
 * Words treated as a sponsorship/advertising marker in text surrounding a link.
 * Deliberately has no `g`/`y` flag, so the shared instance carries no
 * `lastIndex` state between `split`, `match` and `test` calls.
 */
const SPONSOR_MARKER_PATTERN = /\b(?:sponsor|sponsored|advertisement|partner)\b/i;

/**
 * Finds a heading-like label for the section an element belongs to.
 *
 * Looks first at the element's own preceding siblings (headings, `strong`, `b`),
 * then at the preceding siblings of its parent (headings, `p`, `tr`).
 *
 * @param $ - Cheerio instance bound to the parsed document.
 * @param element - DOM node of the link being described.
 * @returns The trimmed text of the first match, or `undefined` when neither
 *   lookup yields non-empty text.
 */
// `any` is kept for DOM nodes: no node type is imported in this module.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined {
  const previous = $(element).prevAll('h1,h2,h3,h4,h5,h6,strong,b').first().text().trim();
  if (previous) {
    return previous;
  }

  const parentPrevious = $(element)
    .parent()
    .prevAll('h1,h2,h3,h4,h5,h6,p,tr')
    .first()
    .text()
    .trim();
  return parentPrevious || undefined;
}

/** A run of plain text inside a block, already whitespace-compacted. */
type TextToken = { type: 'text'; text: string };
/** A link (`<a>` with a non-empty `href`) inside a block. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
type AnchorToken = { type: 'anchor'; element: any; text: string };
type Token = TextToken | AnchorToken;

/** Collapses every whitespace run to a single space and trims both ends. */
function compactText(value: string): string {
  return value.replace(/\s+/g, ' ').trim();
}

/**
 * Returns the compacted part of `value` that precedes the first sponsor marker,
 * or all of `value` (compacted) when it contains no marker.
 */
function textBeforeSponsorMarker(value: string): string {
  return compactText(value.split(SPONSOR_MARKER_PATTERN)[0] ?? value);
}

/** Returns the first sponsor marker found in `value`, upper-cased, if any. */
function sponsorMarkerText(value: string): string | undefined {
  return value.match(SPONSOR_MARKER_PATTERN)?.[0].toUpperCase();
}

/**
 * Flattens a DOM subtree into a document-order list of text and anchor tokens.
 *
 * - Text nodes become `text` tokens (dropped when empty after compaction).
 * - `<a>` elements with a truthy `href` become a single `anchor` token and are
 *   not descended into.
 * - `<script>` and `<style>` elements are skipped entirely so their source
 *   code is never treated as prose around a link.
 * - Any other node contributes the tokens of its children.
 *
 * @param $ - Cheerio instance bound to the parsed document.
 * @param node - Root node to flatten; a falsy value yields an empty list.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function blockTokens($: cheerio.CheerioAPI, node: any): Token[] {
  if (!node) {
    return [];
  }

  if (node.type === 'text') {
    const text = compactText(node.data ?? '');
    return text ? [{ type: 'text', text }] : [];
  }

  // Checked by tag name so the skip does not depend on how the parser
  // classifies these elements in `node.type`.
  if (node.name === 'script' || node.name === 'style') {
    return [];
  }

  if (node.type === 'tag' && node.name === 'a' && $(node).attr('href')) {
    return [{ type: 'anchor', element: node, text: compactText($(node).text()) }];
  }

  return $(node)
    .contents()
    .toArray()
    .flatMap((child) => blockTokens($, child));
}

/**
 * Builds a short context string for a link from the text around it inside its
 * closest `p`, `li`, `td` or `div` ancestor.
 *
 * The result is, in order: an upper-cased sponsor marker if the nearest text
 * token before the link contains one, the link title, and the text that
 * follows the link up to the next link or the next sponsor marker, whichever
 * comes first.
 *
 * @param $ - Cheerio instance bound to the parsed document.
 * @param element - DOM node of the link.
 * @param title - Already-computed title of the link.
 * @returns The compacted context, or `title` unchanged when no enclosing block
 *   exists or the link cannot be located among the block's tokens.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function localContext($: cheerio.CheerioAPI, element: any, title: string): string {
  const block = $(element).closest('p,li,td,div').first();
  if (block.length === 0) {
    return title;
  }

  const tokens = blockTokens($, block.get(0));
  const anchorIndex = tokens.findIndex(
    (token) => token.type === 'anchor' && token.element === element
  );
  if (anchorIndex === -1) {
    return title;
  }

  const parts: string[] = [];

  // Only the closest text token before the link is inspected for a marker.
  const previousText = tokens
    .slice(0, anchorIndex)
    .reverse()
    .find((token): token is TextToken => token.type === 'text')?.text;
  const marker = previousText ? sponsorMarkerText(previousText) : undefined;
  if (marker) {
    parts.push(marker);
  }

  parts.push(title);

  for (const token of tokens.slice(anchorIndex + 1)) {
    if (token.type === 'anchor') {
      break;
    }

    const visible = textBeforeSponsorMarker(token.text);
    if (visible) {
      parts.push(visible);
    }

    // A marker ends this link's trailing text: text after it, including text
    // in later tokens, is not appended to this link's context.
    if (SPONSOR_MARKER_PATTERN.test(token.text)) {
      break;
    }
  }

  return compactText(parts.join(' '));
}

/**
 * Fallback parser that accepts any input and extracts every `<a href>` from
 * the HTML, attaching a title, nearby section label and local context.
 *
 * Links whose `href` is empty or whitespace-only are dropped.
 */
export const genericParser: ParserPlugin = {
  name: 'generic',
  matches: () => true,
  parse(input: ParserInput): ExtractedLink[] {
    const $ = cheerio.load(input.html);

    return $('a[href]')
      .toArray()
      .map((element) => {
        const anchor = $(element);
        // Fall back to the accessible label for links with no visible text
        // (e.g. image links); normalise it the same way as visible text.
        const title =
          compactText(anchor.text()) || compactText(anchor.attr('aria-label') ?? '');
        // Trim so a whitespace-only href is removed by the filter below.
        const url = (anchor.attr('href') ?? '').trim();
        const context = localContext($, element, title);

        return {
          url,
          title,
          description: context && context !== title ? context : '',
          sourceText: title,
          section: nearestSection($, element),
          context
        };
      })
      .filter((link) => Boolean(link.url));
  }
};