Files
Newsletter-Link-Catalog/src/parsing/generic.ts
T
2026-05-17 14:05:25 -05:00

114 lines
3.2 KiB
TypeScript

import * as cheerio from 'cheerio';
import { ExtractedLink, ParserInput, ParserPlugin } from './types.js';
/**
 * Looks up a heading-like label for an element.
 *
 * The element's own preceding siblings are searched first for a heading
 * (`h1`-`h6`) or bold text (`strong`, `b`). If that yields no text, the
 * preceding siblings of the element's parent are searched for a heading,
 * paragraph, or table row instead.
 *
 * @param $ - Cheerio instance the element belongs to.
 * @param element - Node whose section label is wanted.
 * @returns The trimmed text of the first match, or `undefined` when neither
 *   lookup produces non-empty text.
 */
function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined {
  const target = $(element);

  const siblingLabel = target.prevAll('h1,h2,h3,h4,h5,h6,strong,b').first().text().trim();
  if (siblingLabel !== '') {
    return siblingLabel;
  }

  // Nothing usable next to the element itself: step up one level and retry
  // with a selector that also accepts paragraphs and table rows.
  const parentLabel = target.parent().prevAll('h1,h2,h3,h4,h5,h6,p,tr').first().text().trim();
  return parentLabel === '' ? undefined : parentLabel;
}
// A run of text found inside a block (produced by blockTokens from text nodes).
type TextToken = { type: 'text'; text: string };
// A link found inside a block: the anchor node itself plus its visible text.
type AnchorToken = { type: 'anchor'; element: any; text: string };
// Flattened block content, discriminated by the `type` tag.
type Token = TextToken | AnchorToken;
/**
 * Normalises whitespace: every run of whitespace becomes a single space and
 * leading/trailing whitespace is removed.
 *
 * @param value - Raw text, possibly spanning several lines.
 * @returns The single-line, single-spaced text (empty string when `value`
 *   contains only whitespace).
 */
function compactText(value: string): string {
  // Splitting on whitespace runs leaves empty strings at the edges when the
  // input starts or ends with whitespace; dropping them is the "trim".
  const words = value.split(/\s+/).filter((word) => word !== '');
  return words.join(' ');
}
/**
 * Returns the part of `value` that precedes the first sponsor marker word
 * (`sponsor`, `sponsored`, `advertisement`, or `partner`, matched as whole
 * words, case-insensitively), with whitespace compacted.
 *
 * @param value - Text that may contain a sponsor marker.
 * @returns The compacted text before the first marker, or the whole compacted
 *   text when no marker is present.
 */
function textBeforeSponsorMarker(value: string): string {
  const [leading] = value.split(/\b(?:sponsor|sponsored|advertisement|partner)\b/i);
  return compactText(leading ?? value);
}
/**
 * Finds the first sponsor marker word in `value` (`sponsor`, `sponsored`,
 * `advertisement`, or `partner`, matched as whole words, case-insensitively).
 *
 * @param value - Text to scan.
 * @returns The matched word in upper case, or `undefined` when there is none.
 */
function sponsorMarkerText(value: string): string | undefined {
  const found = /\b(?:sponsor|sponsored|advertisement|partner)\b/i.exec(value);
  if (found === null) {
    return undefined;
  }
  return found[0].toUpperCase();
}
/**
 * Flattens a DOM subtree into an ordered list of text and anchor tokens.
 *
 * - A text node yields one text token holding its compacted text, or nothing
 *   when the compacted text is empty.
 * - An `<a>` tag with a non-empty `href` yields one anchor token and is not
 *   descended into.
 * - Any other node is replaced by the tokens of its children, in order.
 *
 * @param $ - Cheerio instance the node belongs to.
 * @param node - Root of the subtree to flatten; a falsy value yields `[]`.
 * @returns The tokens in the order the nodes are visited.
 */
function blockTokens($: cheerio.CheerioAPI, node: any): Token[] {
  if (!node) {
    return [];
  }

  if (node.type === 'text') {
    const text = compactText(node.data ?? '');
    if (!text) {
      return [];
    }
    const textToken: TextToken = { type: 'text', text };
    return [textToken];
  }

  if (node.type === 'tag' && node.name === 'a') {
    const href = $(node).attr('href');
    if (href) {
      const anchorToken: AnchorToken = {
        type: 'anchor',
        element: node,
        text: compactText($(node).text())
      };
      return [anchorToken];
    }
    // An anchor without a usable href is treated like any other container.
  }

  const collected: Token[] = [];
  for (const child of $(node).contents().toArray()) {
    for (const token of blockTokens($, child)) {
      collected.push(token);
    }
  }
  return collected;
}
/**
 * Builds a one-line context string for a link from the text surrounding it in
 * its closest enclosing `p`, `li`, `td`, or `div`.
 *
 * The result is made of, in order:
 *   1. an upper-cased sponsor marker, when the nearest text run before the
 *      link contains one;
 *   2. the link title;
 *   3. the text that follows the link, up to the next link or the next
 *      sponsor marker, whichever comes first.
 *
 * @param $ - Cheerio instance the element belongs to.
 * @param element - The anchor node to describe.
 * @param title - Display title already computed for the anchor.
 * @returns The whitespace-compacted context. `title` is returned unchanged
 *   when the anchor has no enclosing block or is not among the block's anchor
 *   tokens (for example, because its `href` is empty).
 */
function localContext($: cheerio.CheerioAPI, element: any, title: string): string {
  const block = $(element).closest('p,li,td,div').first();
  if (block.length === 0) {
    return title;
  }

  const tokens = blockTokens($, block.get(0));
  const anchorIndex = tokens.findIndex(
    (token) => token.type === 'anchor' && token.element === element
  );
  if (anchorIndex === -1) {
    return title;
  }

  const parts: string[] = [];

  // Only the closest text run before the link is inspected for a marker.
  const previousText = tokens
    .slice(0, anchorIndex)
    .reverse()
    .find((token): token is TextToken => token.type === 'text')?.text;
  const marker = previousText ? sponsorMarkerText(previousText) : undefined;
  if (marker) {
    parts.push(marker);
  }

  parts.push(title);

  for (const token of tokens.slice(anchorIndex + 1)) {
    if (token.type === 'anchor') {
      break;
    }
    parts.push(textBeforeSponsorMarker(token.text));
    // Text after a marker inside this token was just cut off above. Stop here
    // so text in *later* tokens is cut off too; otherwise a marker that sits
    // in its own inline element (e.g. <strong>Sponsor</strong>) would let the
    // text following it leak into this link's context.
    if (sponsorMarkerText(token.text) !== undefined) {
      break;
    }
  }

  return compactText(parts.join(' '));
}
/**
 * Parser plugin that accepts every input (`matches` always returns `true`)
 * and extracts one link record per `<a href>` element in the HTML.
 */
export const genericParser: ParserPlugin = {
  name: 'generic',
  matches: () => true,

  /**
   * Extracts every anchor with a non-empty `href` from `input.html`.
   *
   * For each link, `title` is the compacted anchor text, falling back to the
   * `aria-label` attribute and then to an empty string. `description` is the
   * local context when it adds something beyond the title, otherwise empty.
   *
   * @param input - Parser input; only its `html` field is read here.
   * @returns The extracted links in document order.
   */
  parse(input: ParserInput): ExtractedLink[] {
    const $ = cheerio.load(input.html);
    const links: ExtractedLink[] = [];

    for (const element of $('a[href]').toArray()) {
      const anchor = $(element);

      // `a[href]` also matches an empty href; such links are not reported.
      const url = anchor.attr('href') ?? '';
      if (!url) {
        continue;
      }

      const title = compactText(anchor.text()) || anchor.attr('aria-label') || '';
      const context = localContext($, element, title);
      const description = context && context !== title ? context : '';

      const link = {
        url,
        title,
        description,
        sourceText: title,
        section: nearestSection($, element),
        context
      };
      links.push(link);
    }

    return links;
  }
};