114 lines
3.2 KiB
TypeScript
114 lines
3.2 KiB
TypeScript
import * as cheerio from 'cheerio';
|
|
import { ExtractedLink, ParserInput, ParserPlugin } from './types.js';
|
|
|
|
/**
 * Looks up a label for the section that `element` appears in.
 *
 * The search has two stages:
 *  1. Among the element's own preceding siblings, take the first
 *     heading / `strong` / `b` match reported by `prevAll` and use its
 *     trimmed text if it is non-empty.
 *  2. Otherwise, repeat the lookup on the preceding siblings of the element's
 *     parent, this time accepting headings, `p` and `tr`.
 *
 * @param $ Cheerio instance bound to the document that contains `element`.
 * @param element Node whose surrounding section label is wanted.
 * @returns The trimmed label text, or `undefined` when both stages yield an
 *          empty string.
 */
function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined {
  const target = $(element);

  const siblingLabel = target
    .prevAll('h1,h2,h3,h4,h5,h6,strong,b')
    .first()
    .text()
    .trim();
  if (siblingLabel !== '') {
    return siblingLabel;
  }

  // Nothing usable next to the element itself: widen the search by one level.
  const parentLabel = target
    .parent()
    .prevAll('h1,h2,h3,h4,h5,h6,p,tr')
    .first()
    .text()
    .trim();
  return parentLabel === '' ? undefined : parentLabel;
}
|
|
|
|
/** A piece of plain text found while flattening a block. */
interface TextToken {
  type: 'text';
  text: string;
}

/**
 * A link found while flattening a block. `element` is the node the token was
 * created from and `text` is the link's visible text.
 */
interface AnchorToken {
  type: 'anchor';
  element: any;
  text: string;
}

/** Everything `blockTokens` can emit, discriminated on `type`. */
type Token = TextToken | AnchorToken;
|
|
|
|
/**
 * Normalises whitespace: every run of whitespace becomes a single space and
 * leading/trailing whitespace is dropped.
 *
 * @param value Text to normalise.
 * @returns The non-whitespace chunks of `value` joined by single spaces
 *          (an empty string when `value` contains no such chunk).
 */
function compactText(value: string): string {
  // Splitting on whitespace runs leaves an empty chunk at either end when the
  // input starts or ends with whitespace; those are discarded before joining.
  const chunks = value.split(/\s+/).filter((chunk) => chunk.length > 0);
  return chunks.join(' ');
}
|
|
|
|
/**
 * Returns the part of `value` that precedes the first sponsor marker word
 * (`sponsor`, `sponsored`, `advertisement` or `partner`, matched
 * case-insensitively as a whole word), with whitespace compacted.
 *
 * @param value Text that may contain a sponsor marker.
 * @returns Compacted text before the first marker, or the whole compacted
 *          text when no marker is present.
 */
function textBeforeSponsorMarker(value: string): string {
  const markerStart = value.search(/\b(?:sponsor|sponsored|advertisement|partner)\b/i);
  const leading = markerStart === -1 ? value : value.slice(0, markerStart);
  return compactText(leading);
}
|
|
|
|
/**
 * Finds the first sponsor marker word in `value` (`sponsor`, `sponsored`,
 * `advertisement` or `partner`, matched case-insensitively as a whole word).
 *
 * @param value Text to inspect.
 * @returns The matched word in upper case, or `undefined` when there is no
 *          marker.
 */
function sponsorMarkerText(value: string): string | undefined {
  const found = /\b(?:sponsor|sponsored|advertisement|partner)\b/i.exec(value);
  if (found === null) {
    return undefined;
  }
  return found[0].toUpperCase();
}
|
|
|
|
/**
 * Flattens `node` and its descendants into a list of text and anchor tokens.
 *
 * - A missing node yields no tokens.
 * - A text node yields one text token holding its compacted data, or nothing
 *   when the compacted data is empty.
 * - An `a` tag with a truthy `href` yields one anchor token carrying the node
 *   and its compacted text; it is not descended into.
 * - Any other node is replaced by the tokens of its `contents()`, in order.
 *
 * NOTE(review): because every non-anchor node is descended into, text held by
 * children such as `script` or `style` would presumably surface as text
 * tokens too — confirm whether that is acceptable for callers.
 *
 * @param $ Cheerio instance bound to the document that contains `node`.
 * @param node Root of the subtree to flatten.
 * @returns Tokens in traversal order.
 */
function blockTokens($: cheerio.CheerioAPI, node: any): Token[] {
  if (!node) {
    return [];
  }

  if (node.type === 'text') {
    const text = compactText(node.data ?? '');
    return text === '' ? [] : [{ type: 'text', text }];
  }

  const isLinkedAnchor =
    node.type === 'tag' && node.name === 'a' && Boolean($(node).attr('href'));
  if (isLinkedAnchor) {
    const label = compactText($(node).text());
    return [{ type: 'anchor', element: node, text: label }];
  }

  // Container (or anchor without href): gather the tokens of each child.
  const collected: Token[] = [];
  for (const child of $(node).contents().toArray()) {
    for (const token of blockTokens($, child)) {
      collected.push(token);
    }
  }
  return collected;
}
|
|
|
|
/**
 * Builds a short context string for a link from the text around it.
 *
 * The closest `p`, `li`, `td` or `div` around `element` is flattened with
 * `blockTokens`. The result is assembled from, in order:
 *  1. the upper-cased sponsor marker found in the nearest text token before
 *     the link, if that token contains one;
 *  2. `title`;
 *  3. each token after the link up to (not including) the next anchor token,
 *     truncated at its own sponsor marker.
 *
 * When no enclosing block exists, or the link cannot be located among the
 * block's tokens, `title` is returned unchanged.
 *
 * NOTE(review): a trailing token that contains a sponsor marker is truncated,
 * but the tokens after it are still appended. Confirm whether collection
 * should stop at the first marker instead.
 *
 * @param $ Cheerio instance bound to the document that contains `element`.
 * @param element The anchor node the context is built for.
 * @param title The link title to embed in the context.
 * @returns The whitespace-compacted context string, or `title` as a fallback.
 */
function localContext($: cheerio.CheerioAPI, element: any, title: string): string {
  const container = $(element).closest('p,li,td,div').first();
  if (container.length === 0) {
    return title;
  }

  const tokens = blockTokens($, container.get(0));
  const position = tokens.findIndex(
    (candidate) => candidate.type === 'anchor' && candidate.element === element
  );
  if (position === -1) {
    return title;
  }

  // Scan backwards from the link for the closest text token.
  let precedingText: string | undefined;
  for (let index = position - 1; index >= 0; index -= 1) {
    const candidate = tokens[index];
    if (candidate !== undefined && candidate.type === 'text') {
      precedingText = candidate.text;
      break;
    }
  }
  const marker = precedingText ? sponsorMarkerText(precedingText) : undefined;

  // Everything after the link, cut off where the next link begins.
  const following = tokens.slice(position + 1);
  const nextAnchorOffset = following.findIndex((candidate) => candidate.type === 'anchor');
  const trailing = nextAnchorOffset === -1 ? following : following.slice(0, nextAnchorOffset);

  const pieces: string[] = [
    ...(marker ? [marker] : []),
    title,
    ...trailing.map((candidate) => textBeforeSponsorMarker(candidate.text)),
  ];
  return compactText(pieces.join(' '));
}
|
|
|
|
/**
 * Catch-all parser plugin: `matches` always returns `true`, and `parse`
 * extracts every `a[href]` element from the input HTML.
 *
 * For each link:
 *  - `title` is the compacted link text, falling back to the `aria-label`
 *    attribute and then to an empty string;
 *  - `url` is the raw `href` attribute;
 *  - `context` comes from `localContext`, and `description` repeats it only
 *    when it is non-empty and differs from `title`;
 *  - `sourceText` is `title`;
 *  - `section` comes from `nearestSection`.
 *
 * Links whose `url` is empty are left out of the result.
 */
export const genericParser: ParserPlugin = {
  name: 'generic',
  matches: () => true,
  parse(input: ParserInput): ExtractedLink[] {
    const $ = cheerio.load(input.html);
    const links: ExtractedLink[] = [];

    for (const element of $('a[href]').toArray()) {
      const anchor = $(element);
      const title = compactText(anchor.text()) || anchor.attr('aria-label') || '';
      const url = anchor.attr('href') ?? '';
      const context = localContext($, element, title);
      const description = context !== '' && context !== title ? context : '';
      const section = nearestSection($, element);

      const link = {
        url,
        title,
        description,
        sourceText: title,
        section,
        context
      };

      if (url) {
        links.push(link);
      }
    }

    return links;
  }
};
|