✨feature: First push to git
This commit is contained in:
@@ -0,0 +1,42 @@
|
||||
import * as cheerio from 'cheerio';
|
||||
import { ExtractedLink, ParserInput, ParserPlugin } from './types.js';
|
||||
|
||||
/**
 * Finds a label for the part of the document that `element` sits in.
 *
 * Lookup order:
 * 1. the first preceding sibling of `element` matching a heading (`h1`-`h6`),
 *    `strong` or `b`, as returned by `prevAll(...).first()`;
 * 2. otherwise the first preceding sibling of the element's parent matching a
 *    heading (`h1`-`h6`), `p` or `tr`.
 *
 * @param $ - Cheerio API bound to the document that contains `element`.
 * @param element - Node to start from. Typed as `any` because it is passed
 *   straight to `$()`, and the node type name differs between cheerio versions.
 * @returns The trimmed text of the first non-empty candidate, or `undefined`
 *   when both lookups yield empty text.
 */
function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined {
  // Wrap once and reuse the wrapper for both lookups.
  const node = $(element);

  const siblingLabel = node.prevAll('h1,h2,h3,h4,h5,h6,strong,b').first().text().trim();
  if (siblingLabel !== '') {
    return siblingLabel;
  }

  // Nothing usable next to the element itself: look beside its parent instead.
  const parentLabel = node.parent().prevAll('h1,h2,h3,h4,h5,h6,p,tr').first().text().trim();
  return parentLabel === '' ? undefined : parentLabel;
}
|
||||
|
||||
/**
 * Catch-all parser plugin: `matches` always returns `true`, and `parse`
 * extracts every `<a href>` found in the supplied HTML.
 */
export const genericParser: ParserPlugin = {
  name: 'generic',

  // Unconditionally accepts the input.
  matches: () => true,

  /**
   * Extracts one link record per `<a href>` element in `input.html`.
   *
   * For each anchor:
   * - `url` is the `href` value with surrounding whitespace removed; anchors
   *   whose `href` is empty or whitespace-only are skipped;
   * - `title` / `sourceText` is the anchor text, falling back to `aria-label`,
   *   then to `''` (whitespace collapsed in both cases);
   * - `context` is the collapsed text of the closest `p`, `li`, `td` or `div`;
   * - `description` is `context` unless it is empty or identical to `title`;
   * - `section` is whatever `nearestSection` reports for the anchor.
   *
   * @param input - Parser input; only `input.html` is read here.
   * @returns The extracted links, in the order `$('a[href]')` yields them.
   */
  parse(input: ParserInput): ExtractedLink[] {
    const $ = cheerio.load(input.html);

    // Collapse every whitespace run to a single space and trim the ends.
    const collapse = (text: string): string => text.replace(/\s+/g, ' ').trim();

    return $('a[href]')
      .toArray()
      .flatMap((element) => {
        const anchor = $(element);

        // href values may be padded with whitespace; strip it, and skip the
        // anchor before doing any further work when nothing remains.
        const url = (anchor.attr('href') ?? '').trim();
        if (url === '') {
          return [];
        }

        const title = collapse(anchor.text()) || collapse(anchor.attr('aria-label') ?? '');
        const context = collapse(anchor.closest('p,li,td,div').text());

        return [
          {
            url,
            title,
            // Avoid repeating the title when the container holds nothing else.
            description: context !== '' && context !== title ? context : '',
            sourceText: title,
            section: nearestSection($, element),
            context
          }
        ];
      });
  }
};
|
||||
Reference in New Issue
Block a user