feature: Enhance output options with Google Sheets integration and improve Excel writer functionality

This commit is contained in:
Keith Solomon
2026-05-17 12:05:42 -05:00
parent 379526114c
commit a7cdcf95ae
10 changed files with 375 additions and 14 deletions
+65 -2
View File
@@ -15,6 +15,69 @@ function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined
return parentPrevious || undefined;
}
/** A run of visible text found while walking a block; `text` is already whitespace-compacted by the producer. */
type TextToken = { type: 'text'; text: string };
/** A link found while walking a block; `element` is the underlying DOM node so callers can match by identity. */
type AnchorToken = { type: 'anchor'; element: any; text: string };
/** Discriminated union (on `type`) of the pieces a block is flattened into. */
type Token = TextToken | AnchorToken;
/**
 * Normalizes whitespace in a string.
 *
 * Every run of whitespace characters is collapsed to a single space, then
 * leading and trailing whitespace is removed.
 *
 * @param value - Raw text, possibly containing newlines, tabs or repeated spaces.
 * @returns The single-spaced, trimmed text.
 */
function compactText(value: string): string {
  const singleSpaced = value.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
/**
 * Returns the portion of `value` that precedes the first sponsor marker word
 * (sponsor, sponsored, advertisement or partner, matched case-insensitively
 * as whole words), with whitespace compacted.
 *
 * When no marker is present the whole input is returned, compacted.
 *
 * @param value - Text that may contain a sponsor marker.
 * @returns The compacted text ahead of the first marker.
 */
function textBeforeSponsorMarker(value: string): string {
  const markerPattern = /\b(?:sponsor|sponsored|advertisement|partner)\b/i;
  // The first piece of the split is everything ahead of the first marker;
  // the default only applies if that piece is undefined.
  const [leading = value] = value.split(markerPattern);
  return compactText(leading);
}
/**
 * Finds the first sponsor marker word in `value` (sponsor, sponsored,
 * advertisement or partner, matched case-insensitively as whole words).
 *
 * @param value - Text to scan.
 * @returns The matched word in upper case, or `undefined` when none is present.
 */
function sponsorMarkerText(value: string): string | undefined {
  const found = /\b(?:sponsor|sponsored|advertisement|partner)\b/i.exec(value);
  if (found === null) {
    return undefined;
  }
  return found[0].toUpperCase();
}
/**
 * Flattens a DOM subtree into an ordered list of text and anchor tokens.
 *
 * - A text node becomes one text token holding its whitespace-compacted
 *   content; text that is empty after compaction is dropped.
 * - An `<a>` tag with a non-empty `href` becomes one anchor token; its
 *   descendants are not walked separately.
 * - Any other node is expanded by recursing into its child nodes in
 *   document order.
 *
 * @param $ - Cheerio instance used to wrap nodes.
 * @param node - DOM node to flatten. A missing node (`null`/`undefined`)
 *   yields an empty list instead of throwing.
 * @returns Tokens in document order.
 */
function blockTokens($: cheerio.CheerioAPI, node: any): Token[] {
  // `.get(0)` on an empty selection returns undefined; without this guard
  // the property reads below would throw a TypeError.
  if (node == null) {
    return [];
  }

  if (node.type === 'text') {
    const text = compactText(node.data ?? '');
    return text ? [{ type: 'text', text }] : [];
  }

  if (node.type === 'tag' && node.name === 'a' && $(node).attr('href')) {
    return [{ type: 'anchor', element: node, text: compactText($(node).text()) }];
  }

  return $(node)
    .contents()
    .toArray()
    .flatMap((child) => blockTokens($, child));
}
/**
 * Builds a short context string for one anchor from the text immediately
 * around it inside its nearest `p`, `li`, `td` or `div` ancestor.
 *
 * The result is assembled, in order, from:
 *  1. the upper-cased sponsor marker found in the closest text token
 *     preceding the anchor, if that text contains one;
 *  2. `title`;
 *  3. each text token that follows the anchor up to (not including) the next
 *     anchor, each cut off at its own first sponsor marker.
 *
 * @param $ - Cheerio instance for the document.
 * @param element - The anchor's DOM node; matched by identity against the
 *   tokens of the enclosing block.
 * @param title - Display title already computed for the anchor.
 * @returns The compacted context, or `title` unchanged when the anchor has no
 *   enclosing block or cannot be located inside it.
 */
function localContext($: cheerio.CheerioAPI, element: any, title: string): string {
  // An anchor with no p/li/td/div ancestor produces an empty selection, so
  // `.get(0)` is undefined. Fall back to the title rather than walking a
  // missing node.
  const blockNode = $(element).closest('p,li,td,div').get(0);
  if (!blockNode) {
    return title;
  }

  const tokens = blockTokens($, blockNode);
  const anchorIndex = tokens.findIndex(
    (token) => token.type === 'anchor' && token.element === element
  );
  if (anchorIndex === -1) {
    return title;
  }

  const parts: string[] = [];

  // Nearest text token before the anchor: `slice` copies, so reversing does
  // not disturb `tokens`.
  const previousText = tokens
    .slice(0, anchorIndex)
    .reverse()
    .find((token): token is TextToken => token.type === 'text')?.text;
  const marker = previousText ? sponsorMarkerText(previousText) : undefined;
  if (marker) {
    parts.push(marker);
  }

  parts.push(title);

  // Trailing text belongs to this anchor only until the next link starts.
  for (const token of tokens.slice(anchorIndex + 1)) {
    if (token.type === 'anchor') {
      break;
    }
    parts.push(textBeforeSponsorMarker(token.text));
  }

  return compactText(parts.join(' '));
}
export const genericParser: ParserPlugin = {
name: 'generic',
matches: () => true,
@@ -24,9 +87,9 @@ export const genericParser: ParserPlugin = {
.toArray()
.map((element) => {
const anchor = $(element);
const title = anchor.text().replace(/\s+/g, ' ').trim() || anchor.attr('aria-label') || '';
const title = compactText(anchor.text()) || anchor.attr('aria-label') || '';
const url = anchor.attr('href') ?? '';
const context = anchor.closest('p,li,td,div').text().replace(/\s+/g, ' ').trim();
const context = localContext($, element, title);
return {
url,