✨ feature: Enhance output options with Google Sheets integration and improve Excel writer functionality
This commit is contained in:
+65
-2
@@ -15,6 +15,69 @@ function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined
|
||||
return parentPrevious || undefined;
|
||||
}
|
||||
|
||||
/** A run of visible text found inside a block, already whitespace-compacted. */
type TextToken = { type: 'text'; text: string };

/**
 * A link (`<a href>`) found inside a block.
 * `element` is the underlying DOM node; it is only compared by identity to
 * locate a specific link in a token list, so it is left loosely typed.
 */
type AnchorToken = { type: 'anchor'; element: any; text: string };

/** One item of a block's flattened content, discriminated by `type`. */
type Token = TextToken | AnchorToken;
|
||||
|
||||
/**
 * Collapses every run of whitespace (spaces, tabs, newlines, ...) into a
 * single space and strips leading/trailing whitespace.
 *
 * @param value Raw text, e.g. the text content of an HTML node.
 * @returns The single-line, trimmed form of `value`; `''` when `value` is
 *   empty or whitespace-only.
 */
function compactText(value: string): string {
  return value.replace(/\s+/g, ' ').trim();
}
|
||||
|
||||
/**
 * Returns the part of `value` that precedes the first sponsor marker word
 * ("sponsor", "sponsored", "advertisement" or "partner"; whole word,
 * case-insensitive), whitespace-compacted.
 *
 * @param value Text that may contain a sponsor marker.
 * @returns The compacted text before the first marker, or all of `value`
 *   (compacted) when no marker is present. Returns `''` when `value` starts
 *   with a marker.
 */
function textBeforeSponsorMarker(value: string): string {
  // split() always yields at least one element; the `?? value` fallback only
  // exists to satisfy strict indexed-access checking.
  const [beforeMarker] = value.split(/\b(?:sponsor|sponsored|advertisement|partner)\b/i);
  return compactText(beforeMarker ?? value);
}
|
||||
|
||||
/**
 * Finds the first sponsor marker word in `value` ("sponsor", "sponsored",
 * "advertisement" or "partner"; whole word, case-insensitive).
 *
 * @param value Text to search.
 * @returns The matched word in upper case (e.g. `'SPONSORED'`), or
 *   `undefined` when `value` contains no marker. Word boundaries are
 *   required, so e.g. "sponsors" does not count as a marker.
 */
function sponsorMarkerText(value: string): string | undefined {
  return value.match(/\b(?:sponsor|sponsored|advertisement|partner)\b/i)?.[0].toUpperCase();
}
|
||||
|
||||
/**
 * Flattens a DOM node into an ordered list of text and link tokens.
 *
 * - A text node yields one text token with its whitespace-compacted content,
 *   or nothing when that content is empty.
 * - An `<a>` element with an `href` yields a single anchor token carrying the
 *   node itself and its compacted text; its children are not descended into.
 * - Any other node is expanded recursively, child by child, in document order.
 *
 * @param $ Cheerio API used to wrap and traverse nodes.
 * @param node The DOM node to flatten. A missing node (`undefined`/`null`)
 *   yields an empty list instead of throwing.
 * @returns The tokens found in `node`, in document order.
 */
function blockTokens($: cheerio.CheerioAPI, node: any): Token[] {
  // Callers may pass the result of `.get(0)` on an empty selection.
  if (!node) {
    return [];
  }

  if (node.type === 'text') {
    const text = compactText(node.data ?? '');
    return text ? [{ type: 'text', text }] : [];
  }

  if (node.type === 'tag' && node.name === 'a' && $(node).attr('href')) {
    return [{ type: 'anchor', element: node, text: compactText($(node).text()) }];
  }

  return $(node)
    .contents()
    .toArray()
    .flatMap((child) => blockTokens($, child));
}
|
||||
|
||||
/**
 * Builds a short context string for one link from the text immediately
 * around it, rather than from the whole enclosing block.
 *
 * The result is, in order:
 *   1. the upper-cased sponsor marker, if the nearest text token before the
 *      link contains one;
 *   2. `title`;
 *   3. the text that follows the link, up to (not including) the next link
 *      or the next sponsor marker, whichever comes first.
 *
 * @param $ Cheerio API used to traverse the document.
 * @param element The link's DOM node.
 * @param title The already-extracted title of the link.
 * @returns The whitespace-compacted context, or `title` unchanged when the
 *   link has no enclosing `p`, `li`, `td` or `div`, or cannot be located
 *   among that block's tokens.
 */
function localContext($: cheerio.CheerioAPI, element: any, title: string): string {
  const blockNode = $(element).closest('p,li,td,div').first().get(0);
  // A link outside any p/li/td/div has no block to draw context from.
  if (!blockNode) {
    return title;
  }

  const tokens = blockTokens($, blockNode);
  const anchorIndex = tokens.findIndex(
    (token) => token.type === 'anchor' && token.element === element
  );
  if (anchorIndex === -1) {
    return title;
  }

  const parts: string[] = [];

  // Look backwards for the closest text token; a marker there labels this link.
  const previousText = tokens
    .slice(0, anchorIndex)
    .reverse()
    .find((token): token is TextToken => token.type === 'text')?.text;
  const marker = previousText ? sponsorMarkerText(previousText) : undefined;
  if (marker) {
    parts.push(marker);
  }
  parts.push(title);

  for (const token of tokens.slice(anchorIndex + 1)) {
    // The next link starts a new item; its surroundings are not ours.
    if (token.type === 'anchor') {
      break;
    }
    parts.push(textBeforeSponsorMarker(token.text));
    // A sponsor marker introduces the following item, so anything after it
    // (including later text tokens) must not leak into this link's context.
    if (sponsorMarkerText(token.text) !== undefined) {
      break;
    }
  }

  return compactText(parts.join(' '));
}
|
||||
|
||||
export const genericParser: ParserPlugin = {
|
||||
name: 'generic',
|
||||
matches: () => true,
|
||||
@@ -24,9 +87,9 @@ export const genericParser: ParserPlugin = {
|
||||
.toArray()
|
||||
.map((element) => {
|
||||
const anchor = $(element);
|
||||
const title = anchor.text().replace(/\s+/g, ' ').trim() || anchor.attr('aria-label') || '';
|
||||
const title = compactText(anchor.text()) || anchor.attr('aria-label') || '';
|
||||
const url = anchor.attr('href') ?? '';
|
||||
const context = anchor.closest('p,li,td,div').text().replace(/\s+/g, ' ').trim();
|
||||
const context = localContext($, element, title);
|
||||
|
||||
return {
|
||||
url,
|
||||
|
||||
Reference in New Issue
Block a user