feature: Enhance output options with Google Sheets integration and improve Excel writer functionality

This commit is contained in:
Keith Solomon
2026-05-17 12:05:42 -05:00
parent 379526114c
commit a7cdcf95ae
10 changed files with 375 additions and 14 deletions
+28 -3
View File
@@ -3,6 +3,8 @@ import { writeFile } from 'node:fs/promises';
import { loadConfig } from '../config/config.js';
import { createGmailClient } from '../gmail/client.js';
import { ExcelWriter } from '../output/excel.js';
import { createGoogleSheetsWriter } from '../output/googleSheets.js';
import { OutputWriter } from '../output/sheets.js';
import { runCatalog } from '../run/runCatalog.js';
import { validateDateFilters } from './flags.js';
@@ -54,9 +56,7 @@ export function createProgram(): Command {
.action(async (options) => {
validateDateFilters(options);
const config = await loadConfig(options.config);
const writers = config.output.excel.enabled
? [new ExcelWriter(config.output.excel.path)]
: [];
const writers = await createWriters(config);
const messages =
process.env.NLC_FIXTURE === '1'
? fixtureMessages()
@@ -82,6 +82,31 @@ export function createProgram(): Command {
return program;
}
/**
 * Builds the list of output writers that the loaded configuration enables.
 *
 * An Excel writer is added when `output.excel.enabled` is set, and a Google
 * Sheets writer is added when `output.sheetsApi.enabled` is set.
 *
 * @param config - The configuration object returned by `loadConfig`.
 * @returns The enabled writers, Excel first and Google Sheets second.
 * @throws Error when Google Sheets output is enabled but the credentials
 *   path, token path, or spreadsheet id is missing.
 */
async function createWriters(
  config: Awaited<ReturnType<typeof loadConfig>>
): Promise<OutputWriter[]> {
  const { excel, sheetsApi } = config.output;
  const writers: OutputWriter[] = excel.enabled ? [new ExcelWriter(excel.path)] : [];

  if (!sheetsApi.enabled) {
    return writers;
  }

  // Validate before authorizing so a misconfiguration fails with a clear message.
  const { credentials, token, spreadsheetId } = sheetsApi;
  if (!credentials || !token) {
    throw new Error('Google Sheets output requires sheets_api credentials and token paths');
  }
  if (!spreadsheetId) {
    throw new Error('Google Sheets output requires output.sheets_api.spreadsheet_id');
  }

  const sheetsWriter = await createGoogleSheetsWriter({ credentials, token, spreadsheetId });
  writers.push(sheetsWriter);
  return writers;
}
async function fetchGmailMessages(
config: Awaited<ReturnType<typeof loadConfig>>,
options: { dryRun?: number | boolean; from?: string; to?: string; last?: string }
+9 -1
View File
@@ -10,6 +10,14 @@ import { NewsletterMessage } from '../parsing/types.js';
const gmailScopes = ['https://www.googleapis.com/auth/gmail.readonly'];
/**
 * Authorizes an OAuth client for Gmail access.
 *
 * Delegates to `authorizeGoogleOAuth`, requesting the scopes listed in
 * `gmailScopes`.
 *
 * @param credentialsPath - Path to the OAuth client credentials JSON file.
 * @param tokenPath - Path to the stored token JSON file.
 * @returns The authorized OAuth client produced by `authorizeGoogleOAuth`.
 */
export async function authorizeGmail(credentialsPath: string, tokenPath: string) {
  const oauthClient = await authorizeGoogleOAuth(credentialsPath, tokenPath, gmailScopes);
  return oauthClient;
}
export async function authorizeGoogleOAuth(
credentialsPath: string,
tokenPath: string,
scopes: string[]
) {
const credentials = JSON.parse(await readFile(expandHome(credentialsPath), 'utf8'));
const clientConfig = credentials.installed ?? credentials.web;
const oauth = new google.auth.OAuth2(
@@ -22,7 +30,7 @@ export async function authorizeGmail(credentialsPath: string, tokenPath: string)
oauth.setCredentials(JSON.parse(await readFile(expandHome(tokenPath), 'utf8')));
return oauth;
} catch {
const url = oauth.generateAuthUrl({ access_type: 'offline', scope: gmailScopes });
const url = oauth.generateAuthUrl({ access_type: 'offline', scope: scopes });
const code = await waitForBrowserCode(url);
const { tokens } = await oauth.getToken(code);
oauth.setCredentials(tokens);
+19 -1
View File
@@ -3,6 +3,16 @@ import { dirname } from 'node:path';
import XLSX from 'xlsx';
import { CatalogPayload, OutputWriter, sanitizeSheetName } from './sheets.js';
// Column headers, in output order, for every per-newsletter content sheet.
// Used both as the `header` option passed to `json_to_sheet` and as the set of
// keys that `toContentOutputRow` keeps, so row fields not listed here are not
// written to the sheet.
const contentColumns = [
'Issue Date',
'Category',
'Link URL',
'Title',
'Description',
'Page Title + Meta',
'Also In'
];
export class ExcelWriter implements OutputWriter {
public constructor(private readonly path: string) {}
@@ -14,7 +24,11 @@ export class ExcelWriter implements OutputWriter {
grouped.set(sheet, [...(grouped.get(sheet) ?? []), row]);
}
for (const [sheet, rows] of grouped) {
XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet(rows), sheet);
XLSX.utils.book_append_sheet(
workbook,
XLSX.utils.json_to_sheet(rows.map(toContentOutputRow), { header: contentColumns }),
sheet
);
}
XLSX.utils.book_append_sheet(
workbook,
@@ -30,3 +44,7 @@ export class ExcelWriter implements OutputWriter {
XLSX.writeFile(workbook, this.path);
}
}
/**
 * Projects a catalog row onto the content columns.
 *
 * The result has exactly one key per entry in `contentColumns`, in that
 * order. Fields that are `null` or `undefined` on the input become the empty
 * string, and input keys outside `contentColumns` are dropped.
 *
 * @param row - A catalog row keyed by column name.
 * @returns A new object containing only the content columns.
 */
function toContentOutputRow(row: Record<string, unknown>): Record<string, unknown> {
  const projected: Record<string, unknown> = {};
  for (const column of contentColumns) {
    projected[column] = row[column] ?? '';
  }
  return projected;
}
+88 -6
View File
@@ -1,15 +1,97 @@
import { google } from 'googleapis';
import { CatalogPayload, OutputWriter } from './sheets.js';
import { authorizeGoogleOAuth } from '../gmail/client.js';
import { CatalogPayload, escapeCell, OutputWriter, sanitizeSheetName } from './sheets.js';
// OAuth scope requested when authorizing the Sheets writer
// (passed to `authorizeGoogleOAuth` by `createGoogleSheetsWriter`).
const sheetsScopes = ['https://www.googleapis.com/auth/spreadsheets'];
// Client handle used by the writer. The `| any` member makes this type
// effectively `any`, so calls on the client are not type-checked; the unit
// test relies on this to pass in a hand-written fake object.
type SheetsClient = ReturnType<typeof google.sheets> | any;
// Header row and cell order for the per-newsletter content sheets.
const contentColumns = [
'Issue Date',
'Category',
'Link URL',
'Title',
'Description',
'Page Title + Meta',
'Also In'
];
// Header row and cell order for the 'Sponsored Links' sheet.
const sponsorColumns = ['Newsletter', 'Sponsor', 'Link', 'Description'];
// Header row and cell order for the 'Dead Links' sheet.
const deadColumns = ['URL', 'Status', 'Source', 'Date'];
export class GoogleSheetsWriter implements OutputWriter {
public constructor(
private readonly spreadsheetId: string,
private readonly auth: Parameters<typeof google.sheets>[0]['auth']
private readonly auth: Parameters<typeof google.sheets>[0]['auth'],
private readonly sheetsClient?: SheetsClient
) {}
public async write(_payload: CatalogPayload): Promise<void> {
const sheets = google.sheets({ version: 'v4', auth: this.auth });
await sheets.spreadsheets.get({ spreadsheetId: this.spreadsheetId });
// Real row append calls are intentionally centralized here; tests use a fake writer.
/**
 * Writes a catalog payload to the configured spreadsheet.
 *
 * Content rows are grouped into one sheet per source newsletter. Any of those
 * sheets, plus 'Sponsored Links' and 'Dead Links', that the spreadsheet does
 * not already contain are created in a single `batchUpdate`. Rows are then
 * appended sheet by sheet: content sheets first, then sponsors, then dead
 * links.
 *
 * @param payload - Catalog rows, sponsor rows, and dead-link rows to append.
 */
public async write(payload: CatalogPayload): Promise<void> {
  // Prefer the injected client (used by tests); otherwise build a real v4 client.
  const client = this.sheetsClient ?? google.sheets({ version: 'v4', auth: this.auth });
  const knownTitles = await this.getExistingSheetNames(client);
  const rowsBySheet = this.groupContentRows(payload.rows);

  const addSheetRequests = [...rowsBySheet.keys(), 'Sponsored Links', 'Dead Links']
    .filter((title) => !knownTitles.has(title))
    .map((title) => ({ addSheet: { properties: { title } } }));

  if (addSheetRequests.length > 0) {
    await client.spreadsheets.batchUpdate({
      spreadsheetId: this.spreadsheetId,
      requestBody: {
        requests: addSheetRequests
      }
    });
  }

  // Appends are issued one at a time, in a fixed order.
  for (const [title, rows] of rowsBySheet) {
    await this.appendRows(client, title, contentColumns, rows);
  }
  await this.appendRows(client, 'Sponsored Links', sponsorColumns, payload.sponsors);
  await this.appendRows(client, 'Dead Links', deadColumns, payload.deadLinks);
}
/**
 * Fetches the spreadsheet and collects the titles of its sheets.
 *
 * Entries whose `properties.title` is not a string are ignored, and a
 * response without a `sheets` list yields an empty set.
 *
 * @param sheets - The Sheets client used for the request.
 * @returns The set of existing sheet titles.
 */
private async getExistingSheetNames(sheets: SheetsClient): Promise<Set<string>> {
  const response = await sheets.spreadsheets.get({ spreadsheetId: this.spreadsheetId });
  const titles = new Set<string>();
  for (const entry of response.data.sheets ?? []) {
    const title: unknown = entry.properties?.title;
    if (typeof title === 'string') {
      titles.add(title);
    }
  }
  return titles;
}
/**
 * Groups content rows by the sheet they belong to.
 *
 * The sheet title is derived from each row's 'Source Newsletter' field
 * (falling back to 'Newsletter' when that field is null or undefined) and
 * passed through `sanitizeSheetName` with a second argument of 100
 * (presumably the maximum title length — confirm against `sheets.ts`).
 *
 * Rows keep their input order within each group, and groups keep the order in
 * which their first row was seen.
 *
 * @param rows - Catalog content rows.
 * @returns A map from sheet title to the rows destined for that sheet.
 */
private groupContentRows(rows: Record<string, unknown>[]): Map<string, Record<string, unknown>[]> {
  const grouped = new Map<string, Record<string, unknown>[]>();
  for (const row of rows) {
    const sheet = sanitizeSheetName(String(row['Source Newsletter'] ?? 'Newsletter'), 100);
    // Push into the existing bucket instead of re-spreading it for every row,
    // which copied the whole bucket each time (quadratic in rows per sheet).
    const bucket = grouped.get(sheet);
    if (bucket) {
      bucket.push(row);
    } else {
      grouped.set(sheet, [row]);
    }
  }
  return grouped;
}
/**
 * Appends a header row followed by the given rows to one sheet.
 *
 * Does nothing when `rows` is empty. Each row is flattened to an array of
 * cells in `columns` order; missing fields become '' and every cell is passed
 * through `escapeCell`.
 *
 * NOTE(review): the header row (`columns`) is included in every append call,
 * so calling this repeatedly for the same sheet looks like it will add a
 * header row each time — confirm that is intended.
 * NOTE(review): values are sent with `valueInputOption: 'RAW'`; verify that
 * the prefixing done by `escapeCell` is still wanted in that mode.
 *
 * @param sheets - The Sheets client used for the request.
 * @param sheet - Title of the target sheet.
 * @param columns - Column names, used both as the header row and as row keys.
 * @param rows - Rows to append, keyed by column name.
 */
private async appendRows(
  sheets: SheetsClient,
  sheet: string,
  columns: string[],
  rows: Record<string, unknown>[]
): Promise<void> {
  if (rows.length === 0) {
    return;
  }

  // Single quotes inside a quoted sheet title are doubled in A1 notation.
  const quotedTitle = sheet.split("'").join("''");
  const range = `'${quotedTitle}'!A1`;
  const dataRows = rows.map((row) => columns.map((column) => escapeCell(row[column] ?? '')));

  await sheets.spreadsheets.values.append({
    spreadsheetId: this.spreadsheetId,
    range,
    valueInputOption: 'RAW',
    insertDataOption: 'INSERT_ROWS',
    requestBody: {
      values: [columns, ...dataRows]
    }
  });
}
}
/**
 * Authorizes against Google with the Sheets scope and returns a writer bound
 * to the given spreadsheet.
 *
 * @param options - Paths to the OAuth credentials and token files, plus the
 *   id of the spreadsheet to write to.
 * @returns A `GoogleSheetsWriter` that uses the authorized client.
 */
export async function createGoogleSheetsWriter(options: {
  credentials: string;
  token: string;
  spreadsheetId: string;
}): Promise<GoogleSheetsWriter> {
  const { credentials, token, spreadsheetId } = options;
  const auth = await authorizeGoogleOAuth(credentials, token, sheetsScopes);
  return new GoogleSheetsWriter(spreadsheetId, auth);
}
+65 -2
View File
@@ -15,6 +15,69 @@ function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined
return parentPrevious || undefined;
}
// A run of whitespace-compacted text found inside a block element.
type TextToken = { type: 'text'; text: string };
// A link (`<a>` with an href): the DOM node itself plus its compacted text.
type AnchorToken = { type: 'anchor'; element: any; text: string };
// A block's content flattened into document order is a list of these tokens.
type Token = TextToken | AnchorToken;
/**
 * Collapses every run of whitespace to a single space and removes leading
 * and trailing whitespace.
 *
 * @param value - Text to normalize.
 * @returns The normalized text; empty when `value` contains only whitespace.
 */
function compactText(value: string): string {
  const words = value.split(/\s+/).filter((word) => word.length > 0);
  return words.join(' ');
}
/**
 * Returns the compacted text that precedes the first sponsor marker word
 * (sponsor, sponsored, advertisement, partner; case-insensitive, whole word).
 * When no marker is present the whole text is compacted and returned.
 *
 * @param value - Raw text that may contain a sponsor marker.
 * @returns The compacted text before the marker.
 */
function textBeforeSponsorMarker(value: string): string {
  const marker = /\b(?:sponsor|sponsored|advertisement|partner)\b/i.exec(value);
  const leading = marker === null ? value : value.slice(0, marker.index);
  return compactText(leading);
}
/**
 * Finds the first sponsor marker word in the text (sponsor, sponsored,
 * advertisement, partner; case-insensitive, whole word).
 *
 * @param value - Text to search.
 * @returns The matched marker in upper case, or `undefined` when none is found.
 */
function sponsorMarkerText(value: string): string | undefined {
  const found = /\b(?:sponsor|sponsored|advertisement|partner)\b/i.exec(value);
  if (found === null) {
    return undefined;
  }
  return found[0].toUpperCase();
}
/**
 * Flattens a DOM node into text and anchor tokens in document order.
 *
 * - A text node yields one text token with compacted text, or nothing when
 *   the text is empty after compaction.
 * - An `<a>` tag with a truthy `href` yields one anchor token and is not
 *   descended into.
 * - Any other node yields the concatenated tokens of its children.
 *
 * @param $ - The cheerio instance for the document.
 * @param node - The DOM node to flatten.
 * @returns The tokens found under `node`.
 */
function blockTokens($: cheerio.CheerioAPI, node: any): Token[] {
  if (node.type === 'text') {
    const text = compactText(node.data ?? '');
    if (!text) {
      return [];
    }
    return [{ type: 'text', text }];
  }

  const isLink = node.type === 'tag' && node.name === 'a' && $(node).attr('href');
  if (isLink) {
    return [{ type: 'anchor', element: node, text: compactText($(node).text()) }];
  }

  const collected: Token[] = [];
  for (const child of $(node).contents().toArray()) {
    for (const token of blockTokens($, child)) {
      collected.push(token);
    }
  }
  return collected;
}
/**
 * Builds a description for one link from the text immediately around it,
 * rather than from the whole enclosing block.
 *
 * The enclosing block is the closest `p`, `li`, `td` or `div` ancestor. Its
 * content is flattened into tokens, and the result is assembled as:
 *   1. the upper-cased sponsor marker, if the nearest text token before the
 *      link contains one;
 *   2. the link title;
 *   3. every text token after the link up to (not including) the next link,
 *      each cut off at the first sponsor marker it contains.
 *
 * @param $ - The cheerio instance for the document.
 * @param element - The anchor DOM node being described.
 * @param title - The already-computed title of the link.
 * @returns The compacted description. Falls back to `title` alone when the
 *   link has no enclosing block or cannot be found among the block's tokens.
 */
function localContext($: cheerio.CheerioAPI, element: any, title: string): string {
  // closest() yields an empty selection when the anchor has no p/li/td/div
  // ancestor (for example a link placed directly in <body> or inside a
  // <span>). get(0) is then undefined and tokenizing it would throw, so fall
  // back to the title instead of crashing the whole parse.
  const block = $(element).closest('p,li,td,div').first().get(0);
  if (!block) {
    return title;
  }

  const tokens = blockTokens($, block);
  const anchorIndex = tokens.findIndex(
    (token) => token.type === 'anchor' && token.element === element
  );
  if (anchorIndex === -1) {
    return title;
  }

  const parts: string[] = [];
  // Only the nearest preceding text token is inspected for a sponsor marker.
  const previousText = tokens
    .slice(0, anchorIndex)
    .reverse()
    .find((token): token is TextToken => token.type === 'text')?.text;
  const marker = previousText ? sponsorMarkerText(previousText) : undefined;
  if (marker) {
    parts.push(marker);
  }
  parts.push(title);

  // Text following the link belongs to it until the next link starts. A
  // sponsor marker inside that text is cut off here; the next link picks it
  // up through its own `previousText` lookup.
  for (const token of tokens.slice(anchorIndex + 1)) {
    if (token.type === 'anchor') {
      break;
    }
    parts.push(textBeforeSponsorMarker(token.text));
  }
  return compactText(parts.join(' '));
}
export const genericParser: ParserPlugin = {
name: 'generic',
matches: () => true,
@@ -24,9 +87,9 @@ export const genericParser: ParserPlugin = {
.toArray()
.map((element) => {
const anchor = $(element);
const title = anchor.text().replace(/\s+/g, ' ').trim() || anchor.attr('aria-label') || '';
const title = compactText(anchor.text()) || anchor.attr('aria-label') || '';
const url = anchor.attr('href') ?? '';
const context = anchor.closest('p,li,td,div').text().replace(/\s+/g, ' ').trim();
const context = localContext($, element, title);
return {
url,
+13 -1
View File
@@ -35,6 +35,18 @@ function issueDate(date: string): string {
return new Date(date).toISOString().slice(0, 10);
}
/**
 * Cleans a sponsored link's description for the sponsor output.
 *
 * Steps, in order:
 *   1. remove the first sponsor marker word (sponsor, sponsored,
 *      advertisement, partner; case-insensitive, whole word);
 *   2. remove the link title when it leads the remaining text and is followed
 *      by at least one separator (dash, colon, or whitespace);
 *   3. collapse whitespace and trim.
 *
 * NOTE(review): step 1 is not anchored to the start of the text, so a marker
 * word that appears inside the prose (e.g. "a great partner for ...") is the
 * one removed when no marker leads the text — confirm this is acceptable.
 *
 * @param linkTitle - Title of the sponsored link.
 * @param description - Raw description text for the link.
 * @returns The cleaned description.
 */
function sponsorDescription(linkTitle: string, description: string): string {
  const markerPattern = /\b(?:sponsor|sponsored|advertisement|partner)\b/i;
  const leadingTitlePattern = new RegExp(
    `^\\s*${escapeRegExp(linkTitle)}\\s*(?:[-:–—]|\\s)+`,
    'i'
  );

  const withoutMarker = description.replace(markerPattern, '');
  const withoutTitle = withoutMarker.replace(leadingTitlePattern, '');
  return withoutTitle.replace(/\s+/g, ' ').trim();
}
/**
 * Escapes regular-expression metacharacters so the text can be embedded in a
 * `RegExp` source and matched literally.
 *
 * @param value - Text to escape.
 * @returns The text with each of `. * + ? ^ $ { } ( ) | [ ] \` preceded by a
 *   backslash.
 */
function escapeRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, (metachar) => `\\${metachar}`);
}
export async function runCatalog(options: RunOptions): Promise<RunSummary> {
const config = normalizeConfig(options.config);
const state = new StateStore(config.stateFile);
@@ -76,7 +88,7 @@ export async function runCatalog(options: RunOptions): Promise<RunSummary> {
Newsletter: newsletterName(message.from),
Sponsor: link.title,
Link: link.normalizedUrl,
Description: link.description ?? ''
Description: sponsorDescription(link.title, link.description ?? '')
});
continue;
}
+4
View File
@@ -35,5 +35,9 @@ describe('ExcelWriter', () => {
const workbook = XLSX.readFile(path);
expect(workbook.SheetNames[0]).toBe('A Very Long Newsletter Name Tha');
expect(workbook.SheetNames[0].length).toBe(31);
const rows = XLSX.utils.sheet_to_json<Record<string, unknown>>(
workbook.Sheets[workbook.SheetNames[0]]
);
expect(rows[0]).not.toHaveProperty('Source Newsletter');
});
});
+79
View File
@@ -0,0 +1,79 @@
import { describe, expect, it } from 'vitest';
import { GoogleSheetsWriter } from '../src/output/googleSheets.js';
// Exercises GoogleSheetsWriter against a hand-written fake Sheets client, so
// no real Google API request is made.
describe('GoogleSheetsWriter', () => {
it('creates missing sheets and appends content, sponsor, and dead-link rows', async () => {
// Every batchUpdate / append request is recorded here in call order.
const calls: unknown[] = [];
const sheets = {
spreadsheets: {
// The spreadsheet starts with only 'Sponsored Links', so the content sheet
// and 'Dead Links' are the ones the writer has to create.
get: async () => ({
data: { sheets: [{ properties: { title: 'Sponsored Links' } }] }
}),
batchUpdate: async (request: unknown) => {
calls.push(request);
},
values: {
append: async (request: unknown) => {
calls.push(request);
}
}
}
};
// `auth` is left undefined because the injected fake client is used instead.
await new GoogleSheetsWriter('sheet-1', undefined, sheets).write({
rows: [
{
'Source Newsletter': 'A Very Long Newsletter Name That Is Fine In Google Sheets',
Title: '=Formula',
'Link URL': 'https://example.com'
}
],
sponsors: [{ Newsletter: 'Weekly', Sponsor: 'Acme', Link: 'https://sponsor.example' }],
deadLinks: [{ URL: 'https://dead.example', Status: '404' }]
});
// The first recorded request is the batchUpdate that adds the two missing
// sheets; the newsletter title is expected to be kept at full length.
expect(calls[0]).toMatchObject({
spreadsheetId: 'sheet-1',
requestBody: {
requests: [
{
addSheet: {
properties: { title: 'A Very Long Newsletter Name That Is Fine In Google Sheets' }
}
},
{ addSheet: { properties: { title: 'Dead Links' } } }
]
}
});
// Content append: header row first, then cells in column order. Fields the
// row does not have come out as '', and the '=Formula' title is expected to
// arrive prefixed with an apostrophe.
expect(calls).toContainEqual(
expect.objectContaining({
spreadsheetId: 'sheet-1',
range: "'A Very Long Newsletter Name That Is Fine In Google Sheets'!A1",
requestBody: {
values: [
[
'Issue Date',
'Category',
'Link URL',
'Title',
'Description',
'Page Title + Meta',
'Also In'
],
['', '', 'https://example.com', "'=Formula", '', '', '']
]
}
})
);
// Sponsor and dead-link rows are only checked for their target range.
expect(calls).toContainEqual(
expect.objectContaining({
range: "'Sponsored Links'!A1"
})
);
expect(calls).toContainEqual(
expect.objectContaining({
range: "'Dead Links'!A1"
})
);
});
});
+25
View File
@@ -1,4 +1,5 @@
import { describe, expect, it } from 'vitest';
import { genericParser } from '../src/parsing/generic.js';
import { selectParser } from '../src/parsing/plugins.js';
describe('parser plugin selection', () => {
@@ -11,3 +12,27 @@ describe('parser plugin selection', () => {
).toBe('generic');
});
});
// Checks that the generic parser derives each link's description from the
// text next to that link, not from the whole shared container.
describe('generic parser', () => {
it('keeps descriptions local to each link when many links share a container', () => {
// Four links share one <div>; a bare SPONSORED marker sits directly before
// the third link.
const links = genericParser.parse({
html: `
<div>
<h2>CSS & HTML Tools</h2>
<a href="https://cascade.example">Cascade</a> - CSS property icons.
<a href="https://frames.example">Fancy Frames</a> - Decorative border generator.
SPONSORED
<a href="https://flexboxle.example">flexboxle</a> - A daily puzzle game to master CSS Flexbox.
<a href="https://types.example">Typescale AI</a> - A typescale generator.
</div>
`
});
// The marker is expected only on the link that follows it; the link before
// it must not carry the trailing SPONSORED text.
expect(links.map((link) => link.description)).toEqual([
'Cascade - CSS property icons.',
'Fancy Frames - Decorative border generator.',
'SPONSORED flexboxle - A daily puzzle game to master CSS Flexbox.',
'Typescale AI - A typescale generator.'
]);
});
});
+45
View File
@@ -41,4 +41,49 @@ describe('run orchestration', () => {
expect(result.linksExtracted).toBe(1);
expect(writes).toHaveLength(0);
});
// Runs one fixture message through runCatalog with a capturing writer and
// checks how its links are split between sponsor output and content rows.
it('only sends locally marked sponsored links to the sponsored output', async () => {
const stateFile = join(dir, 'state.json');
// Payloads handed to the fake writer below are collected here.
const writes: any[] = [];
await runCatalog({
config: {
gmail: { folder: 'Newsletters' },
output: { name: 'Catalog', excel: { enabled: true, path: join(dir, 'out.xlsx') } },
stateFile
},
messages: [
{
id: 'msg-1',
messageId: '<msg-1>',
from: 'Web Tools Weekly <w@example.com>',
date: '2026-05-16T00:00:00.000Z',
html: `
<div>
<a href="https://cascade.example">Cascade</a> - CSS property icons.
<a href="https://frames.example">Fancy Frames</a> - Decorative borders.
SPONSORED
<a href="https://flexboxle.example">flexboxle</a> - A daily puzzle game.
<a href="https://types.example">Typescale AI</a> - A typescale generator.
</div>
`
}
],
writers: [{ write: async (payload) => writes.push(payload) }]
});
// Only the link directly after the SPONSORED marker is expected as a sponsor.
// Its description is expected without the marker and without the leading
// title, and its link in normalized form (note the trailing slash).
expect(writes[0].sponsors).toEqual([
{
Newsletter: 'Web Tools Weekly',
Sponsor: 'flexboxle',
Link: 'https://flexboxle.example/',
Description: 'A daily puzzle game.'
}
]);
// The other three links stay in the regular content rows, in document order.
expect(writes[0].rows.map((row: any) => row.Title)).toEqual([
'Cascade',
'Fancy Frames',
'Typescale AI'
]);
});
});