feature: Enhance output options with Google Sheets integration and improve Excel writer functionality

This commit is contained in:
Keith Solomon
2026-05-17 12:05:42 -05:00
parent 379526114c
commit a7cdcf95ae
10 changed files with 375 additions and 14 deletions
+28 -3
View File
@@ -3,6 +3,8 @@ import { writeFile } from 'node:fs/promises';
import { loadConfig } from '../config/config.js';
import { createGmailClient } from '../gmail/client.js';
import { ExcelWriter } from '../output/excel.js';
import { createGoogleSheetsWriter } from '../output/googleSheets.js';
import { OutputWriter } from '../output/sheets.js';
import { runCatalog } from '../run/runCatalog.js';
import { validateDateFilters } from './flags.js';
@@ -54,9 +56,7 @@ export function createProgram(): Command {
.action(async (options) => {
validateDateFilters(options);
const config = await loadConfig(options.config);
const writers = config.output.excel.enabled
? [new ExcelWriter(config.output.excel.path)]
: [];
const writers = await createWriters(config);
const messages =
process.env.NLC_FIXTURE === '1'
? fixtureMessages()
@@ -82,6 +82,31 @@ export function createProgram(): Command {
return program;
}
/**
 * Builds the output writers that the loaded configuration enables.
 *
 * An `ExcelWriter` is added when `output.excel.enabled` is set, and a Google
 * Sheets writer is added when `output.sheetsApi.enabled` is set.
 *
 * @param config - The configuration object returned by `loadConfig`.
 * @returns The enabled writers, Excel first, then Google Sheets.
 * @throws Error when Sheets output is enabled but the credentials path, token
 *   path, or spreadsheet id is missing.
 */
async function createWriters(
  config: Awaited<ReturnType<typeof loadConfig>>
): Promise<OutputWriter[]> {
  const { excel, sheetsApi } = config.output;
  const enabledWriters: OutputWriter[] = excel.enabled ? [new ExcelWriter(excel.path)] : [];

  if (!sheetsApi.enabled) {
    return enabledWriters;
  }

  // Validate the Sheets settings up front so a misconfiguration fails before any OAuth work.
  const { credentials, token, spreadsheetId } = sheetsApi;
  if (!(credentials && token)) {
    throw new Error('Google Sheets output requires sheets_api credentials and token paths');
  }
  if (!spreadsheetId) {
    throw new Error('Google Sheets output requires output.sheets_api.spreadsheet_id');
  }

  const sheetsWriter = await createGoogleSheetsWriter({ credentials, token, spreadsheetId });
  enabledWriters.push(sheetsWriter);
  return enabledWriters;
}
async function fetchGmailMessages(
config: Awaited<ReturnType<typeof loadConfig>>,
options: { dryRun?: number | boolean; from?: string; to?: string; last?: string }
+9 -1
View File
@@ -10,6 +10,14 @@ import { NewsletterMessage } from '../parsing/types.js';
const gmailScopes = ['https://www.googleapis.com/auth/gmail.readonly'];
/**
 * Authorizes against Google using the module's Gmail scopes.
 *
 * Thin convenience wrapper around `authorizeGoogleOAuth` that supplies
 * `gmailScopes` as the requested scope list.
 *
 * @param credentialsPath - Path to the OAuth client credentials JSON file.
 * @param tokenPath - Path to the stored token JSON file.
 * @returns The authorized OAuth client produced by `authorizeGoogleOAuth`.
 */
export async function authorizeGmail(credentialsPath: string, tokenPath: string) {
  const gmailAuth = await authorizeGoogleOAuth(credentialsPath, tokenPath, gmailScopes);
  return gmailAuth;
}
export async function authorizeGoogleOAuth(
credentialsPath: string,
tokenPath: string,
scopes: string[]
) {
const credentials = JSON.parse(await readFile(expandHome(credentialsPath), 'utf8'));
const clientConfig = credentials.installed ?? credentials.web;
const oauth = new google.auth.OAuth2(
@@ -22,7 +30,7 @@ export async function authorizeGmail(credentialsPath: string, tokenPath: string)
oauth.setCredentials(JSON.parse(await readFile(expandHome(tokenPath), 'utf8')));
return oauth;
} catch {
const url = oauth.generateAuthUrl({ access_type: 'offline', scope: gmailScopes });
const url = oauth.generateAuthUrl({ access_type: 'offline', scope: scopes });
const code = await waitForBrowserCode(url);
const { tokens } = await oauth.getToken(code);
oauth.setCredentials(tokens);
+19 -1
View File
@@ -3,6 +3,16 @@ import { dirname } from 'node:path';
import XLSX from 'xlsx';
import { CatalogPayload, OutputWriter, sanitizeSheetName } from './sheets.js';
// Column names, in output order, for every per-newsletter content sheet.
// Used both as the `header` option passed to `XLSX.utils.json_to_sheet` and as
// the key list that `toContentOutputRow` projects each row onto.
const contentColumns = [
'Issue Date',
'Category',
'Link URL',
'Title',
'Description',
'Page Title + Meta',
'Also In'
];
export class ExcelWriter implements OutputWriter {
public constructor(private readonly path: string) {}
@@ -14,7 +24,11 @@ export class ExcelWriter implements OutputWriter {
grouped.set(sheet, [...(grouped.get(sheet) ?? []), row]);
}
for (const [sheet, rows] of grouped) {
XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet(rows), sheet);
XLSX.utils.book_append_sheet(
workbook,
XLSX.utils.json_to_sheet(rows.map(toContentOutputRow), { header: contentColumns }),
sheet
);
}
XLSX.utils.book_append_sheet(
workbook,
@@ -30,3 +44,7 @@ export class ExcelWriter implements OutputWriter {
XLSX.writeFile(workbook, this.path);
}
}
/**
 * Projects a catalog row onto the content sheet columns.
 *
 * Only keys listed in `contentColumns` are kept, in that order; a column whose
 * value is `null` or `undefined` in the input becomes an empty string.
 *
 * @param row - The source row keyed by column name.
 * @returns A new object containing exactly the `contentColumns` keys.
 */
function toContentOutputRow(row: Record<string, unknown>): Record<string, unknown> {
  const projected: Record<string, unknown> = {};
  for (const column of contentColumns) {
    projected[column] = row[column] ?? '';
  }
  return projected;
}
+88 -6
View File
@@ -1,15 +1,97 @@
import { google } from 'googleapis';
import { CatalogPayload, OutputWriter } from './sheets.js';
import { authorizeGoogleOAuth } from '../gmail/client.js';
import { CatalogPayload, escapeCell, OutputWriter, sanitizeSheetName } from './sheets.js';
// OAuth scope list handed to `authorizeGoogleOAuth` by `createGoogleSheetsWriter`.
const sheetsScopes = ['https://www.googleapis.com/auth/spreadsheets'];
// NOTE(review): a union with `any` collapses to `any`, so this alias gives no
// type checking; it lets `GoogleSheetsWriter` accept an injected client object.
type SheetsClient = ReturnType<typeof google.sheets> | any;
// Column order for the per-newsletter content tabs written by `write`.
const contentColumns = [
'Issue Date',
'Category',
'Link URL',
'Title',
'Description',
'Page Title + Meta',
'Also In'
];
// Column order for the 'Sponsored Links' tab.
const sponsorColumns = ['Newsletter', 'Sponsor', 'Link', 'Description'];
// Column order for the 'Dead Links' tab.
const deadColumns = ['URL', 'Status', 'Source', 'Date'];
export class GoogleSheetsWriter implements OutputWriter {
public constructor(
private readonly spreadsheetId: string,
private readonly auth: Parameters<typeof google.sheets>[0]['auth']
private readonly auth: Parameters<typeof google.sheets>[0]['auth'],
private readonly sheetsClient?: SheetsClient
) {}
public async write(_payload: CatalogPayload): Promise<void> {
const sheets = google.sheets({ version: 'v4', auth: this.auth });
await sheets.spreadsheets.get({ spreadsheetId: this.spreadsheetId });
// Real row append calls are intentionally centralized here; tests use a fake writer.
/**
 * Writes a catalog payload to the configured spreadsheet.
 *
 * Content rows are grouped into one tab per source newsletter; sponsors and
 * dead links go to the fixed 'Sponsored Links' and 'Dead Links' tabs. Any of
 * those tabs that the spreadsheet does not yet contain are created in a single
 * `batchUpdate` call before rows are appended, one tab at a time.
 *
 * @param payload - The rows, sponsors, and dead links to write.
 */
public async write(payload: CatalogPayload): Promise<void> {
  // Prefer the injected client; otherwise build a real v4 client from the stored auth.
  const client = this.sheetsClient ?? google.sheets({ version: 'v4', auth: this.auth });
  const knownTabs = await this.getExistingSheetNames(client);
  const contentByTab = this.groupContentRows(payload.rows);

  const addSheetRequests = [...contentByTab.keys(), 'Sponsored Links', 'Dead Links']
    .filter((title) => !knownTabs.has(title))
    .map((title) => ({ addSheet: { properties: { title } } }));
  if (addSheetRequests.length !== 0) {
    await client.spreadsheets.batchUpdate({
      spreadsheetId: this.spreadsheetId,
      requestBody: { requests: addSheetRequests }
    });
  }

  for (const [tab, tabRows] of contentByTab.entries()) {
    await this.appendRows(client, tab, contentColumns, tabRows);
  }
  await this.appendRows(client, 'Sponsored Links', sponsorColumns, payload.sponsors);
  await this.appendRows(client, 'Dead Links', deadColumns, payload.deadLinks);
}
/**
 * Fetches the spreadsheet and collects the titles of its tabs.
 *
 * Entries without a string `properties.title` are ignored.
 *
 * @param sheets - The Sheets client to query.
 * @returns The set of tab titles present in the spreadsheet.
 */
private async getExistingSheetNames(sheets: SheetsClient): Promise<Set<string>> {
  const response = await sheets.spreadsheets.get({ spreadsheetId: this.spreadsheetId });
  const titles = new Set<string>();
  for (const entry of response.data.sheets ?? []) {
    const title: unknown = entry.properties?.title;
    if (typeof title === 'string') {
      titles.add(title);
    }
  }
  return titles;
}
/**
 * Groups content rows by destination tab.
 *
 * The tab name is the row's 'Source Newsletter' value (or 'Newsletter' when
 * that is null/undefined) passed through `sanitizeSheetName` with a limit of
 * 100. Rows keep their input order inside each group, and groups appear in
 * first-seen order.
 *
 * @param rows - The content rows to group.
 * @returns A map from tab name to the rows destined for that tab.
 */
private groupContentRows(rows: Record<string, unknown>[]): Map<string, Record<string, unknown>[]> {
  const grouped = new Map<string, Record<string, unknown>[]>();
  for (const row of rows) {
    const sheet = sanitizeSheetName(String(row['Source Newsletter'] ?? 'Newsletter'), 100);
    const bucket = grouped.get(sheet);
    if (bucket) {
      // Push in place: re-spreading the bucket for every row is quadratic in group size.
      bucket.push(row);
    } else {
      grouped.set(sheet, [row]);
    }
  }
  return grouped;
}
/**
 * Appends rows to one tab of the spreadsheet.
 *
 * Does nothing when `rows` is empty. The header row (`columns`) is written
 * only when the tab's first row is currently empty; otherwise each run would
 * append another copy of the header in the middle of the existing data.
 *
 * @param sheets - The Sheets client to call.
 * @param sheet - The title of the destination tab.
 * @param columns - Column names, which define both the header and cell order.
 * @param rows - The rows to append, keyed by column name.
 */
private async appendRows(
  sheets: SheetsClient,
  sheet: string,
  columns: string[],
  rows: Record<string, unknown>[]
): Promise<void> {
  if (rows.length === 0) {
    return;
  }
  // A1 notation: the tab title is single-quoted and embedded quotes are doubled.
  const quotedSheet = `'${sheet.replaceAll("'", "''")}'`;

  // An empty range comes back without `values`, so any value in row 1 is
  // treated as "a header (or other content) is already there".
  const firstRow = await sheets.spreadsheets.values.get({
    spreadsheetId: this.spreadsheetId,
    range: `${quotedSheet}!1:1`
  });
  const hasExistingContent = (firstRow.data.values ?? []).length > 0;

  // NOTE(review): with valueInputOption 'RAW' the input is stored unparsed; if
  // escapeCell prefixes values to defuse formulas, that prefix will show up
  // literally in the cell — confirm against escapeCell's implementation.
  const dataRows = rows.map((row) => columns.map((column) => escapeCell(row[column] ?? '')));

  await sheets.spreadsheets.values.append({
    spreadsheetId: this.spreadsheetId,
    range: `${quotedSheet}!A1`,
    valueInputOption: 'RAW',
    insertDataOption: 'INSERT_ROWS',
    requestBody: {
      values: hasExistingContent ? dataRows : [columns, ...dataRows]
    }
  });
}
}
/**
 * Authorizes with the Sheets scope and builds a `GoogleSheetsWriter`.
 *
 * @param options - `credentials` and `token` are passed to
 *   `authorizeGoogleOAuth`; `spreadsheetId` identifies the target spreadsheet.
 * @returns A writer bound to the spreadsheet and the authorized client.
 */
export async function createGoogleSheetsWriter(options: {
  credentials: string;
  token: string;
  spreadsheetId: string;
}): Promise<GoogleSheetsWriter> {
  const { credentials, token, spreadsheetId } = options;
  const authorizedClient = await authorizeGoogleOAuth(credentials, token, sheetsScopes);
  return new GoogleSheetsWriter(spreadsheetId, authorizedClient);
}
+65 -2
View File
@@ -15,6 +15,69 @@ function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined
return parentPrevious || undefined;
}
// A run of whitespace-compacted text taken from a text node.
type TextToken = { type: 'text'; text: string };
// A link: the `a` element node itself plus its whitespace-compacted text.
type AnchorToken = { type: 'anchor'; element: any; text: string };
// Flattened, in-document-order view of a block's content as produced by `blockTokens`.
type Token = TextToken | AnchorToken;
/**
 * Normalizes whitespace: every run of whitespace becomes a single space and
 * leading/trailing whitespace is removed.
 *
 * @param value - The text to normalize.
 * @returns The compacted text (possibly empty).
 */
function compactText(value: string): string {
  const words = value.split(/\s+/).filter((word) => word.length > 0);
  return words.join(' ');
}
/**
 * Returns the compacted text that precedes the first sponsor marker word
 * (sponsor, sponsored, advertisement, partner; case-insensitive, whole word).
 * When no marker is present the whole input is compacted and returned.
 *
 * @param value - The raw text to truncate.
 * @returns The whitespace-compacted text before the first marker.
 */
function textBeforeSponsorMarker(value: string): string {
  const markerIndex = value.search(/\b(?:sponsor|sponsored|advertisement|partner)\b/i);
  const leading = markerIndex === -1 ? value : value.slice(0, markerIndex);
  return compactText(leading);
}
/**
 * Finds the first sponsor marker word (sponsor, sponsored, advertisement,
 * partner; case-insensitive, whole word) in the text.
 *
 * @param value - The text to scan.
 * @returns The matched marker in upper case, or `undefined` when there is none.
 */
function sponsorMarkerText(value: string): string | undefined {
  const hit = /\b(?:sponsor|sponsored|advertisement|partner)\b/i.exec(value);
  if (hit === null) {
    return undefined;
  }
  return hit[0].toUpperCase();
}
/**
 * Flattens a DOM node into text and anchor tokens in document order.
 *
 * - A text node yields one text token holding its compacted text, or nothing
 *   when that text is empty.
 * - An `a` element with a non-empty `href` yields one anchor token and is not
 *   descended into.
 * - Any other node yields the concatenated tokens of its children.
 *
 * @param $ - The cheerio instance the node belongs to.
 * @param node - The node to flatten.
 * @returns The tokens found under `node`.
 */
function blockTokens($: cheerio.CheerioAPI, node: any): Token[] {
  if (node.type === 'text') {
    const text = compactText(node.data ?? '');
    return text === '' ? [] : [{ type: 'text', text }];
  }

  const isLink = node.type === 'tag' && node.name === 'a' && Boolean($(node).attr('href'));
  if (isLink) {
    return [{ type: 'anchor', element: node, text: compactText($(node).text()) }];
  }

  const collected: Token[] = [];
  for (const child of $(node).contents().toArray()) {
    for (const token of blockTokens($, child)) {
      collected.push(token);
    }
  }
  return collected;
}
/**
 * Builds the context string for one link from the text immediately around it,
 * rather than from the whole enclosing block.
 *
 * The result is, in order: an upper-cased sponsor marker if the nearest
 * preceding text token contains one, the link title, and the text that follows
 * the link up to the next link (each piece cut at any sponsor marker it
 * contains). The result is whitespace-compacted.
 *
 * Falls back to `title` when the anchor has no `p`, `li`, `td` or `div`
 * ancestor, or when the anchor cannot be located among the block's tokens.
 *
 * @param $ - The cheerio instance the element belongs to.
 * @param element - The anchor element being described.
 * @param title - The link title already extracted for this anchor.
 * @returns The local context string for the link.
 */
function localContext($: cheerio.CheerioAPI, element: any, title: string): string {
  const block = $(element).closest('p,li,td,div').first().get(0);
  if (!block) {
    // No enclosing block element: tokenizing `undefined` would throw, so use the title alone.
    return title;
  }
  const tokens = blockTokens($, block);
  const anchorIndex = tokens.findIndex(
    (token) => token.type === 'anchor' && token.element === element
  );
  if (anchorIndex === -1) {
    return title;
  }

  const parts: string[] = [];
  // Only the closest text before the link is checked for a sponsor marker.
  const previousText = tokens
    .slice(0, anchorIndex)
    .reverse()
    .find((token): token is TextToken => token.type === 'text')?.text;
  const marker = previousText ? sponsorMarkerText(previousText) : undefined;
  if (marker) {
    parts.push(marker);
  }

  parts.push(title);
  // Trailing text belongs to this link until the next link starts.
  for (const token of tokens.slice(anchorIndex + 1)) {
    if (token.type === 'anchor') {
      break;
    }
    parts.push(textBeforeSponsorMarker(token.text));
  }

  return compactText(parts.join(' '));
}
export const genericParser: ParserPlugin = {
name: 'generic',
matches: () => true,
@@ -24,9 +87,9 @@ export const genericParser: ParserPlugin = {
.toArray()
.map((element) => {
const anchor = $(element);
const title = anchor.text().replace(/\s+/g, ' ').trim() || anchor.attr('aria-label') || '';
const title = compactText(anchor.text()) || anchor.attr('aria-label') || '';
const url = anchor.attr('href') ?? '';
const context = anchor.closest('p,li,td,div').text().replace(/\s+/g, ' ').trim();
const context = localContext($, element, title);
return {
url,
+13 -1
View File
@@ -35,6 +35,18 @@ function issueDate(date: string): string {
return new Date(date).toISOString().slice(0, 10);
}
/**
 * Cleans a sponsored link's description for the "Sponsored Links" output.
 *
 * Removes the first sponsor marker word (sponsor, sponsored, advertisement,
 * partner), then strips a leading copy of the link title together with any
 * separator (hyphen, colon, en/em dash, whitespace) that follows it, and
 * finally compacts whitespace. A description that consists of nothing but the
 * title yields an empty string instead of repeating the title.
 *
 * @param linkTitle - The sponsor link's title; matched case-insensitively.
 * @param description - The raw description/context text for the link.
 * @returns The cleaned description (possibly empty).
 */
function sponsorDescription(linkTitle: string, description: string): string {
  // The title must be followed by a separator or the end of the text, so a
  // title that is merely a prefix of a longer word ("Acme" in "Acmeville") is kept.
  const leadingTitle = new RegExp(`^\\s*${escapeRegExp(linkTitle)}(?:[-:–—\\s]+|$)`, 'i');
  return description
    .replace(/\b(?:sponsor|sponsored|advertisement|partner)\b/i, '')
    .replace(leadingTitle, '')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Escapes every regular-expression metacharacter in `value` so it can be
 * embedded in a `RegExp` source and matched literally.
 *
 * @param value - The literal text to escape.
 * @returns The escaped text.
 */
function escapeRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
export async function runCatalog(options: RunOptions): Promise<RunSummary> {
const config = normalizeConfig(options.config);
const state = new StateStore(config.stateFile);
@@ -76,7 +88,7 @@ export async function runCatalog(options: RunOptions): Promise<RunSummary> {
Newsletter: newsletterName(message.from),
Sponsor: link.title,
Link: link.normalizedUrl,
Description: link.description ?? ''
Description: sponsorDescription(link.title, link.description ?? '')
});
continue;
}