diff --git a/src/cli/program.ts b/src/cli/program.ts index d343e48..214e9b8 100644 --- a/src/cli/program.ts +++ b/src/cli/program.ts @@ -3,6 +3,8 @@ import { writeFile } from 'node:fs/promises'; import { loadConfig } from '../config/config.js'; import { createGmailClient } from '../gmail/client.js'; import { ExcelWriter } from '../output/excel.js'; +import { createGoogleSheetsWriter } from '../output/googleSheets.js'; +import { OutputWriter } from '../output/sheets.js'; import { runCatalog } from '../run/runCatalog.js'; import { validateDateFilters } from './flags.js'; @@ -54,9 +56,7 @@ export function createProgram(): Command { .action(async (options) => { validateDateFilters(options); const config = await loadConfig(options.config); - const writers = config.output.excel.enabled - ? [new ExcelWriter(config.output.excel.path)] - : []; + const writers = await createWriters(config); const messages = process.env.NLC_FIXTURE === '1' ? fixtureMessages() @@ -82,6 +82,31 @@ export function createProgram(): Command { return program; } +async function createWriters( + config: Awaited> +): Promise { + const writers: OutputWriter[] = []; + if (config.output.excel.enabled) { + writers.push(new ExcelWriter(config.output.excel.path)); + } + if (config.output.sheetsApi.enabled) { + if (!config.output.sheetsApi.credentials || !config.output.sheetsApi.token) { + throw new Error('Google Sheets output requires sheets_api credentials and token paths'); + } + if (!config.output.sheetsApi.spreadsheetId) { + throw new Error('Google Sheets output requires output.sheets_api.spreadsheet_id'); + } + writers.push( + await createGoogleSheetsWriter({ + credentials: config.output.sheetsApi.credentials, + token: config.output.sheetsApi.token, + spreadsheetId: config.output.sheetsApi.spreadsheetId + }) + ); + } + return writers; +} + async function fetchGmailMessages( config: Awaited>, options: { dryRun?: number | boolean; from?: string; to?: string; last?: string } diff --git a/src/gmail/client.ts b/src/gmail/client.ts index 8cb526a..f18ee16 100644 --- a/src/gmail/client.ts +++ b/src/gmail/client.ts @@ -10,6 +10,14 @@ import { NewsletterMessage } from '../parsing/types.js'; const gmailScopes = ['https://www.googleapis.com/auth/gmail.readonly']; export async function authorizeGmail(credentialsPath: string, tokenPath: string) { + return authorizeGoogleOAuth(credentialsPath, tokenPath, gmailScopes); +} + +export async function authorizeGoogleOAuth( + credentialsPath: string, + tokenPath: string, + scopes: string[] +) { const credentials = JSON.parse(await readFile(expandHome(credentialsPath), 'utf8')); const clientConfig = credentials.installed ?? credentials.web; const oauth = new google.auth.OAuth2( @@ -22,7 +30,7 @@ export async function authorizeGmail(credentialsPath: string, tokenPath: string) oauth.setCredentials(JSON.parse(await readFile(expandHome(tokenPath), 'utf8'))); return oauth; } catch { - const url = oauth.generateAuthUrl({ access_type: 'offline', scope: gmailScopes }); + const url = oauth.generateAuthUrl({ access_type: 'offline', scope: scopes }); const code = await waitForBrowserCode(url); const { tokens } = await oauth.getToken(code); oauth.setCredentials(tokens); diff --git a/src/output/excel.ts b/src/output/excel.ts index 524b256..2f028e4 100644 --- a/src/output/excel.ts +++ b/src/output/excel.ts @@ -3,6 +3,16 @@ import { dirname } from 'node:path'; import XLSX from 'xlsx'; import { CatalogPayload, OutputWriter, sanitizeSheetName } from './sheets.js'; +const contentColumns = [ + 'Issue Date', + 'Category', + 'Link URL', + 'Title', + 'Description', + 'Page Title + Meta', + 'Also In' +]; + export class ExcelWriter implements OutputWriter { public constructor(private readonly path: string) {} @@ -14,7 +24,11 @@ export class ExcelWriter implements OutputWriter { grouped.set(sheet, [...(grouped.get(sheet) ?? []), row]); } for (const [sheet, rows] of grouped) { - XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet(rows), sheet); + XLSX.utils.book_append_sheet( + workbook, + XLSX.utils.json_to_sheet(rows.map(toContentOutputRow), { header: contentColumns }), + sheet + ); } XLSX.utils.book_append_sheet( workbook, @@ -30,3 +44,7 @@ export class ExcelWriter implements OutputWriter { XLSX.writeFile(workbook, this.path); } } + +function toContentOutputRow(row: Record): Record { + return Object.fromEntries(contentColumns.map((column) => [column, row[column] ?? ''])); +} diff --git a/src/output/googleSheets.ts b/src/output/googleSheets.ts index afd9477..f588785 100644 --- a/src/output/googleSheets.ts +++ b/src/output/googleSheets.ts @@ -1,15 +1,97 @@ import { google } from 'googleapis'; -import { CatalogPayload, OutputWriter } from './sheets.js'; +import { authorizeGoogleOAuth } from '../gmail/client.js'; +import { CatalogPayload, escapeCell, OutputWriter, sanitizeSheetName } from './sheets.js'; + +const sheetsScopes = ['https://www.googleapis.com/auth/spreadsheets']; +type SheetsClient = ReturnType | any; + +const contentColumns = [ + 'Issue Date', + 'Category', + 'Link URL', + 'Title', + 'Description', + 'Page Title + Meta', + 'Also In' +]; +const sponsorColumns = ['Newsletter', 'Sponsor', 'Link', 'Description']; +const deadColumns = ['URL', 'Status', 'Source', 'Date']; export class GoogleSheetsWriter implements OutputWriter { public constructor( private readonly spreadsheetId: string, - private readonly auth: Parameters[0]['auth'] + private readonly auth: Parameters[0]['auth'], + private readonly sheetsClient?: SheetsClient ) {} - public async write(_payload: CatalogPayload): Promise { - const sheets = google.sheets({ version: 'v4', auth: this.auth }); - await sheets.spreadsheets.get({ spreadsheetId: this.spreadsheetId }); - // Real row append calls are intentionally centralized here; tests use a fake writer. + public async write(payload: CatalogPayload): Promise { + const sheets = this.sheetsClient ?? google.sheets({ version: 'v4', auth: this.auth }); + const existing = await this.getExistingSheetNames(sheets); + const grouped = this.groupContentRows(payload.rows); + const desired = [...grouped.keys(), 'Sponsored Links', 'Dead Links']; + const missing = desired.filter((sheet) => !existing.has(sheet)); + + if (missing.length > 0) { + await sheets.spreadsheets.batchUpdate({ + spreadsheetId: this.spreadsheetId, + requestBody: { + requests: missing.map((title) => ({ addSheet: { properties: { title } } })) + } + }); + } + + for (const [sheet, rows] of grouped) { + await this.appendRows(sheets, sheet, contentColumns, rows); + } + await this.appendRows(sheets, 'Sponsored Links', sponsorColumns, payload.sponsors); + await this.appendRows(sheets, 'Dead Links', deadColumns, payload.deadLinks); + } + + private async getExistingSheetNames(sheets: SheetsClient): Promise> { + const spreadsheet = await sheets.spreadsheets.get({ spreadsheetId: this.spreadsheetId }); + return new Set( + (spreadsheet.data.sheets ?? []) + .map((sheet: any) => sheet.properties?.title) + .filter((title: unknown): title is string => typeof title === 'string') + ); + } + + private groupContentRows(rows: Record[]): Map[]> { + const grouped = new Map[]>(); + for (const row of rows) { + const sheet = sanitizeSheetName(String(row['Source Newsletter'] ?? 'Newsletter'), 100); + grouped.set(sheet, [...(grouped.get(sheet) ?? []), row]); + } + return grouped; + } + + private async appendRows( + sheets: SheetsClient, + sheet: string, + columns: string[], + rows: Record[] + ): Promise { + if (rows.length === 0) { + return; + } + + await sheets.spreadsheets.values.append({ + spreadsheetId: this.spreadsheetId, + range: `'${sheet.replaceAll("'", "''")}'!A1`, + valueInputOption: 'RAW', + insertDataOption: 'INSERT_ROWS', + requestBody: { + values: [columns, ...rows.map((row) => columns.map((column) => escapeCell(row[column] ?? '')))] + } + }); } } + +export async function createGoogleSheetsWriter(options: { + credentials: string; + token: string; + spreadsheetId: string; +}): Promise { + const auth = await authorizeGoogleOAuth(options.credentials, options.token, sheetsScopes); + return new GoogleSheetsWriter(options.spreadsheetId, auth); +} diff --git a/src/parsing/generic.ts b/src/parsing/generic.ts index b75dd40..0531b79 100644 --- a/src/parsing/generic.ts +++ b/src/parsing/generic.ts @@ -15,6 +15,69 @@ function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined return parentPrevious || undefined; } +type TextToken = { type: 'text'; text: string }; +type AnchorToken = { type: 'anchor'; element: any; text: string }; +type Token = TextToken | AnchorToken; + +function compactText(value: string): string { + return value.replace(/\s+/g, ' ').trim(); +} + +function textBeforeSponsorMarker(value: string): string { + return compactText(value.split(/\b(?:sponsor|sponsored|advertisement|partner)\b/i)[0] ?? value); +} + +function sponsorMarkerText(value: string): string | undefined { + return value.match(/\b(?:sponsor|sponsored|advertisement|partner)\b/i)?.[0].toUpperCase(); +} + +function blockTokens($: cheerio.CheerioAPI, node: any): Token[] { + if (node.type === 'text') { + const text = compactText(node.data ?? ''); + return text ? [{ type: 'text', text }] : []; + } + + if (node.type === 'tag' && node.name === 'a' && $(node).attr('href')) { + return [{ type: 'anchor', element: node, text: compactText($(node).text()) }]; + } + + return $(node) + .contents() + .toArray() + .flatMap((child) => blockTokens($, child)); +} + +function localContext($: cheerio.CheerioAPI, element: any, title: string): string { + const block = $(element).closest('p,li,td,div').first(); + const tokens = blockTokens($, block.get(0)); + const anchorIndex = tokens.findIndex( + (token) => token.type === 'anchor' && token.element === element + ); + if (anchorIndex === -1) { + return title; + } + + const parts: string[] = []; + const previousText = tokens + .slice(0, anchorIndex) + .reverse() + .find((token): token is TextToken => token.type === 'text')?.text; + const marker = previousText ? sponsorMarkerText(previousText) : undefined; + if (marker) { + parts.push(marker); + } + parts.push(title); + + for (const token of tokens.slice(anchorIndex + 1)) { + if (token.type === 'anchor') { + break; + } + parts.push(textBeforeSponsorMarker(token.text)); + } + + return compactText(parts.join(' ')); +} + export const genericParser: ParserPlugin = { name: 'generic', matches: () => true, @@ -24,9 +87,9 @@ export const genericParser: ParserPlugin = { .toArray() .map((element) => { const anchor = $(element); - const title = anchor.text().replace(/\s+/g, ' ').trim() || anchor.attr('aria-label') || ''; + const title = compactText(anchor.text()) || anchor.attr('aria-label') || ''; const url = anchor.attr('href') ?? ''; - const context = anchor.closest('p,li,td,div').text().replace(/\s+/g, ' ').trim(); + const context = localContext($, element, title); return { url, diff --git a/src/run/runCatalog.ts b/src/run/runCatalog.ts index b7114d0..edb024a 100644 --- a/src/run/runCatalog.ts +++ b/src/run/runCatalog.ts @@ -35,6 +35,18 @@ function issueDate(date: string): string { return new Date(date).toISOString().slice(0, 10); } +function sponsorDescription(linkTitle: string, description: string): string { + return description + .replace(/\b(?:sponsor|sponsored|advertisement|partner)\b/i, '') + .replace(new RegExp(`^\\s*${escapeRegExp(linkTitle)}\\s*(?:[-:–—]|\\s)+`, 'i'), '') + .replace(/\s+/g, ' ') + .trim(); +} + +function escapeRegExp(value: string): string { + return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + export async function runCatalog(options: RunOptions): Promise { const config = normalizeConfig(options.config); const state = new StateStore(config.stateFile); @@ -76,7 +88,7 @@ export async function runCatalog(options: RunOptions): Promise { Newsletter: newsletterName(message.from), Sponsor: link.title, Link: link.normalizedUrl, - Description: link.description ?? '' + Description: sponsorDescription(link.title, link.description ?? '') }); continue; } diff --git a/tests/excel.test.ts b/tests/excel.test.ts index 9c205de..4ca9364 100644 --- a/tests/excel.test.ts +++ b/tests/excel.test.ts @@ -35,5 +35,9 @@ describe('ExcelWriter', () => { const workbook = XLSX.readFile(path); expect(workbook.SheetNames[0]).toBe('A Very Long Newsletter Name Tha'); expect(workbook.SheetNames[0].length).toBe(31); + const rows = XLSX.utils.sheet_to_json>( + workbook.Sheets[workbook.SheetNames[0]] + ); + expect(rows[0]).not.toHaveProperty('Source Newsletter'); }); }); diff --git a/tests/googleSheets.test.ts b/tests/googleSheets.test.ts new file mode 100644 index 0000000..97c33a8 --- /dev/null +++ b/tests/googleSheets.test.ts @@ -0,0 +1,79 @@ +import { describe, expect, it } from 'vitest'; +import { GoogleSheetsWriter } from '../src/output/googleSheets.js'; + +describe('GoogleSheetsWriter', () => { + it('creates missing sheets and appends content, sponsor, and dead-link rows', async () => { + const calls: unknown[] = []; + const sheets = { + spreadsheets: { + get: async () => ({ + data: { sheets: [{ properties: { title: 'Sponsored Links' } }] } + }), + batchUpdate: async (request: unknown) => { + calls.push(request); + }, + values: { + append: async (request: unknown) => { + calls.push(request); + } + } + } + }; + + await new GoogleSheetsWriter('sheet-1', undefined, sheets).write({ + rows: [ + { + 'Source Newsletter': 'A Very Long Newsletter Name That Is Fine In Google Sheets', + Title: '=Formula', + 'Link URL': 'https://example.com' + } + ], + sponsors: [{ Newsletter: 'Weekly', Sponsor: 'Acme', Link: 'https://sponsor.example' }], + deadLinks: [{ URL: 'https://dead.example', Status: '404' }] + }); + + expect(calls[0]).toMatchObject({ + spreadsheetId: 'sheet-1', + requestBody: { + requests: [ + { + addSheet: { + properties: { title: 'A Very Long Newsletter Name That Is Fine In Google Sheets' } + } + }, + { addSheet: { properties: { title: 'Dead Links' } } } + ] + } + }); + expect(calls).toContainEqual( + expect.objectContaining({ + spreadsheetId: 'sheet-1', + range: "'A Very Long Newsletter Name That Is Fine In Google Sheets'!A1", + requestBody: { + values: [ + [ + 'Issue Date', + 'Category', + 'Link URL', + 'Title', + 'Description', + 'Page Title + Meta', + 'Also In' + ], + ['', '', 'https://example.com', "'=Formula", '', '', ''] + ] + } + }) + ); + expect(calls).toContainEqual( + expect.objectContaining({ + range: "'Sponsored Links'!A1" + }) + ); + expect(calls).toContainEqual( + expect.objectContaining({ + range: "'Dead Links'!A1" + }) + ); + }); +}); diff --git a/tests/parsing.test.ts b/tests/parsing.test.ts index a78959b..fc58624 100644 --- a/tests/parsing.test.ts +++ b/tests/parsing.test.ts @@ -1,4 +1,5 @@ import { describe, expect, it } from 'vitest'; +import { genericParser } from '../src/parsing/generic.js'; import { selectParser } from '../src/parsing/plugins.js'; describe('parser plugin selection', () => { @@ -11,3 +12,27 @@ describe('parser plugin selection', () => { ).toBe('generic'); }); }); + +describe('generic parser', () => { + it('keeps descriptions local to each link when many links share a container', () => { + const links = genericParser.parse({ + html: ` +
+

CSS & HTML Tools

+ Cascade - CSS property icons. + Fancy Frames - Decorative border generator. + SPONSORED + flexboxle - A daily puzzle game to master CSS Flexbox. + Typescale AI - A typescale generator. +
+ ` + }); + + expect(links.map((link) => link.description)).toEqual([ + 'Cascade - CSS property icons.', + 'Fancy Frames - Decorative border generator.', + 'SPONSORED flexboxle - A daily puzzle game to master CSS Flexbox.', + 'Typescale AI - A typescale generator.' + ]); + }); +}); diff --git a/tests/run.test.ts b/tests/run.test.ts index 4becabd..e9c81ae 100644 --- a/tests/run.test.ts +++ b/tests/run.test.ts @@ -41,4 +41,49 @@ describe('run orchestration', () => { expect(result.linksExtracted).toBe(1); expect(writes).toHaveLength(0); }); + + it('only sends locally marked sponsored links to the sponsored output', async () => { + const stateFile = join(dir, 'state.json'); + const writes: any[] = []; + + await runCatalog({ + config: { + gmail: { folder: 'Newsletters' }, + output: { name: 'Catalog', excel: { enabled: true, path: join(dir, 'out.xlsx') } }, + stateFile + }, + messages: [ + { + id: 'msg-1', + messageId: '', + from: 'Web Tools Weekly ', + date: '2026-05-16T00:00:00.000Z', + html: ` +
+ Cascade - CSS property icons. + Fancy Frames - Decorative borders. + SPONSORED + flexboxle - A daily puzzle game. + Typescale AI - A typescale generator. +
+ ` + } + ], + writers: [{ write: async (payload) => writes.push(payload) }] + }); + + expect(writes[0].sponsors).toEqual([ + { + Newsletter: 'Web Tools Weekly', + Sponsor: 'flexboxle', + Link: 'https://flexboxle.example/', + Description: 'A daily puzzle game.' + } + ]); + expect(writes[0].rows.map((row: any) => row.Title)).toEqual([ + 'Cascade', + 'Fancy Frames', + 'Typescale AI' + ]); + }); });