✨ feature: Enhance output options with Google Sheets integration and improve Excel writer functionality
This commit is contained in:
+28
-3
@@ -3,6 +3,8 @@ import { writeFile } from 'node:fs/promises';
|
||||
import { loadConfig } from '../config/config.js';
|
||||
import { createGmailClient } from '../gmail/client.js';
|
||||
import { ExcelWriter } from '../output/excel.js';
|
||||
import { createGoogleSheetsWriter } from '../output/googleSheets.js';
|
||||
import { OutputWriter } from '../output/sheets.js';
|
||||
import { runCatalog } from '../run/runCatalog.js';
|
||||
import { validateDateFilters } from './flags.js';
|
||||
|
||||
@@ -54,9 +56,7 @@ export function createProgram(): Command {
|
||||
.action(async (options) => {
|
||||
validateDateFilters(options);
|
||||
const config = await loadConfig(options.config);
|
||||
const writers = config.output.excel.enabled
|
||||
? [new ExcelWriter(config.output.excel.path)]
|
||||
: [];
|
||||
const writers = await createWriters(config);
|
||||
const messages =
|
||||
process.env.NLC_FIXTURE === '1'
|
||||
? fixtureMessages()
|
||||
@@ -82,6 +82,31 @@ export function createProgram(): Command {
|
||||
return program;
|
||||
}
|
||||
|
||||
/**
 * Builds the list of output writers selected by the loaded configuration.
 *
 * An Excel writer is added when `output.excel.enabled` is set, and a Google
 * Sheets writer when `output.sheetsApi.enabled` is set. The result is empty
 * when neither output is enabled.
 *
 * @param config - Configuration object as returned by `loadConfig`.
 * @returns The writers in the order Excel, then Google Sheets.
 * @throws Error when Sheets output is enabled but the credentials path, the
 *   token path, or the spreadsheet id is missing.
 */
async function createWriters(
  config: Awaited<ReturnType<typeof loadConfig>>
): Promise<OutputWriter[]> {
  const { excel, sheetsApi } = config.output;
  const writers: OutputWriter[] = [];

  if (excel.enabled) {
    writers.push(new ExcelWriter(excel.path));
  }

  if (!sheetsApi.enabled) {
    return writers;
  }

  // Validate before authorizing so a misconfiguration fails with a clear message.
  const { credentials, token, spreadsheetId } = sheetsApi;
  if (!(credentials && token)) {
    throw new Error('Google Sheets output requires sheets_api credentials and token paths');
  }
  if (!spreadsheetId) {
    throw new Error('Google Sheets output requires output.sheets_api.spreadsheet_id');
  }

  const sheetsWriter = await createGoogleSheetsWriter({ credentials, token, spreadsheetId });
  writers.push(sheetsWriter);
  return writers;
}
|
||||
|
||||
async function fetchGmailMessages(
|
||||
config: Awaited<ReturnType<typeof loadConfig>>,
|
||||
options: { dryRun?: number | boolean; from?: string; to?: string; last?: string }
|
||||
|
||||
+9
-1
@@ -10,6 +10,14 @@ import { NewsletterMessage } from '../parsing/types.js';
|
||||
const gmailScopes = ['https://www.googleapis.com/auth/gmail.readonly'];
|
||||
|
||||
/**
 * Authorizes an OAuth client for Gmail using the module's `gmailScopes`.
 *
 * This is a convenience wrapper that delegates to `authorizeGoogleOAuth`.
 *
 * @param credentialsPath - Path to the OAuth client credentials JSON file.
 * @param tokenPath - Path to the stored token JSON file.
 * @returns Whatever `authorizeGoogleOAuth` resolves to for these arguments.
 */
export async function authorizeGmail(credentialsPath: string, tokenPath: string) {
  const client = await authorizeGoogleOAuth(credentialsPath, tokenPath, gmailScopes);
  return client;
}
|
||||
|
||||
export async function authorizeGoogleOAuth(
|
||||
credentialsPath: string,
|
||||
tokenPath: string,
|
||||
scopes: string[]
|
||||
) {
|
||||
const credentials = JSON.parse(await readFile(expandHome(credentialsPath), 'utf8'));
|
||||
const clientConfig = credentials.installed ?? credentials.web;
|
||||
const oauth = new google.auth.OAuth2(
|
||||
@@ -22,7 +30,7 @@ export async function authorizeGmail(credentialsPath: string, tokenPath: string)
|
||||
oauth.setCredentials(JSON.parse(await readFile(expandHome(tokenPath), 'utf8')));
|
||||
return oauth;
|
||||
} catch {
|
||||
const url = oauth.generateAuthUrl({ access_type: 'offline', scope: gmailScopes });
|
||||
const url = oauth.generateAuthUrl({ access_type: 'offline', scope: scopes });
|
||||
const code = await waitForBrowserCode(url);
|
||||
const { tokens } = await oauth.getToken(code);
|
||||
oauth.setCredentials(tokens);
|
||||
|
||||
+19
-1
@@ -3,6 +3,16 @@ import { dirname } from 'node:path';
|
||||
import XLSX from 'xlsx';
|
||||
import { CatalogPayload, OutputWriter, sanitizeSheetName } from './sheets.js';
|
||||
|
||||
/**
 * Column headers, in output order, for the per-newsletter content sheets
 * written by the Excel writer.
 */
const contentColumns: string[] = [
  'Issue Date',
  'Category',
  'Link URL',
  'Title',
  'Description',
  'Page Title + Meta',
  'Also In',
];
|
||||
|
||||
export class ExcelWriter implements OutputWriter {
|
||||
public constructor(private readonly path: string) {}
|
||||
|
||||
@@ -14,7 +24,11 @@ export class ExcelWriter implements OutputWriter {
|
||||
grouped.set(sheet, [...(grouped.get(sheet) ?? []), row]);
|
||||
}
|
||||
for (const [sheet, rows] of grouped) {
|
||||
XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet(rows), sheet);
|
||||
XLSX.utils.book_append_sheet(
|
||||
workbook,
|
||||
XLSX.utils.json_to_sheet(rows.map(toContentOutputRow), { header: contentColumns }),
|
||||
sheet
|
||||
);
|
||||
}
|
||||
XLSX.utils.book_append_sheet(
|
||||
workbook,
|
||||
@@ -30,3 +44,7 @@ export class ExcelWriter implements OutputWriter {
|
||||
XLSX.writeFile(workbook, this.path);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Projects a catalog row onto the content-sheet columns.
 *
 * Only the keys listed in `contentColumns` are kept, in that order. A column
 * whose value is `null` or `undefined` in the input becomes an empty string.
 *
 * @param row - Source row keyed by column header.
 * @returns A new object containing exactly the content columns.
 */
function toContentOutputRow(row: Record<string, unknown>): Record<string, unknown> {
  const projected: Record<string, unknown> = {};
  for (const column of contentColumns) {
    projected[column] = row[column] ?? '';
  }
  return projected;
}
|
||||
|
||||
@@ -1,15 +1,97 @@
|
||||
import { google } from 'googleapis';
|
||||
import { CatalogPayload, OutputWriter } from './sheets.js';
|
||||
import { authorizeGoogleOAuth } from '../gmail/client.js';
|
||||
import { CatalogPayload, escapeCell, OutputWriter, sanitizeSheetName } from './sheets.js';
|
||||
|
||||
const sheetsScopes = ['https://www.googleapis.com/auth/spreadsheets'];
|
||||
type SheetsClient = ReturnType<typeof google.sheets> | any;
|
||||
|
||||
/**
 * Column headers, in output order, for the per-newsletter content sheets.
 * NOTE(review): the Excel writer declares an identical list; consider sharing one definition.
 */
const contentColumns: string[] = [
  'Issue Date',
  'Category',
  'Link URL',
  'Title',
  'Description',
  'Page Title + Meta',
  'Also In',
];

/** Column headers for the 'Sponsored Links' sheet. */
const sponsorColumns: string[] = ['Newsletter', 'Sponsor', 'Link', 'Description'];

/** Column headers for the 'Dead Links' sheet. */
const deadColumns: string[] = ['URL', 'Status', 'Source', 'Date'];
|
||||
|
||||
/**
 * Output writer that appends catalog rows to a Google Sheets spreadsheet.
 *
 * Content rows are grouped into one sheet per source newsletter; sponsors and
 * dead links go to the fixed 'Sponsored Links' and 'Dead Links' sheets. Any
 * of those sheets that do not exist yet are created before rows are appended.
 */
export class GoogleSheetsWriter implements OutputWriter {
  /**
   * @param spreadsheetId - Id of the target spreadsheet.
   * @param auth - Auth value handed to `google.sheets` when no client is injected.
   * @param sheetsClient - Optional pre-built client used instead of creating
   *   one from `auth` (for example a fake in tests).
   */
  public constructor(
    private readonly spreadsheetId: string,
    private readonly auth: Parameters<typeof google.sheets>[0]['auth'],
    private readonly sheetsClient?: SheetsClient
  ) {}

  /**
   * Creates any missing sheets, then appends content, sponsor and dead-link rows.
   *
   * @param payload - Catalog output; `rows`, `sponsors` and `deadLinks` are read.
   */
  public async write(payload: CatalogPayload): Promise<void> {
    const sheets = this.sheetsClient ?? google.sheets({ version: 'v4', auth: this.auth });
    const existing = await this.getExistingSheetNames(sheets);
    const grouped = this.groupContentRows(payload.rows);

    // A newsletter-derived sheet name can equal one of the fixed sheet names.
    // De-duplicate so a single batchUpdate never asks to add the same title twice.
    const desired = new Set<string>([...grouped.keys(), 'Sponsored Links', 'Dead Links']);
    const missing = [...desired].filter((sheet) => !existing.has(sheet));

    if (missing.length > 0) {
      await sheets.spreadsheets.batchUpdate({
        spreadsheetId: this.spreadsheetId,
        requestBody: {
          requests: missing.map((title) => ({ addSheet: { properties: { title } } }))
        }
      });
    }

    // Appends run one after another, in a fixed order: content sheets first.
    for (const [sheet, rows] of grouped) {
      await this.appendRows(sheets, sheet, contentColumns, rows);
    }
    await this.appendRows(sheets, 'Sponsored Links', sponsorColumns, payload.sponsors);
    await this.appendRows(sheets, 'Dead Links', deadColumns, payload.deadLinks);
  }

  /** Reads the spreadsheet metadata and returns the titles of its sheets. */
  private async getExistingSheetNames(sheets: SheetsClient): Promise<Set<string>> {
    const spreadsheet = await sheets.spreadsheets.get({ spreadsheetId: this.spreadsheetId });
    return new Set(
      (spreadsheet.data.sheets ?? [])
        .map((sheet: any) => sheet.properties?.title)
        .filter((title: unknown): title is string => typeof title === 'string')
    );
  }

  /**
   * Groups content rows by target sheet name.
   *
   * The sheet name is the sanitized 'Source Newsletter' value, falling back to
   * 'Newsletter' when that field is null or undefined. Insertion order of both
   * sheets and rows is preserved.
   */
  private groupContentRows(rows: Record<string, unknown>[]): Map<string, Record<string, unknown>[]> {
    const grouped = new Map<string, Record<string, unknown>[]>();
    for (const row of rows) {
      const sheet = sanitizeSheetName(String(row['Source Newsletter'] ?? 'Newsletter'), 100);
      const bucket = grouped.get(sheet);
      if (bucket) {
        // Push in place instead of re-copying the bucket for every row.
        bucket.push(row);
      } else {
        grouped.set(sheet, [row]);
      }
    }
    return grouped;
  }

  /**
   * Appends a header row followed by one row per record to the given sheet.
   * Does nothing when `rows` is empty.
   *
   * NOTE(review): the header row is included in every append, so writing to a
   * sheet that already holds data adds another header line — confirm intended.
   */
  private async appendRows(
    sheets: SheetsClient,
    sheet: string,
    columns: string[],
    rows: Record<string, unknown>[]
  ): Promise<void> {
    if (rows.length === 0) {
      return;
    }

    await sheets.spreadsheets.values.append({
      spreadsheetId: this.spreadsheetId,
      // Quote the sheet title for A1 notation; embedded single quotes are doubled.
      range: `'${sheet.replaceAll("'", "''")}'!A1`,
      valueInputOption: 'RAW',
      insertDataOption: 'INSERT_ROWS',
      requestBody: {
        values: [columns, ...rows.map((row) => columns.map((column) => escapeCell(row[column] ?? '')))]
      }
    });
  }
}
|
||||
|
||||
/**
 * Authorizes against Google with the Sheets scopes and returns a writer bound
 * to the given spreadsheet.
 *
 * @param options - `credentials` and `token` are file paths passed to
 *   `authorizeGoogleOAuth`; `spreadsheetId` identifies the target spreadsheet.
 * @returns A `GoogleSheetsWriter` using the authorized client.
 */
export async function createGoogleSheetsWriter(options: {
  credentials: string;
  token: string;
  spreadsheetId: string;
}): Promise<GoogleSheetsWriter> {
  const { credentials, token, spreadsheetId } = options;
  const auth = await authorizeGoogleOAuth(credentials, token, sheetsScopes);
  return new GoogleSheetsWriter(spreadsheetId, auth);
}
|
||||
|
||||
+65
-2
@@ -15,6 +15,69 @@ function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined
|
||||
return parentPrevious || undefined;
|
||||
}
|
||||
|
||||
type TextToken = { type: 'text'; text: string };
|
||||
type AnchorToken = { type: 'anchor'; element: any; text: string };
|
||||
type Token = TextToken | AnchorToken;
|
||||
|
||||
/**
 * Normalizes whitespace: leading and trailing whitespace is removed and every
 * internal run of whitespace becomes a single space.
 *
 * @param value - Text to normalize.
 * @returns The normalized text; empty when `value` is empty or all whitespace.
 */
function compactText(value: string): string {
  const words = value.trim().split(/\s+/);
  return words.join(' ');
}
|
||||
|
||||
/**
 * Returns the whitespace-compacted text that precedes the first sponsor marker
 * word ("sponsor", "sponsored", "advertisement" or "partner", matched as whole
 * words, case-insensitively). Without a marker the whole text is returned.
 *
 * @param value - Text that may contain a sponsor marker.
 * @returns The compacted leading portion of `value`.
 */
function textBeforeSponsorMarker(value: string): string {
  const markerIndex = value.search(/\b(?:sponsor|sponsored|advertisement|partner)\b/i);
  const leading = markerIndex === -1 ? value : value.slice(0, markerIndex);
  return compactText(leading);
}
|
||||
|
||||
/**
 * Finds the first sponsor marker word in the text and returns it upper-cased.
 *
 * Markers are "sponsor", "sponsored", "advertisement" and "partner", matched
 * as whole words regardless of case.
 *
 * @param value - Text to scan.
 * @returns The upper-cased marker, or `undefined` when none is present.
 */
function sponsorMarkerText(value: string): string | undefined {
  const found = /\b(?:sponsor|sponsored|advertisement|partner)\b/i.exec(value);
  if (found === null) {
    return undefined;
  }
  return found[0].toUpperCase();
}
|
||||
|
||||
/**
 * Flattens a DOM node into a document-order list of text and anchor tokens.
 *
 * - A text node yields one text token with compacted whitespace, or nothing
 *   when it is empty after compaction.
 * - An `a` element with an `href` yields one anchor token and is not descended into.
 * - Any other node yields the concatenated tokens of its child nodes.
 *
 * @param $ - Cheerio API bound to the document containing `node`.
 * @param node - Node to flatten.
 * @returns Tokens in document order.
 */
function blockTokens($: cheerio.CheerioAPI, node: any): Token[] {
  if (node.type === 'text') {
    const collapsed = compactText(node.data ?? '');
    if (!collapsed) {
      return [];
    }
    return [{ type: 'text', text: collapsed }];
  }

  const isLinkedAnchor = node.type === 'tag' && node.name === 'a' && $(node).attr('href');
  if (isLinkedAnchor) {
    return [{ type: 'anchor', element: node, text: compactText($(node).text()) }];
  }

  const collected: Token[] = [];
  for (const child of $(node).contents().toArray()) {
    for (const token of blockTokens($, child)) {
      collected.push(token);
    }
  }
  return collected;
}
|
||||
|
||||
/**
 * Builds a short context string for one link from the text around it.
 *
 * The nearest enclosing `p`, `li`, `td` or `div` is flattened into text and
 * anchor tokens. The result is, in order: the upper-cased sponsor marker found
 * in the closest preceding text token (if any), the link title, and every text
 * token after the link up to the next anchor, each cut off at its first
 * sponsor marker.
 *
 * @param $ - Cheerio API bound to the document containing `element`.
 * @param element - The anchor element being described.
 * @param title - Display title already extracted for the anchor.
 * @returns The whitespace-compacted context, or `title` alone when the anchor
 *   has no enclosing block or cannot be located inside it.
 */
function localContext($: cheerio.CheerioAPI, element: any, title: string): string {
  const block = $(element).closest('p,li,td,div').first().get(0);
  if (!block) {
    // No block-level ancestor (e.g. a link directly under <body> or inside a
    // heading): tokenizing `undefined` would throw, so fall back to the title.
    return title;
  }

  const tokens = blockTokens($, block);
  const anchorIndex = tokens.findIndex(
    (token) => token.type === 'anchor' && token.element === element
  );
  if (anchorIndex === -1) {
    return title;
  }

  const parts: string[] = [];

  // Look backwards from the anchor for the closest text token; a sponsor
  // marker there labels this link.
  const previousText = tokens
    .slice(0, anchorIndex)
    .reverse()
    .find((token): token is TextToken => token.type === 'text')?.text;
  const marker = previousText ? sponsorMarkerText(previousText) : undefined;
  if (marker) {
    parts.push(marker);
  }
  parts.push(title);

  // Text after the link belongs to it until the next link starts.
  for (const token of tokens.slice(anchorIndex + 1)) {
    if (token.type === 'anchor') {
      break;
    }
    parts.push(textBeforeSponsorMarker(token.text));
  }

  return compactText(parts.join(' '));
}
|
||||
|
||||
export const genericParser: ParserPlugin = {
|
||||
name: 'generic',
|
||||
matches: () => true,
|
||||
@@ -24,9 +87,9 @@ export const genericParser: ParserPlugin = {
|
||||
.toArray()
|
||||
.map((element) => {
|
||||
const anchor = $(element);
|
||||
const title = anchor.text().replace(/\s+/g, ' ').trim() || anchor.attr('aria-label') || '';
|
||||
const title = compactText(anchor.text()) || anchor.attr('aria-label') || '';
|
||||
const url = anchor.attr('href') ?? '';
|
||||
const context = anchor.closest('p,li,td,div').text().replace(/\s+/g, ' ').trim();
|
||||
const context = localContext($, element, title);
|
||||
|
||||
return {
|
||||
url,
|
||||
|
||||
+13
-1
@@ -35,6 +35,18 @@ function issueDate(date: string): string {
|
||||
return new Date(date).toISOString().slice(0, 10);
|
||||
}
|
||||
|
||||
/**
 * Cleans a sponsor link's description for output.
 *
 * A leading sponsor label ("sponsor", "sponsored", "advertisement" or
 * "partner", any case, optionally followed by dashes, colons or whitespace)
 * and a leading copy of the link title followed by a separator are removed;
 * remaining whitespace is then compacted.
 *
 * Only a label at the start is stripped. Removing the first keyword anywhere
 * would delete words from inside the title or the body, and a mangled title
 * could no longer be recognized as a prefix.
 *
 * @param linkTitle - Title of the sponsor link; matched literally, ignoring case.
 * @param description - Raw description text for the link.
 * @returns The description without its leading label and title.
 */
function sponsorDescription(linkTitle: string, description: string): string {
  const leadingMarker = /^\s*(?:sponsor|sponsored|advertisement|partner)\b(?:[-:–—]|\s)*/i;
  const leadingTitle = new RegExp(`^\\s*${escapeRegExp(linkTitle)}\\s*(?:[-:–—]|\\s)+`, 'i');

  return description
    // Title first, so a title that itself starts with a marker word stays intact.
    .replace(leadingTitle, '')
    .replace(leadingMarker, '')
    // Retry the title for the "LABEL title text" layout.
    .replace(leadingTitle, '')
    .replace(/\s+/g, ' ')
    .trim();
}
|
||||
|
||||
/**
 * Escapes regular-expression metacharacters so the text can be embedded in a
 * `RegExp` source and matched literally.
 *
 * @param value - Literal text.
 * @returns `value` with each of `. * + ? ^ $ { } ( ) | [ ] \` preceded by a backslash.
 */
function escapeRegExp(value: string): string {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return value.replace(metacharacters, (character) => '\\' + character);
}
|
||||
|
||||
export async function runCatalog(options: RunOptions): Promise<RunSummary> {
|
||||
const config = normalizeConfig(options.config);
|
||||
const state = new StateStore(config.stateFile);
|
||||
@@ -76,7 +88,7 @@ export async function runCatalog(options: RunOptions): Promise<RunSummary> {
|
||||
Newsletter: newsletterName(message.from),
|
||||
Sponsor: link.title,
|
||||
Link: link.normalizedUrl,
|
||||
Description: link.description ?? ''
|
||||
Description: sponsorDescription(link.title, link.description ?? '')
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user