feature: First push to git

This commit is contained in:
Keith Solomon
2026-05-16 14:02:49 -05:00
commit 265f69d95a
46 changed files with 11551 additions and 0 deletions
+52
View File
@@ -0,0 +1,52 @@
import { ExtractedLink } from '../parsing/types.js';
/**
 * Pluggable categorization strategy. `Categorizer` consults it only after the
 * link's own section and the keyword rules have produced nothing.
 */
export interface CategoryProvider {
/** Suggests a category for `link` given the known category names; may resolve to `undefined`. */
categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined>;
}
// Category names that are always available. The Categorizer constructor merges
// caller-supplied categories into this list and hands the result to the provider.
const builtIn = [
'Python',
'JavaScript',
'DevOps',
'Security',
'AI/ML',
'Career',
'Rust',
'Uncategorized'
];
/**
 * Keyword rules tried in order; the first pattern that matches the link text
 * decides the category.
 *
 * Short tokens (`ai`, `ml`, `llm`, `cve`, `k8s`, `rust`) are anchored with `\b`.
 * As bare substrings they also match inside unrelated words ("email", ".html",
 * "trust"), and the text being searched includes the raw URL, so nearly every
 * link would otherwise land in AI/ML.
 */
const rules: Array<[RegExp, string]> = [
  [/python|django|flask/i, 'Python'],
  [/javascript|typescript|node|react/i, 'JavaScript'],
  [/kubernetes|\bk8s\b|docker|devops|terraform/i, 'DevOps'],
  [/security|vulnerability|\bcve\b/i, 'Security'],
  [/\b(?:ai|ml|llms?)\b|openai|machine learning/i, 'AI/ML'],
  [/career|interview|hiring/i, 'Career'],
  [/\brust\b|cargo/i, 'Rust']
];
/**
 * Assigns a category to an extracted link.
 *
 * Resolution order: the link's own section heading, then the keyword `rules`,
 * then the optional provider, and finally `failureCategory`.
 */
export class Categorizer {
  /** Built-in categories plus caller-supplied ones, de-duplicated; passed to the provider. */
  private readonly categories: string[];

  /**
   * @param categories Extra category names merged with the built-in list.
   * @param provider Optional fallback, asked only when no section or rule applies.
   * @param failureCategory Returned when nothing else yields a usable category.
   */
  public constructor(
    categories: string[] = [],
    private readonly provider?: CategoryProvider,
    private readonly failureCategory = 'Uncategorized'
  ) {
    this.categories = [...new Set([...builtIn, ...categories])];
  }

  /**
   * Resolves the category for `link`.
   *
   * @returns The trimmed section, the first matching rule's category, the
   * provider's trimmed answer, or `failureCategory`.
   */
  public async categorize(link: ExtractedLink): Promise<string> {
    const section = link.section?.trim();
    if (section) {
      return section;
    }

    const haystack = `${link.title} ${link.description ?? ''} ${link.url}`;
    const matched = rules.find(([pattern]) => pattern.test(haystack));
    if (matched) {
      return matched[1];
    }

    const suggestion = (await this.provider?.categorize(link, this.categories))?.trim();
    // `||` rather than `??`: a blank answer is as unusable as no answer at all
    // and must not end up as an empty category.
    return suggestion || this.failureCategory;
  }
}
+21
View File
@@ -0,0 +1,21 @@
/** Date-range flags as received from the command line, before validation. */
export interface DateFlags {
// Expected as YYYY-MM-DD (checked by validateDateFilters).
from?: string;
// Expected as YYYY-MM-DD (checked by validateDateFilters).
to?: string;
// Expected as a day count such as `30d`; cannot be combined with from/to.
last?: string;
}
/** Returns true when `value` is YYYY-MM-DD and names a day that exists on the calendar. */
function isCalendarDate(value: string): boolean {
  const parts = /^(\d{4})-(\d{2})-(\d{2})$/.exec(value);
  if (!parts) {
    return false;
  }
  const year = Number(parts[1]);
  const month = Number(parts[2]);
  const day = Number(parts[3]);
  // setUTCFullYear rolls impossible dates over (Feb 30 -> Mar 1), so a date is
  // real only if all three components survive the round trip.
  const probe = new Date(0);
  probe.setUTCFullYear(year, month - 1, day);
  return (
    probe.getUTCFullYear() === year &&
    probe.getUTCMonth() === month - 1 &&
    probe.getUTCDate() === day
  );
}

/**
 * Validates the `--from`, `--to` and `--last` flags.
 *
 * @param flags Raw flag values from the command line.
 * @throws Error when `--last` is combined with `--from`/`--to`, when a date is
 * not YYYY-MM-DD or is not a real calendar date, when `--from` is later than
 * `--to`, or when `--last` is not of the form `<days>d`.
 */
export function validateDateFilters(flags: DateFlags): void {
  if (flags.last && (flags.from || flags.to)) {
    throw new Error('--last cannot be combined with --from or --to');
  }

  for (const [name, value] of Object.entries({ from: flags.from, to: flags.to })) {
    if (!value) {
      continue;
    }
    if (!/^\d{4}-\d{2}-\d{2}$/.test(value)) {
      throw new Error(`--${name} must use YYYY-MM-DD`);
    }
    if (!isCalendarDate(value)) {
      throw new Error(`--${name} is not a valid calendar date: ${value}`);
    }
  }

  // Both values are validated YYYY-MM-DD here, so string order equals date order.
  if (flags.from && flags.to && flags.from > flags.to) {
    throw new Error('--from must not be later than --to');
  }

  if (flags.last && !/^\d+d$/.test(flags.last)) {
    throw new Error('--last must look like 30d');
  }
}
+86
View File
@@ -0,0 +1,86 @@
import { Command, Option } from 'commander';
import { writeFile } from 'node:fs/promises';
import { loadConfig } from '../config/config.js';
import { ExcelWriter } from '../output/excel.js';
import { runCatalog } from '../run/runCatalog.js';
import { validateDateFilters } from './flags.js';
// Starter config written by `nlc init`. The nesting is significant: the config
// schema expects `gmail.folder`, `output.name` and `output.excel.*`, so these
// keys must be indented under their parents for the generated file to load.
const sampleConfig = `gmail:
  folder: Newsletters
output:
  name: Newsletter Link Catalog
  excel:
    enabled: true
    path: ./output/newsletter-catalog.xlsx
`;
/**
 * Parses the optional count given to `--dry-run [count]`.
 *
 * Rejecting bad input here matters: `Number('abc')` is `NaN` and `Number('0')`
 * is `0`, both falsy, and a falsy dry-run value is treated downstream as a
 * normal run that writes state and output.
 *
 * @throws Error when the value is not a positive integer.
 */
function parseDryRunCount(value: string): number {
  const count = Number(value);
  if (!Number.isInteger(count) || count <= 0) {
    throw new Error(`--dry-run count must be a positive integer, received "${value}"`);
  }
  return count;
}

/**
 * Builds the `nlc` command-line program with its `init` and `run` commands.
 *
 * @returns The configured commander program; the caller is expected to parse argv.
 */
export function createProgram(): Command {
  const program = new Command();
  program.name('nlc').description('Newsletter Link Catalog').version('0.1.0');

  program
    .command('init')
    .description('Create a starter config and document OAuth credential paths')
    .option('--config <path>', 'Path to write config', './config.yaml')
    .action(async (options) => {
      try {
        // 'wx' fails instead of overwriting an existing config.
        await writeFile(options.config, sampleConfig, { flag: 'wx' });
      } catch (error) {
        if ((error as NodeJS.ErrnoException).code === 'EEXIST') {
          throw new Error(`${options.config} already exists`);
        }
        throw error;
      }
      console.log(`Wrote ${options.config}. Add OAuth JSON files under ~/.nlc before live runs.`);
    });

  program
    .command('run')
    .description('Process configured Gmail newsletter folder')
    .option('--full', 'Reprocess matching messages')
    .addOption(
      new Option('--dry-run [count]', 'Process without writing state or output')
        .argParser(parseDryRunCount)
        .preset(5)
    )
    .option('--from <date>', 'Process from YYYY-MM-DD')
    .option('--to <date>', 'Process to YYYY-MM-DD')
    .option('--last <range>', 'Process last range such as 30d')
    .option('--skip-enrich', 'Skip enrichment')
    .option('--enrich-only', 'Only run enrichment on stored links')
    .option('--config <path>', 'Config path', './config.yaml')
    .option('--verbose', 'Verbose logging')
    .action(async (options) => {
      // NOTE(review): the date flags are validated here but are not forwarded to
      // runCatalog, so they currently do not filter anything.
      validateDateFilters(options);
      const config = await loadConfig(options.config);
      const writers = config.output.excel.enabled
        ? [new ExcelWriter(config.output.excel.path)]
        : [];
      // Only the NLC_FIXTURE=1 smoke path supplies messages at the moment.
      const messages = process.env.NLC_FIXTURE === '1' ? fixtureMessages() : [];
      const summary = await runCatalog({
        config,
        messages,
        writers,
        dryRun: options.dryRun,
        full: options.full,
        skipEnrich: options.skipEnrich,
        enrichOnly: options.enrichOnly,
        verbose: options.verbose
      });
      console.log(JSON.stringify(summary, null, 2));
    });

  return program;
}
/**
 * Produces the single canned newsletter message used when `NLC_FIXTURE=1`.
 * The message is dated "now" and carries one link with a tracking parameter
 * under a `JavaScript` heading.
 */
function fixtureMessages() {
  const html =
    '<h2>JavaScript</h2><p><a href="https://example.com/post?utm_source=fixture">Fixture article</a></p>';
  const fixture = {
    id: 'fixture-1',
    messageId: '<fixture-1>',
    from: 'Fixture Weekly <fixture@example.com>',
    date: new Date().toISOString(),
    html
  };
  return [fixture];
}
+129
View File
@@ -0,0 +1,129 @@
import { readFile } from 'node:fs/promises';
import { homedir } from 'node:os';
import { resolve } from 'node:path';
import YAML from 'yaml';
import { z } from 'zod';
// Schema for the `output` section: a required catalog name plus two optional
// destinations (Google Sheets API and a local Excel file). Each destination is
// disabled unless `enabled` is set.
const outputSchema = z.object({
name: z.string().min(1),
sheetsApi: z
.object({
enabled: z.boolean().default(false),
credentials: z.string().optional(),
token: z.string().optional(),
spreadsheetId: z.string().optional()
})
.optional(),
excel: z
.object({
enabled: z.boolean().default(false),
path: z.string().default('./output/newsletter-catalog.xlsx')
})
.optional()
});
// Full application config schema. Only `gmail.folder` and `output.name` are
// required; every other section declares defaults. Keys are expected in
// camelCase here (loaders run `camelize` on the raw input first).
const configSchema = z
.object({
gmail: z.object({
folder: z.string().min(1),
credentials: z.string().default('~/.nlc/gmail-credentials.json'),
token: z.string().default('~/.nlc/gmail-token.json')
}),
output: outputSchema,
newsletters: z.record(z.string(), z.any()).default({}),
// Link extraction and cleanup behaviour.
links: z
.object({
unwrapRedirects: z.boolean().default(true),
stripUtm: z.boolean().default(true),
// A trailing `*` is a prefix wildcard (see matchesParam in the URL helpers).
trackingParams: z
.array(z.string())
.default(['utm_*', 'fbclid', 'gclid', 'mc_cid', 'mc_eid']),
redirectLimit: z.number().int().positive().default(5),
// Patterns carry a `(?i)` prefix; runCatalog strips it from readMorePattern
// and compiles the rest with the `i` flag.
readMorePattern: z.string().default('(?i)^(read more|continue reading|learn more)$'),
sharePatterns: z.array(z.string()).default(['(?i)share', '(?i)forward to a friend']),
sponsorMarkers: z
.array(z.string())
.default(['(?i)sponsor', '(?i)sponsored', '(?i)advertisement', '(?i)partner']),
filterUnsubscribe: z.boolean().default(true),
filterSocialFooter: z.boolean().default(true),
filterShareLinks: z.boolean().default(true),
mergeReadMore: z.boolean().default(true)
})
.default({}),
// Custom category names and the LLM used as a categorization fallback.
categories: z
.object({
custom: z.array(z.string()).default([]),
llm: z
.object({
provider: z
.enum(['anthropic', 'openai', 'local', 'openai-compatible'])
.default('anthropic'),
model: z.string().default('claude-sonnet-4-6'),
// Name of the environment variable holding the API key, not the key itself.
apiKeyEnv: z.string().default('ANTHROPIC_API_KEY'),
baseUrl: z.string().nullable().optional(),
failureCategory: z.string().default('Uncategorized')
})
.default({})
})
.default({}),
enrichment: z
.object({
enabled: z.boolean().default(true),
concurrency: z.number().int().positive().default(3),
delayMs: z.number().int().nonnegative().default(1500),
retries: z.number().int().nonnegative().default(2),
timeoutMs: z.number().int().positive().default(10000)
})
.default({}),
rateLimit: z
.object({
gmailQps: z.number().positive().default(5),
linkConcurrency: z.number().int().positive().default(3)
})
.default({}),
stateFile: z.string().default('~/.nlc/state.json'),
plugins: z.record(z.string(), z.any()).default({})
})
// Fill in the optional output destinations so consumers can read
// `output.sheetsApi.enabled` / `output.excel.enabled` without null checks.
.transform((config) => ({
...config,
output: {
...config.output,
sheetsApi: config.output.sheetsApi ?? { enabled: false },
excel: config.output.excel ?? { enabled: false, path: './output/newsletter-catalog.xlsx' }
}
}));
/** Fully validated configuration, i.e. the output type of `configSchema` with defaults applied. */
export type AppConfig = z.infer<typeof configSchema>;
/** Unvalidated configuration input accepted by `normalizeConfig`. */
export type PartialConfig = Record<string, unknown>;
/**
 * Recursively converts snake_case object keys to camelCase.
 *
 * Arrays are mapped element by element, objects are rebuilt with converted
 * keys, and every other value (including `null`) is returned untouched. Only
 * an underscore followed by a lowercase ASCII letter is rewritten.
 */
function camelize(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map((item) => camelize(item));
  }
  if (!value || typeof value !== 'object') {
    return value;
  }

  const pairs: Array<[string, unknown]> = [];
  for (const [key, entry] of Object.entries(value as Record<string, unknown>)) {
    const camelKey = key.replace(/_([a-z])/g, (_match, letter: string) => letter.toUpperCase());
    pairs.push([camelKey, camelize(entry)]);
  }
  return Object.fromEntries(pairs);
}
/**
 * Expands a leading `~` to the current user's home directory.
 *
 * Handles both `~/rest/of/path` and a bare `~`. Any other input, including
 * `~user/...`, is returned unchanged.
 *
 * @param path Path that may start with `~`.
 * @returns The expanded path, or `path` itself when no expansion applies.
 */
export function expandHome(path: string): string {
  if (path === '~') {
    return homedir();
  }
  return path.startsWith('~/') ? resolve(homedir(), path.slice(2)) : path;
}
/**
 * Parses YAML text into a validated config.
 *
 * An empty document is treated as `{}`; keys are camelized before validation.
 * Validation failures propagate from the schema's `parse`.
 */
export function loadConfigFromString(source: string): AppConfig {
  const document: unknown = YAML.parse(source);
  const normalized = camelize(document ?? {});
  return configSchema.parse(normalized);
}
/**
 * Reads the YAML config file at `path` (with `~/` expanded) and validates it.
 * Read and validation errors propagate to the caller.
 */
export async function loadConfig(path: string): Promise<AppConfig> {
  const location = expandHome(path);
  const source = await readFile(location, 'utf8');
  return loadConfigFromString(source);
}
/**
 * Validates an in-memory config object, camelizing its keys first and
 * applying schema defaults. Validation failures propagate from `parse`.
 */
export function normalizeConfig(config: PartialConfig): AppConfig {
  const camelCased = camelize(config);
  return configSchema.parse(camelCased);
}
+34
View File
@@ -0,0 +1,34 @@
import * as cheerio from 'cheerio';
/**
 * Page fetcher injected into `enrichLink`. Resolves with the HTTP status, the
 * URL that was finally reached, and the response body as HTML text.
 */
export type FetchPage = (
url: string
) => Promise<{ status: number; finalUrl: string; html: string }>;
/**
 * Outcome of enriching one link, discriminated by `status`:
 * `ok` carries the page title/description text, `dead` the HTTP status as a
 * string, `paywall` a fixed marker, and `unreachable` the failure message.
 */
export type EnrichmentResult =
| { status: 'ok'; titleMeta: string }
| { status: 'dead'; error: string }
| { status: 'paywall'; titleMeta: '[paywall]' }
| { status: 'unreachable'; titleMeta: string; error: string };
/**
 * Fetches a link and summarizes what was found there.
 *
 * - HTTP status of 400 or above yields `dead`.
 * - A final URL whose path or query mentions login/signin/subscribe/paywall
 *   yields `paywall`.
 * - Otherwise the page `<title>` and meta description are joined with ` - `.
 * - Any thrown error (fetch failure, unparseable final URL) yields `unreachable`.
 *
 * @param url Link to enrich.
 * @param fetchPage Injected fetcher, so callers control networking.
 */
export async function enrichLink(url: string, fetchPage: FetchPage): Promise<EnrichmentResult> {
  try {
    const page = await fetchPage(url);
    if (page.status >= 400) {
      return { status: 'dead', error: String(page.status) };
    }

    const landed = new URL(page.finalUrl);
    const gatedHint = /login|signin|subscribe|paywall/i;
    if (gatedHint.test(landed.pathname + landed.search)) {
      return { status: 'paywall', titleMeta: '[paywall]' };
    }

    const $ = cheerio.load(page.html);
    const parts = [
      $('title').first().text().trim(),
      $('meta[name="description"]').attr('content')?.trim() ?? ''
    ];
    // Empty parts are dropped so a missing description leaves no dangling separator.
    return { status: 'ok', titleMeta: parts.filter(Boolean).join(' - ') };
  } catch (error) {
    const message = error instanceof Error ? error.message : 'network_error';
    return { status: 'unreachable', titleMeta: `[unreachable: ${message}]`, error: message };
  }
}
+59
View File
@@ -0,0 +1,59 @@
import { createServer } from 'node:http';
import { readFile, writeFile, mkdir } from 'node:fs/promises';
import { dirname } from 'node:path';
import open from 'open';
import { google, gmail_v1 } from 'googleapis';
import { expandHome } from '../config/config.js';
import { NewsletterMessage } from '../parsing/types.js';
// OAuth scopes requested during authorization: read-only Gmail access.
const gmailScopes = ['https://www.googleapis.com/auth/gmail.readonly'];
/**
 * Returns an authorized OAuth2 client for Gmail.
 *
 * A cached token at `tokenPath` is reused when it can be read and parsed.
 * Otherwise the browser consent flow runs and the new token is saved for next
 * time.
 *
 * @param credentialsPath OAuth client JSON (with an `installed` or `web` section); `~/` is expanded.
 * @param tokenPath Where the token is cached; `~/` is expanded.
 * @throws Error when the credentials file has no usable OAuth client section.
 */
export async function authorizeGmail(credentialsPath: string, tokenPath: string) {
  const credentials = JSON.parse(await readFile(expandHome(credentialsPath), 'utf8'));
  const clientConfig = credentials?.installed ?? credentials?.web;
  if (!clientConfig?.client_id) {
    // Fail with a readable message instead of a TypeError on `undefined.client_id`.
    throw new Error(`${credentialsPath} does not contain an "installed" or "web" OAuth client`);
  }

  const oauth = new google.auth.OAuth2(
    clientConfig.client_id,
    clientConfig.client_secret,
    'http://127.0.0.1:53682/oauth2callback'
  );
  const resolvedTokenPath = expandHome(tokenPath);

  try {
    oauth.setCredentials(JSON.parse(await readFile(resolvedTokenPath, 'utf8')));
    return oauth;
  } catch {
    // No usable cached token (missing or unparseable): fall through to the browser flow.
  }

  const url = oauth.generateAuthUrl({ access_type: 'offline', scope: gmailScopes });
  const code = await waitForBrowserCode(url);
  const { tokens } = await oauth.getToken(code);
  oauth.setCredentials(tokens);
  await mkdir(dirname(resolvedTokenPath), { recursive: true });
  // The token grants mailbox access, so create the file readable by the owner only.
  await writeFile(resolvedTokenPath, `${JSON.stringify(tokens, null, 2)}\n`, { mode: 0o600 });
  return oauth;
}
/**
 * Opens the consent URL in the browser and waits for the OAuth redirect on
 * `127.0.0.1:53682`.
 *
 * @param url Consent URL to open.
 * @returns The authorization code from the redirect's `code` query parameter.
 * @throws Error when the redirect reports an `error`, when the local server
 * cannot listen (e.g. port in use), or when the browser cannot be opened.
 */
async function waitForBrowserCode(url: string): Promise<string> {
  return new Promise((resolveCode, reject) => {
    const server = createServer((req, res) => {
      const requestUrl = new URL(req.url ?? '/', 'http://127.0.0.1:53682');
      const code = requestUrl.searchParams.get('code');
      const denied = requestUrl.searchParams.get('error');

      if (code) {
        res.end('Newsletter Link Catalog authorization complete. You can close this tab.');
        server.close();
        resolveCode(code);
        return;
      }

      if (denied) {
        // Consent was refused or failed: settle the promise instead of waiting forever.
        res.statusCode = 400;
        res.end('Newsletter Link Catalog authorization failed. You can close this tab.');
        server.close();
        reject(new Error(`Gmail authorization failed: ${denied}`));
        return;
      }

      // Unrelated request (e.g. favicon): answer it so the browser does not hang.
      res.statusCode = 404;
      res.end();
    });

    // Without this, a listen failure such as EADDRINUSE would never reject the promise.
    server.once('error', reject);

    // Bind to loopback only; the redirect URI targets 127.0.0.1.
    server.listen(53682, '127.0.0.1', () => {
      open(url).catch((error: unknown) => {
        server.close();
        reject(error);
      });
    });
  });
}
/** Thin wrapper around the Gmail API client used to fetch newsletter messages. */
export class GmailClient {
  public constructor(private readonly gmail: gmail_v1.Gmail) {}

  /**
   * Placeholder for live Gmail traversal: it lists the account's labels and
   * then returns no messages. `_label` is currently unused.
   */
  public async fetchMessages(_label: string): Promise<NewsletterMessage[]> {
    // Live Gmail traversal is isolated here. The run path accepts injected messages for tests and smoke.
    const noMessages: NewsletterMessage[] = [];
    await this.gmail.users.labels.list({ userId: 'me' });
    return noMessages;
  }
}
+9
View File
@@ -0,0 +1,9 @@
import { createProgram } from './cli/program.js';
/** Prints a CLI failure to stderr and marks the process as failed without exiting abruptly. */
function reportFailure(error: unknown): void {
  const message = error instanceof Error ? error.message : String(error);
  console.error(`nlc: ${message}`);
  process.exitCode = 1;
}

// Entry point: parse argv and route any rejected command to the reporter.
createProgram().parseAsync(process.argv).catch(reportFailure);
+28
View File
@@ -0,0 +1,28 @@
import { ExtractedLink } from '../parsing/types.js';
// Social network hosts; links to these are treated as noise when they appear in footer context.
const socialHosts = ['twitter.com', 'x.com', 'facebook.com', 'linkedin.com', 'instagram.com'];
/**
 * Tells whether a link is a "view this email on the web" mirror link, judged
 * solely by its trimmed title (case-insensitive, whole-title match).
 */
export function isMirrorLink(link: Pick<ExtractedLink, 'title'>): boolean {
  const label = link.title.trim();
  const mirrorLabel = /^(view in browser|view online|read online)$/i;
  return mirrorLabel.test(label);
}
/** Extracts the hostname (without a leading `www.`) from an http(s) URL, or `''` when it cannot be parsed. */
function noiseLinkHost(url: string): string {
  if (!url.startsWith('http')) {
    return '';
  }
  try {
    return new URL(url).hostname.replace(/^www\./, '');
  } catch {
    // A malformed href must not abort filtering for the whole newsletter.
    return '';
  }
}

/**
 * Tells whether a link is boilerplate rather than content: unsubscribe links,
 * share/forward prompts, "view in browser" mirrors, and social-network links
 * that sit in footer context.
 *
 * @param link Link to inspect; missing fields are treated as empty.
 */
export function isNoiseLink(link: Partial<ExtractedLink>): boolean {
  const text = `${link.title ?? ''} ${link.context ?? ''}`.toLowerCase();
  const url = link.url ?? '';

  // URLs are not lower-cased like `text`, so match them case-insensitively.
  if (/unsubscribe/.test(text) || /unsubscribe/i.test(url)) {
    return true;
  }
  if (/share this newsletter|forward to a friend/.test(text)) {
    return true;
  }
  if (isMirrorLink({ title: link.title ?? '' })) {
    return true;
  }
  if (!(link.context ?? '').toLowerCase().includes('footer')) {
    return false;
  }

  const host = noiseLinkHost(url);
  // Match the host itself or a true subdomain. A plain `endsWith(site)` would
  // also treat netflix.com or dropbox.com as `x.com`.
  return socialHosts.some((site) => host === site || host.endsWith(`.${site}`));
}
/**
 * Tells whether a link looks sponsored: its section, context or title
 * mentions sponsor/sponsored/advertisement/partner (case-insensitive).
 */
export function isSponsorLink(link: Partial<ExtractedLink>): boolean {
  const fields = [link.section ?? '', link.context ?? '', link.title ?? ''];
  const sponsorHint = /sponsor|sponsored|advertisement|partner/i;
  return sponsorHint.test(fields.join(' '));
}
+57
View File
@@ -0,0 +1,57 @@
import { ExtractedLink } from '../parsing/types.js';
/** Options controlling how `cleanupUrl` normalizes a link. */
export interface CleanupOptions {
// Query parameter names to remove; a trailing `*` matches by prefix.
trackingParams: string[];
// When true, a destination URL carried in a redirect-style query parameter replaces the wrapper URL.
unwrapRedirects?: boolean;
}
/**
 * Tests a query parameter name against a tracking pattern. A pattern ending
 * in `*` matches by prefix; any other pattern must equal the name exactly.
 */
function matchesParam(name: string, pattern: string): boolean {
  if (!pattern.endsWith('*')) {
    return name === pattern;
  }
  const prefix = pattern.slice(0, -1);
  return name.startsWith(prefix);
}
/**
 * Follows a redirect wrapper to its destination when the wrapper carries the
 * target in one of the usual query parameters.
 *
 * Only destinations that parse as absolute `http:`/`https:` URLs are followed.
 * Malformed values and other schemes are skipped, so a bad parameter cannot
 * throw or redirect to an unexpected protocol.
 *
 * @param url Possibly wrapped URL.
 * @returns The destination URL, or `url` itself when nothing usable is found.
 */
function unwrapProviderRedirect(url: URL): URL {
  for (const key of ['url', 'u', 'target', 'redirect', 'redirect_url']) {
    const destination = url.searchParams.get(key);
    if (!destination) {
      continue;
    }

    let target: URL;
    try {
      target = new URL(destination);
    } catch {
      continue;
    }

    if (target.protocol === 'http:' || target.protocol === 'https:') {
      return target;
    }
  }
  return url;
}
/**
 * Normalizes a link for de-duplication: optionally unwraps redirect wrappers,
 * removes tracking query parameters, drops the fragment, and trims a dangling `?`.
 *
 * @param rawUrl Absolute URL; an unparseable value makes `new URL` throw.
 * @param options Tracking parameter patterns and the unwrap switch.
 * @returns The normalized URL string.
 */
export function cleanupUrl(rawUrl: string, options: CleanupOptions): string {
  const parsed = new URL(rawUrl);
  const target = options.unwrapRedirects ? unwrapProviderRedirect(parsed) : parsed;

  const isTracking = (name: string): boolean =>
    options.trackingParams.some((pattern) => matchesParam(name, pattern));

  // Snapshot the keys first: deleting while iterating the live list would skip entries.
  Array.from(target.searchParams.keys())
    .filter(isTracking)
    .forEach((name) => target.searchParams.delete(name));

  target.hash = '';
  const text = target.toString();
  return text.endsWith('?') ? text.slice(0, -1) : text;
}
/**
 * Collapses a "read more"-style link into the link right before it when both
 * point at the same normalized URL.
 *
 * The earlier link keeps its title and other fields but takes over the
 * follow-up's `url`/`normalizedUrl`. Input objects are copied, never mutated.
 *
 * @param links Links in document order.
 * @param readMorePattern Pattern tested against each trimmed title.
 * @returns A new array with the follow-up links folded away.
 */
export function mergeReadMoreLinks(
  links: ExtractedLink[],
  readMorePattern: RegExp
): ExtractedLink[] {
  const merged: ExtractedLink[] = [];

  links.forEach((link) => {
    const last = merged[merged.length - 1];
    const pointsToSameUrl =
      last !== undefined &&
      Boolean(last.normalizedUrl) &&
      last.normalizedUrl === link.normalizedUrl;

    if (last !== undefined && pointsToSameUrl && readMorePattern.test(link.title.trim())) {
      last.url = link.url;
      last.normalizedUrl = link.normalizedUrl;
      return;
    }

    merged.push({ ...link });
  });

  return merged;
}
+79
View File
@@ -0,0 +1,79 @@
import { ExtractedLink } from '../parsing/types.js';
/** Contract implemented by the LLM-backed categorizers in this module. */
export interface LlmProvider {
/** Asks the model to pick a category for `link` from `categories`; may resolve to `undefined`. */
categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined>;
}
/** Connection settings shared by the provider classes. */
interface ProviderOptions {
// API key; when absent the OpenAI-compatible providers send no authorization header.
apiKey?: string;
// Overrides the provider's default API base URL when set.
baseUrl?: string | null;
// Model identifier sent with each request.
model: string;
}
/**
 * POSTs `body` as JSON and returns the parsed JSON response.
 *
 * @param url Endpoint to call.
 * @param apiKey Sent as a bearer token when provided; omitted otherwise.
 * @param body Value serialized with `JSON.stringify`.
 * @throws Error when the response status is not OK.
 */
async function postJson(url: string, apiKey: string | undefined, body: unknown): Promise<any> {
  const headers: Record<string, string> = { 'content-type': 'application/json' };
  if (apiKey) {
    headers.authorization = `Bearer ${apiKey}`;
  }

  const response = await fetch(url, { method: 'POST', headers, body: JSON.stringify(body) });
  if (response.ok) {
    return response.json();
  }
  throw new Error(`LLM request failed: ${response.status}`);
}
/** Builds the single-turn instruction asking the model to choose one category for `link`. */
function prompt(link: ExtractedLink, categories: string[]): string {
  const choices = categories.join(', ');
  const question = `Choose the best newsletter category from ${choices} for: ${link.title} ${link.url}.`;
  return `${question} Return only the category.`;
}
/**
 * Categorizer for any server exposing an OpenAI-style `/chat/completions`
 * endpoint. Defaults to the public OpenAI API when no base URL is configured.
 */
export class OpenAiCompatibleProvider implements LlmProvider {
  public constructor(private readonly options: ProviderOptions) {}

  /**
   * Sends one deterministic (temperature 0) chat request.
   *
   * @returns The trimmed text of the first choice, or `undefined` when the response has none.
   * @throws Error when the HTTP request fails (see `postJson`).
   */
  public async categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined> {
    const { apiKey, baseUrl, model } = this.options;
    const endpoint = `${baseUrl ?? 'https://api.openai.com/v1'}/chat/completions`;
    const payload = {
      model,
      messages: [{ role: 'user', content: prompt(link, categories) }],
      temperature: 0
    };

    const data = await postJson(endpoint, apiKey, payload);
    return data.choices?.[0]?.message?.content?.trim();
  }
}
/** Provider for the OpenAI API; inherits all behavior from OpenAiCompatibleProvider unchanged. */
export class OpenAiProvider extends OpenAiCompatibleProvider {}
/** Provider for a local model server; inherits all behavior from OpenAiCompatibleProvider unchanged. */
export class LocalProvider extends OpenAiCompatibleProvider {}
/** Categorizer backed by the Anthropic Messages API (`/v1/messages`). */
export class AnthropicProvider implements LlmProvider {
  public constructor(private readonly options: ProviderOptions) {}

  /**
   * Sends one message request capped at 64 output tokens.
   *
   * @returns The trimmed text of the first content block, or `undefined` when absent.
   * @throws Error when the response status is not OK.
   */
  public async categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined> {
    const endpoint = `${this.options.baseUrl ?? 'https://api.anthropic.com'}/v1/messages`;
    const headers = {
      'content-type': 'application/json',
      'x-api-key': this.options.apiKey ?? '',
      'anthropic-version': '2023-06-01'
    };
    const body = JSON.stringify({
      model: this.options.model,
      max_tokens: 64,
      messages: [{ role: 'user', content: prompt(link, categories) }]
    });

    const response = await fetch(endpoint, { method: 'POST', headers, body });
    if (!response.ok) {
      throw new Error(`Anthropic request failed: ${response.status}`);
    }

    const data = await response.json();
    return data.content?.[0]?.text?.trim();
  }
}
+32
View File
@@ -0,0 +1,32 @@
import { mkdir } from 'node:fs/promises';
import { dirname } from 'node:path';
import XLSX from 'xlsx';
import { CatalogPayload, OutputWriter, sanitizeSheetName } from './sheets.js';
/**
 * Produces a worksheet name Excel accepts. `sanitizeSheetName` allows up to
 * 100 characters, but Excel limits sheet names to 31, and appending a longer
 * name to the workbook throws.
 */
function excelSheetName(name: string): string {
  return sanitizeSheetName(name).slice(0, 31).trim() || 'Newsletter';
}

/**
 * Writes the catalog to an `.xlsx` workbook: one sheet per source newsletter,
 * followed by `Sponsored Links` and `Dead Links` sheets.
 */
export class ExcelWriter implements OutputWriter {
  public constructor(private readonly path: string) {}

  /**
   * Builds the workbook in memory and writes it to `path`, creating the
   * parent directory when needed. An existing file is replaced.
   *
   * Rows are grouped by their `Source Newsletter` value; newsletters whose
   * names are identical after sanitizing and truncation share a sheet.
   */
  public async write(payload: CatalogPayload): Promise<void> {
    const workbook = XLSX.utils.book_new();
    const grouped = new Map<string, Record<string, unknown>[]>();

    for (const row of payload.rows) {
      const sheet = excelSheetName(String(row['Source Newsletter'] ?? 'Newsletter'));
      const bucket = grouped.get(sheet);
      // Append in place; rebuilding the array for every row is quadratic.
      if (bucket) {
        bucket.push(row);
      } else {
        grouped.set(sheet, [row]);
      }
    }

    for (const [sheet, rows] of grouped) {
      XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet(rows), sheet);
    }
    XLSX.utils.book_append_sheet(
      workbook,
      XLSX.utils.json_to_sheet(payload.sponsors),
      'Sponsored Links'
    );
    XLSX.utils.book_append_sheet(
      workbook,
      XLSX.utils.json_to_sheet(payload.deadLinks),
      'Dead Links'
    );

    await mkdir(dirname(this.path), { recursive: true });
    XLSX.writeFile(workbook, this.path);
  }
}
+15
View File
@@ -0,0 +1,15 @@
import { google } from 'googleapis';
import { CatalogPayload, OutputWriter } from './sheets.js';
/**
 * Output writer targeting a Google Sheets spreadsheet. Currently a
 * placeholder: it only confirms the spreadsheet can be fetched.
 */
export class GoogleSheetsWriter implements OutputWriter {
  public constructor(
    private readonly spreadsheetId: string,
    private readonly auth: Parameters<typeof google.sheets>[0]['auth']
  ) {}

  /** Fetches the spreadsheet metadata; `_payload` is not written yet. */
  public async write(_payload: CatalogPayload): Promise<void> {
    const { spreadsheetId, auth } = this;
    const api = google.sheets({ version: 'v4', auth });
    await api.spreadsheets.get({ spreadsheetId });
    // Real row append calls are intentionally centralized here; tests use a fake writer.
  }
}
+23
View File
@@ -0,0 +1,23 @@
// Characters that are not allowed in sheet names: : / \ ? * [ ]
const invalidSheetCharacters = /[:/\\?*[\]]/g;

/**
 * Turns arbitrary text into a usable sheet name: forbidden characters become
 * spaces, whitespace runs collapse to one space, and the result is trimmed.
 * An empty result falls back to `Newsletter`; the name is capped at 100 characters.
 */
export function sanitizeSheetName(input: string): string {
  const spaced = input.replace(invalidSheetCharacters, ' ');
  const collapsed = spaced.trim().split(/\s+/).join(' ');
  const name = collapsed === '' ? 'Newsletter' : collapsed;
  return name.substring(0, 100);
}
/**
 * Neutralizes spreadsheet formula injection for a single cell value.
 *
 * Strings beginning with a formula trigger (`=`, `+`, `-`, `@`, tab or
 * carriage return) are prefixed with an apostrophe so spreadsheet
 * applications show them as text. Non-strings pass through untouched.
 *
 * @param value Cell value of any type.
 * @returns The escaped string, or `value` itself when no escaping is needed.
 */
export function escapeCell(value: unknown): unknown {
  if (typeof value !== 'string') {
    return value;
  }
  return /^[=+\-@\t\r]/.test(value) ? `'${value}` : value;
}
/** Everything one run hands to an output writer; each entry is a row keyed by column name. */
export interface CatalogPayload {
// Regular catalog rows.
rows: Record<string, unknown>[];
// Rows for links identified as sponsored.
sponsors: Record<string, unknown>[];
// Rows for links found to be dead.
deadLinks: Record<string, unknown>[];
}
/** Destination for a catalog payload (implemented by the Excel and Google Sheets writers). */
export interface OutputWriter {
/** Persists the payload; the resolved value is not part of the contract. */
write(payload: CatalogPayload): Promise<unknown>;
}
+42
View File
@@ -0,0 +1,42 @@
import * as cheerio from 'cheerio';
import { ExtractedLink, ParserInput, ParserPlugin } from './types.js';
/**
 * Finds the text that most plausibly titles the section an element belongs to.
 *
 * Looks first at the element's own preceding siblings (headings, `strong`,
 * `b`), then at its parent's preceding siblings (headings, `p`, `tr`).
 *
 * @returns The trimmed text of the closest match, or `undefined` when none has text.
 */
function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined {
  const siblingHeading = $(element)
    .prevAll('h1,h2,h3,h4,h5,h6,strong,b')
    .first()
    .text()
    .trim();
  if (siblingHeading) {
    return siblingHeading;
  }

  const parentSiblings = $(element).parent().prevAll('h1,h2,h3,h4,h5,h6,p,tr');
  const parentHeading = parentSiblings.first().text().trim();
  return parentHeading === '' ? undefined : parentHeading;
}
/**
 * Catch-all parser: accepts every message and turns each `<a href>` into an
 * extracted link with its title, surrounding context and nearest section.
 */
export const genericParser: ParserPlugin = {
  name: 'generic',
  matches: () => true,
  parse(input: ParserInput): ExtractedLink[] {
    const $ = cheerio.load(input.html);
    const collapse = (text: string): string => text.replace(/\s+/g, ' ').trim();
    const links: ExtractedLink[] = [];

    for (const element of $('a[href]').toArray()) {
      const anchor = $(element);
      const url = anchor.attr('href') ?? '';
      // Anchors with an empty href carry nothing to catalog.
      if (!url) {
        continue;
      }

      // Prefer visible text; fall back to aria-label for image-only links.
      const title = collapse(anchor.text()) || anchor.attr('aria-label') || '';
      const context = collapse(anchor.closest('p,li,td,div').text());
      links.push({
        url,
        title,
        // The context is only a description when it adds something beyond the title.
        description: context && context !== title ? context : '',
        sourceText: title,
        section: nearestSection($, element),
        context
      });
    }

    return links;
  }
};
+17
View File
@@ -0,0 +1,17 @@
import { genericParser } from './generic.js';
import { ParserInput, ParserPlugin } from './types.js';
/**
 * Parser selected for Substack newsletters. Detection looks for
 * `substack.com` or a post-preview test id in the List-Id header, the From
 * header or the HTML; extraction is delegated to the generic parser.
 */
export const substackParser: ParserPlugin = {
  name: 'substack',
  matches(input: ParserInput) {
    const { headers, html } = input;
    const haystack = `${headers?.listId ?? ''} ${headers?.from ?? ''} ${html}`;
    const substackHint = /substack\.com|data-testid="post-preview"/i;
    return substackHint.test(haystack);
  },
  parse: (input: ParserInput) => genericParser.parse(input)
};
/**
 * Picks the parser for a message: the first of Substack, then generic, whose
 * `matches` accepts the input, with the generic parser as the final fallback.
 */
export function selectParser(input: ParserInput): ParserPlugin {
  for (const candidate of [substackParser, genericParser]) {
    if (candidate.matches(input)) {
      return candidate;
    }
  }
  return genericParser;
}
+32
View File
@@ -0,0 +1,32 @@
/** A hyperlink pulled out of a newsletter, plus the text around it. */
export interface ExtractedLink {
// The href as found in the message.
url: string;
// Cleaned form of `url` used for de-duplication (set during the run via cleanupUrl).
normalizedUrl?: string;
// Link text.
title: string;
description?: string;
sourceText?: string;
// Text of the nearest heading-like element; the categorizer returns it as the category when present.
section?: string;
// Text of the enclosing block element.
context?: string;
// NOTE(review): `sponsor` and `isSponsor` are not assigned anywhere in the visible sources — confirm intended use.
sponsor?: string;
isSponsor?: boolean;
}
/** One newsletter email as consumed by the catalog run. */
export interface NewsletterMessage {
id: string;
// Used as the key when recording a message as processed.
messageId: string;
// Sender header; the display name before `<` becomes the newsletter name.
from: string;
// Date string parsed with `new Date(...)` to derive the issue date.
date: string;
subject?: string;
// HTML body handed to the parsers.
html: string;
headers?: Record<string, string | undefined>;
}
/** What a parser plugin receives for one message. */
export interface ParserInput {
html: string;
// Optional message headers; the Substack detector reads `listId` and `from`.
headers?: Record<string, string | undefined>;
}
/** A newsletter-format-specific link extractor. */
export interface ParserPlugin {
name: string;
/** Returns true when this plugin should handle the message. */
matches(input: ParserInput): boolean;
/** Extracts the links contained in the message HTML. */
parse(input: ParserInput): ExtractedLink[];
}
+117
View File
@@ -0,0 +1,117 @@
import { normalizeConfig, PartialConfig } from '../config/config.js';
import { Categorizer } from '../categorization/categorizer.js';
import { isNoiseLink, isSponsorLink } from '../links/filtering.js';
import { cleanupUrl, mergeReadMoreLinks } from '../links/url.js';
import { OutputWriter } from '../output/sheets.js';
import { selectParser } from '../parsing/plugins.js';
import { NewsletterMessage } from '../parsing/types.js';
import { StateStore } from '../state/state.js';
/** Inputs for one catalog run. */
export interface RunOptions {
// Raw or already-validated configuration; it is validated again inside the run.
config: PartialConfig;
// Messages to process (injected by the caller).
messages: NewsletterMessage[];
// Output destinations that receive the final payload.
writers: OutputWriter[];
// Dry-run switch (CLI: "Process without writing state or output"); a number also caps how many messages are taken.
dryRun?: number | boolean;
// Reprocess messages even if they are recorded as processed.
full?: boolean;
// CLI: "Skip enrichment".
skipEnrich?: boolean;
// CLI: "Only run enrichment on stored links".
enrichOnly?: boolean;
// CLI: "Verbose logging".
verbose?: boolean;
}
/** Counters reported at the end of a run. */
export interface RunSummary {
// Number of newsletters counted for this run.
newslettersProcessed: number;
// Number of regular (non-sponsor) catalog rows produced.
linksExtracted: number;
// Number of sponsor rows produced.
sponsors: number;
// Number of dead links found.
deadLinks: number;
// Number of messages whose processing threw.
errors: number;
}
/**
 * Derives the newsletter's display name from a From header. Uses the text
 * before `<` when non-empty, otherwise the whole header; one surrounding
 * double quote on each side is removed and the result trimmed.
 */
function newsletterName(from: string): string {
  const displayName = /^(.*?)\s*</.exec(from)?.[1];
  const raw = displayName ? displayName : from;
  return raw.replace(/^"|"$/g, '').trim();
}
/**
 * Converts a date string to its UTC calendar day (`YYYY-MM-DD`).
 * An unparseable date makes `toISOString` throw a `RangeError`.
 */
function issueDate(date: string): string {
  const parsed = new Date(date);
  const iso = parsed.toISOString();
  return iso.substring(0, 10);
}
/**
 * Runs the catalog pipeline over the supplied messages: parse, filter noise,
 * normalize URLs, merge "read more" duplicates, split off sponsors,
 * categorize, and hand the rows to the writers.
 *
 * Failure handling:
 * - A link whose URL cannot be processed is skipped on its own.
 * - A message that fails contributes no rows and increments `errors`.
 * - Messages are recorded as processed only after every writer has succeeded,
 *   so a failed write leaves them eligible for the next run.
 *
 * @param options Messages, writers, config and run switches.
 * @returns Counters describing what the run did.
 */
export async function runCatalog(options: RunOptions): Promise<RunSummary> {
  const config = normalizeConfig(options.config);
  const state = new StateStore(config.stateFile);
  const categorizer = new Categorizer(
    config.categories.custom,
    undefined,
    config.categories.llm.failureCategory
  );

  // Anything other than undefined/false requests a dry run. A truthiness check
  // would treat 0 or NaN as a real run and write state and output.
  const isDryRun = options.dryRun !== undefined && options.dryRun !== false;
  const limit =
    typeof options.dryRun === 'number' && Number.isInteger(options.dryRun) && options.dryRun > 0
      ? options.dryRun
      : undefined;
  const messages = limit === undefined ? options.messages : options.messages.slice(0, limit);

  // The config writes patterns with a `(?i)` prefix, which JavaScript does not
  // accept; strip it and use the `i` flag instead. Built once, outside the loop.
  const readMorePattern = config.links.mergeReadMore
    ? new RegExp(config.links.readMorePattern.replace('(?i)', ''), 'i')
    : undefined;

  const describe = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  const rows: Record<string, unknown>[] = [];
  const sponsors: Record<string, unknown>[] = [];
  const processedIds: string[] = [];
  let errors = 0;

  for (const message of messages) {
    if (!options.full && !isDryRun && (await state.isProcessed(message.messageId))) {
      continue;
    }

    try {
      const parserInput = { html: message.html, headers: message.headers };
      const parsed = selectParser(parserInput).parse(parserInput);

      // Guard each link separately: hrefs such as "#" or relative paths are not
      // absolute URLs and must not cost the message all of its other links.
      const cleaned = parsed.flatMap((link) => {
        try {
          if (isNoiseLink(link)) {
            return [];
          }
          const normalizedUrl = cleanupUrl(link.url, {
            trackingParams: config.links.trackingParams,
            unwrapRedirects: config.links.unwrapRedirects
          });
          return [{ ...link, normalizedUrl }];
        } catch (error) {
          if (options.verbose) {
            console.error(`nlc: skipping link "${link.url}": ${describe(error)}`);
          }
          return [];
        }
      });

      const merged = readMorePattern ? mergeReadMoreLinks(cleaned, readMorePattern) : cleaned;
      const unique = [...new Map(merged.map((link) => [link.normalizedUrl, link])).values()];

      // Collect per message first so a failure midway leaves no partial rows behind.
      const messageRows: Record<string, unknown>[] = [];
      const messageSponsors: Record<string, unknown>[] = [];
      const source = newsletterName(message.from);

      for (const link of unique) {
        if (isSponsorLink(link)) {
          messageSponsors.push({
            Newsletter: source,
            Sponsor: link.title,
            Link: link.normalizedUrl,
            Description: link.description ?? ''
          });
          continue;
        }
        messageRows.push({
          'Issue Date': issueDate(message.date),
          Category: await categorizer.categorize(link),
          'Link URL': link.normalizedUrl,
          Title: link.title,
          Description: link.description ?? '',
          'Page Title + Meta': '',
          'Source Newsletter': source,
          'Also In': ''
        });
      }

      rows.push(...messageRows);
      sponsors.push(...messageSponsors);
      processedIds.push(message.messageId);
    } catch (error) {
      errors += 1;
      if (options.verbose) {
        console.error(`nlc: failed to process message ${message.messageId}: ${describe(error)}`);
      }
    }
  }

  if (!isDryRun) {
    for (const writer of options.writers) {
      await writer.write({ rows, sponsors, deadLinks: [] });
    }
    // Record state only once the output exists; otherwise a failed write would
    // drop these messages from every later incremental run.
    for (const messageId of processedIds) {
      await state.markProcessed(messageId);
    }
  }

  return {
    // Counts messages actually parsed in this run, not skipped or failed ones.
    newslettersProcessed: processedIds.length,
    linksExtracted: rows.length,
    sponsors: sponsors.length,
    deadLinks: 0,
    errors
  };
}
+38
View File
@@ -0,0 +1,38 @@
import { mkdir, readFile, writeFile } from 'node:fs/promises';
import { dirname } from 'node:path';
import { expandHome } from '../config/config.js';
/** Shape of the JSON state file. */
interface StateData {
// Message ids recorded as processed.
processedMessageIds: string[];
// NOTE(review): not read or written by StateStore in the visible code — presumably enrichment results keyed by URL; confirm.
enrichment: Record<string, string>;
}
/**
 * JSON-file-backed record of which messages have been processed.
 * The file is re-read on every call; `~/` in the path is expanded.
 */
export class StateStore {
  public constructor(private readonly path: string) {}

  /**
   * Loads the state file.
   *
   * A missing or empty file means "no state yet". Every other problem
   * (permission error, invalid JSON, non-object content) is raised: starting
   * from an empty state would make the next write erase the recorded history.
   */
  private async read(): Promise<StateData> {
    let raw: string;
    try {
      raw = await readFile(expandHome(this.path), 'utf8');
    } catch (error) {
      if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
        return { processedMessageIds: [], enrichment: {} };
      }
      throw error;
    }

    if (raw.trim() === '') {
      return { processedMessageIds: [], enrichment: {} };
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch {
      throw new Error(`State file ${this.path} is not valid JSON; fix or remove it to start over`);
    }
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      throw new Error(`State file ${this.path} must contain a JSON object`);
    }

    const data = parsed as Partial<StateData>;
    return {
      // Keep unknown fields so a rewrite does not drop data written by other code.
      ...data,
      processedMessageIds: Array.isArray(data.processedMessageIds) ? data.processedMessageIds : [],
      enrichment:
        data.enrichment && typeof data.enrichment === 'object' ? data.enrichment : {}
    };
  }

  /** Writes the state as pretty-printed JSON, creating the parent directory when needed. */
  private async write(state: StateData): Promise<void> {
    const path = expandHome(this.path);
    await mkdir(dirname(path), { recursive: true });
    await writeFile(path, `${JSON.stringify(state, null, 2)}\n`);
  }

  /** Tells whether `messageId` has been recorded as processed. */
  public async isProcessed(messageId: string): Promise<boolean> {
    return (await this.read()).processedMessageIds.includes(messageId);
  }

  /** Records `messageId` as processed; the file is written only when the id is new. */
  public async markProcessed(messageId: string): Promise<void> {
    const state = await this.read();
    if (!state.processedMessageIds.includes(messageId)) {
      state.processedMessageIds.push(messageId);
      await this.write(state);
    }
  }
}
}