✨feature: First push to git
This commit is contained in:
@@ -0,0 +1,52 @@
|
||||
import { ExtractedLink } from '../parsing/types.js';
|
||||
|
||||
/**
 * Fallback classifier consulted by `Categorizer` when neither the link's
 * section text nor the keyword rules yield a category.
 */
export interface CategoryProvider {
  /**
   * @param link - The link to classify.
   * @param categories - Category names the provider is offered to choose from.
   * @returns The chosen category, or `undefined` when the provider has no answer.
   */
  categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined>;
}
|
||||
|
||||
/** Categories that are always offered; caller-supplied categories are appended to these. */
const builtIn = [
  'Python',
  'JavaScript',
  'DevOps',
  'Security',
  'AI/ML',
  'Career',
  'Rust',
  'Uncategorized'
];

/**
 * Keyword rules, evaluated in order; the first pattern that matches wins.
 *
 * The short tokens (`ai`, `ml`, `llm`, `cve`, `rust`, `cargo`) are anchored on
 * word boundaries. The haystack includes the link URL, so unanchored they
 * matched inside unrelated text such as "email", ".html" or "trust".
 */
const rules: Array<[RegExp, string]> = [
  [/python|django|flask/i, 'Python'],
  [/javascript|typescript|node|react/i, 'JavaScript'],
  [/kubernetes|k8s|docker|devops|terraform/i, 'DevOps'],
  [/security|vulnerability|\bcves?\b/i, 'Security'],
  [/\b(?:ai|openai|llms?|ml)\b|machine learning/i, 'AI/ML'],
  [/career|interview|hiring/i, 'Career'],
  [/\b(?:rust|cargo)\b/i, 'Rust']
];

/**
 * Assigns a category to an extracted link.
 *
 * Resolution order:
 * 1. the link's own (trimmed) section text, when non-empty;
 * 2. the first matching keyword rule over title, description and URL;
 * 3. the optional provider's answer;
 * 4. `failureCategory`.
 */
export class Categorizer {
  /** Built-in categories followed by custom ones, de-duplicated, in insertion order. */
  private readonly categories: string[];

  /**
   * @param categories - Extra category names offered to the provider.
   * @param provider - Optional fallback classifier (for example an LLM).
   * @param failureCategory - Category used when nothing else produces one.
   */
  public constructor(
    categories: string[] = [],
    private readonly provider?: CategoryProvider,
    private readonly failureCategory = 'Uncategorized'
  ) {
    this.categories = [...new Set([...builtIn, ...categories])];
  }

  /**
   * @param link - The link to classify.
   * @returns The resolved category; never an empty or whitespace-only string.
   * @throws Whatever the provider throws; provider errors are not swallowed here.
   */
  public async categorize(link: ExtractedLink): Promise<string> {
    const section = link.section?.trim();
    if (section) {
      return section;
    }

    const haystack = `${link.title} ${link.description ?? ''} ${link.url}`;
    const matched = rules.find(([pattern]) => pattern.test(haystack));
    if (matched) {
      return matched[1];
    }

    // A blank provider answer is treated like no answer at all; `??` alone
    // would have let "" or "  " through as the category.
    const suggestion = (await this.provider?.categorize(link, this.categories))?.trim();
    return suggestion || this.failureCategory;
  }
}
|
||||
@@ -0,0 +1,21 @@
|
||||
/** Date-range flags of the `run` command, as raw command-line strings. */
export interface DateFlags {
  /** Lower bound; checked against the `YYYY-MM-DD` shape by `validateDateFilters`. */
  from?: string;
  /** Upper bound; checked against the `YYYY-MM-DD` shape by `validateDateFilters`. */
  to?: string;
  /** Relative range such as `30d`; cannot be combined with `from` or `to`. */
  last?: string;
}
|
||||
|
||||
/**
 * Validates the date-range flags of the `run` command.
 *
 * Rules enforced:
 * - `--last` cannot be combined with `--from` or `--to`;
 * - `--from` / `--to` must be written as `YYYY-MM-DD` and name a real calendar
 *   date (for example `2024-02-30` and `2024-13-01` are rejected);
 * - `--from` must not be later than `--to`;
 * - `--last` must be a whole number of days written like `30d`.
 *
 * @param flags - The raw flag values; unset flags are skipped.
 * @throws Error describing the first rule that is violated.
 */
export function validateDateFilters(flags: DateFlags): void {
  if (flags.last && (flags.from || flags.to)) {
    throw new Error('--last cannot be combined with --from or --to');
  }

  for (const [name, value] of Object.entries({ from: flags.from, to: flags.to })) {
    if (!value) {
      continue;
    }
    if (!/^\d{4}-\d{2}-\d{2}$/.test(value)) {
      throw new Error(`--${name} must use YYYY-MM-DD`);
    }
    // The shape check alone lets impossible dates through. Round-tripping via
    // Date catches both unparseable values and values the engine rolls over.
    const parsed = new Date(`${value}T00:00:00Z`);
    if (Number.isNaN(parsed.getTime()) || parsed.toISOString().slice(0, 10) !== value) {
      throw new Error(`--${name} is not a valid calendar date: ${value}`);
    }
  }

  // ISO dates of identical shape order correctly as plain strings.
  if (flags.from && flags.to && flags.from > flags.to) {
    throw new Error('--from must not be later than --to');
  }

  if (flags.last && !/^\d+d$/.test(flags.last)) {
    throw new Error('--last must look like 30d');
  }
}
|
||||
@@ -0,0 +1,86 @@
|
||||
import { Command, Option } from 'commander';
|
||||
import { writeFile } from 'node:fs/promises';
|
||||
import { loadConfig } from '../config/config.js';
|
||||
import { ExcelWriter } from '../output/excel.js';
|
||||
import { runCatalog } from '../run/runCatalog.js';
|
||||
import { validateDateFilters } from './flags.js';
|
||||
|
||||
/**
 * Starter configuration written by `nlc init`.
 *
 * The nesting matters: `folder` belongs to `gmail`, and `name` / `excel`
 * belong to `output`. Flattened to top-level keys, the document would leave
 * `gmail` and `output` without the fields read from them elsewhere
 * (`config.output.excel.enabled`, `config.output.excel.path`).
 */
const sampleConfig = `gmail:
  folder: Newsletters
output:
  name: Newsletter Link Catalog
  excel:
    enabled: true
    path: ./output/newsletter-catalog.xlsx
`;
|
||||
|
||||
/**
 * Parses the optional `--dry-run [count]` argument.
 *
 * `Number(value)` on its own turns input such as `abc` into `NaN` and accepts
 * `0`. Both are falsy, so downstream `!options.dryRun` checks would treat the
 * run as a real one and write state and output. Anything that is not a
 * positive integer is therefore rejected up front.
 *
 * @throws Error when the value is not a positive integer.
 */
function parseDryRunCount(value: string): number {
  const count = Number(value);
  if (!Number.isInteger(count) || count <= 0) {
    throw new Error(`--dry-run count must be a positive integer, got "${value}"`);
  }
  return count;
}

/**
 * Builds the `nlc` command-line program with its two subcommands:
 *
 * - `init` writes the starter config and refuses to overwrite an existing file;
 * - `run` validates flags, loads the config, and prints the `runCatalog`
 *   summary as JSON on stdout.
 *
 * @returns The configured program; the caller decides when to parse argv.
 */
export function createProgram(): Command {
  const program = new Command();
  program.name('nlc').description('Newsletter Link Catalog').version('0.1.0');

  program
    .command('init')
    .description('Create a starter config and document OAuth credential paths')
    .option('--config <path>', 'Path to write config', './config.yaml')
    .action(async (options) => {
      try {
        // 'wx' makes the write fail instead of clobbering an existing config.
        await writeFile(options.config, sampleConfig, { flag: 'wx' });
      } catch (error: unknown) {
        if (error instanceof Error && 'code' in error && error.code === 'EEXIST') {
          throw new Error(`${options.config} already exists`);
        }
        throw error;
      }
      console.log(`Wrote ${options.config}. Add OAuth JSON files under ~/.nlc before live runs.`);
    });

  program
    .command('run')
    .description('Process configured Gmail newsletter folder')
    .option('--full', 'Reprocess matching messages')
    .addOption(
      new Option('--dry-run [count]', 'Process without writing state or output')
        .argParser(parseDryRunCount)
        .preset(5)
    )
    .option('--from <date>', 'Process from YYYY-MM-DD')
    .option('--to <date>', 'Process to YYYY-MM-DD')
    .option('--last <range>', 'Process last range such as 30d')
    .option('--skip-enrich', 'Skip enrichment')
    .option('--enrich-only', 'Only run enrichment on stored links')
    .option('--config <path>', 'Config path', './config.yaml')
    .option('--verbose', 'Verbose logging')
    .action(async (options) => {
      // NOTE(review): --from/--to/--last are validated here but are not
      // forwarded to runCatalog below, so they do not filter anything yet.
      validateDateFilters(options);
      const config = await loadConfig(options.config);
      const writers = config.output.excel.enabled
        ? [new ExcelWriter(config.output.excel.path)]
        : [];
      // Only the built-in fixture supplies messages on this path; without
      // NLC_FIXTURE=1 the run receives an empty list.
      const messages = process.env.NLC_FIXTURE === '1' ? fixtureMessages() : [];
      const summary = await runCatalog({
        config,
        messages,
        writers,
        dryRun: options.dryRun,
        full: options.full,
        skipEnrich: options.skipEnrich,
        enrichOnly: options.enrichOnly,
        verbose: options.verbose
      });
      console.log(JSON.stringify(summary, null, 2));
    });

  return program;
}
|
||||
|
||||
/**
 * Returns the single canned newsletter message used when `NLC_FIXTURE=1`.
 * The message is dated "now", so only `date` differs between calls.
 */
function fixtureMessages() {
  const html =
    '<h2>JavaScript</h2><p><a href="https://example.com/post?utm_source=fixture">Fixture article</a></p>';
  const fixture = {
    id: 'fixture-1',
    messageId: '<fixture-1>',
    from: 'Fixture Weekly <fixture@example.com>',
    date: new Date().toISOString(),
    html
  };
  return [fixture];
}
|
||||
@@ -0,0 +1,129 @@
|
||||
import { readFile } from 'node:fs/promises';
|
||||
import { homedir } from 'node:os';
|
||||
import { resolve } from 'node:path';
|
||||
import YAML from 'yaml';
|
||||
import { z } from 'zod';
|
||||
|
||||
/** Google Sheets destination settings; only `enabled` has a default. */
const sheetsApiSchema = z.object({
  enabled: z.boolean().default(false),
  credentials: z.string().optional(),
  token: z.string().optional(),
  spreadsheetId: z.string().optional()
});

/** Excel workbook destination settings. */
const excelSchema = z.object({
  enabled: z.boolean().default(false),
  path: z.string().default('./output/newsletter-catalog.xlsx')
});

/**
 * The `output` section of the config: a required non-empty catalog name plus
 * two optional destinations.
 */
const outputSchema = z.object({
  name: z.string().min(1),
  sheetsApi: sheetsApiSchema.optional(),
  excel: excelSchema.optional()
});
|
||||
|
||||
/** Gmail source: the folder is required, OAuth file locations have defaults under `~/.nlc`. */
const gmailSchema = z.object({
  folder: z.string().min(1),
  credentials: z.string().default('~/.nlc/gmail-credentials.json'),
  token: z.string().default('~/.nlc/gmail-token.json')
});

/**
 * Link clean-up and filtering options.
 *
 * The pattern defaults carry a `(?i)` prefix, which is not JavaScript regex
 * syntax; a consumer has to strip or translate it before building a `RegExp`.
 */
const linksSchema = z.object({
  unwrapRedirects: z.boolean().default(true),
  stripUtm: z.boolean().default(true),
  trackingParams: z
    .array(z.string())
    .default(['utm_*', 'fbclid', 'gclid', 'mc_cid', 'mc_eid']),
  redirectLimit: z.number().int().positive().default(5),
  readMorePattern: z.string().default('(?i)^(read more|continue reading|learn more)$'),
  sharePatterns: z.array(z.string()).default(['(?i)share', '(?i)forward to a friend']),
  sponsorMarkers: z
    .array(z.string())
    .default(['(?i)sponsor', '(?i)sponsored', '(?i)advertisement', '(?i)partner']),
  filterUnsubscribe: z.boolean().default(true),
  filterSocialFooter: z.boolean().default(true),
  filterShareLinks: z.boolean().default(true),
  mergeReadMore: z.boolean().default(true)
});

/** LLM fallback used for categorisation. */
const llmSchema = z.object({
  provider: z
    .enum(['anthropic', 'openai', 'local', 'openai-compatible'])
    .default('anthropic'),
  model: z.string().default('claude-sonnet-4-6'),
  apiKeyEnv: z.string().default('ANTHROPIC_API_KEY'),
  baseUrl: z.string().nullable().optional(),
  failureCategory: z.string().default('Uncategorized')
});

/** Custom category names plus the LLM settings. */
const categoriesSchema = z.object({
  custom: z.array(z.string()).default([]),
  llm: llmSchema.default({})
});

/** Page-enrichment tuning knobs. */
const enrichmentSchema = z.object({
  enabled: z.boolean().default(true),
  concurrency: z.number().int().positive().default(3),
  delayMs: z.number().int().nonnegative().default(1500),
  retries: z.number().int().nonnegative().default(2),
  timeoutMs: z.number().int().positive().default(10000)
});

/** Request-rate limits. */
const rateLimitSchema = z.object({
  gmailQps: z.number().positive().default(5),
  linkConcurrency: z.number().int().positive().default(3)
});

/**
 * Full application config. Only `gmail.folder` and `output.name` are required;
 * every other section has a default. The final transform guarantees that
 * `output.sheetsApi` and `output.excel` are always objects, substituting a
 * disabled destination when the section was omitted.
 */
const configSchema = z
  .object({
    gmail: gmailSchema,
    output: outputSchema,
    newsletters: z.record(z.string(), z.any()).default({}),
    links: linksSchema.default({}),
    categories: categoriesSchema.default({}),
    enrichment: enrichmentSchema.default({}),
    rateLimit: rateLimitSchema.default({}),
    stateFile: z.string().default('~/.nlc/state.json'),
    plugins: z.record(z.string(), z.any()).default({})
  })
  .transform((config) => {
    const { sheetsApi, excel } = config.output;
    return {
      ...config,
      output: {
        ...config.output,
        sheetsApi: sheetsApi ?? { enabled: false },
        excel: excel ?? { enabled: false, path: './output/newsletter-catalog.xlsx' }
      }
    };
  });
|
||||
|
||||
/** Validated configuration: the type `configSchema` produces after defaults and its transform. */
export type AppConfig = z.infer<typeof configSchema>;
|
||||
/** Raw, not-yet-validated configuration object, as accepted by `normalizeConfig`. */
export type PartialConfig = Record<string, unknown>;
|
||||
|
||||
/**
 * Recursively rewrites object keys from `snake_case` to `camelCase`.
 *
 * Only an underscore followed by a lowercase ASCII letter is rewritten, so
 * keys such as `a_1` or `A_B` pass through untouched. Arrays are mapped
 * element by element; every other value is returned as-is.
 *
 * @param value - Any parsed YAML/JSON value.
 * @returns A structurally identical value with converted keys.
 */
function camelize(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map((item) => camelize(item));
  }
  if (typeof value !== 'object' || value === null) {
    return value;
  }

  const pairs: Array<[string, unknown]> = [];
  for (const [key, entry] of Object.entries(value as Record<string, unknown>)) {
    const camelKey = key.replace(/_([a-z])/g, (_, letter: string) => letter.toUpperCase());
    pairs.push([camelKey, camelize(entry)]);
  }
  return Object.fromEntries(pairs);
}
|
||||
|
||||
/**
 * Expands a leading `~` to the current user's home directory.
 *
 * - `~` alone resolves to the home directory itself;
 * - `~/rest` resolves to `<home>/rest`. Extra slashes after `~/` are dropped,
 *   so `~//etc/x` stays inside the home directory instead of resolving to the
 *   absolute path `/etc/x`;
 * - anything else, including `~user/...`, is returned unchanged.
 *
 * @param path - A possibly tilde-prefixed path.
 * @returns The expanded absolute path, or `path` itself when no expansion applies.
 */
export function expandHome(path: string): string {
  if (path === '~') {
    return resolve(homedir());
  }
  if (!path.startsWith('~/')) {
    return path;
  }
  return resolve(homedir(), path.slice(2).replace(/^\/+/, ''));
}
|
||||
|
||||
/**
 * Parses YAML text into a validated config.
 *
 * @param source - YAML document text; a document that parses to `null` or
 *   `undefined` is treated as an empty object.
 * @returns The validated configuration with defaults applied.
 * @throws When the YAML cannot be parsed or the result fails schema validation.
 */
export function loadConfigFromString(source: string): AppConfig {
  const document: unknown = YAML.parse(source);
  const camelCased = camelize(document ?? {});
  return configSchema.parse(camelCased);
}
|
||||
|
||||
/**
 * Reads a YAML config file and validates it.
 *
 * @param path - Config file location; a leading `~/` is expanded first.
 * @returns The validated configuration.
 * @throws When the file cannot be read, or parsing/validation fails.
 */
export async function loadConfig(path: string): Promise<AppConfig> {
  const location = expandHome(path);
  const source = await readFile(location, 'utf8');
  return loadConfigFromString(source);
}
|
||||
|
||||
/**
 * Validates an in-memory config object, accepting `snake_case` keys.
 *
 * @param config - Raw configuration object.
 * @returns The validated configuration with defaults applied.
 * @throws When the object fails schema validation.
 */
export function normalizeConfig(config: PartialConfig): AppConfig {
  const camelCased = camelize(config);
  return configSchema.parse(camelCased);
}
|
||||
@@ -0,0 +1,34 @@
|
||||
import * as cheerio from 'cheerio';
|
||||
|
||||
/**
 * Page fetcher injected into `enrichLink`.
 *
 * Resolves with the HTTP status, the URL reported as final by the fetcher,
 * and the response body as HTML text.
 */
export type FetchPage = (
  url: string
) => Promise<{ status: number; finalUrl: string; html: string }>;
|
||||
|
||||
/**
 * Outcome of enriching one link, discriminated by `status`:
 *
 * - `ok` - page fetched; `titleMeta` combines the page title and meta description;
 * - `dead` - the fetch answered with status 400 or above; `error` holds the status code;
 * - `paywall` - the final URL looked like a login/subscribe page;
 * - `unreachable` - the fetch or parsing threw; `error` holds the message.
 */
export type EnrichmentResult =
  | { status: 'ok'; titleMeta: string }
  | { status: 'dead'; error: string }
  | { status: 'paywall'; titleMeta: '[paywall]' }
  | { status: 'unreachable'; titleMeta: string; error: string };
|
||||
|
||||
/**
 * Fetches a link and summarises what was found there.
 *
 * @param url - The URL to fetch.
 * @param fetchPage - Fetcher used to retrieve the page.
 * @returns `dead` for HTTP status >= 400; `paywall` when the final URL's path
 *   or query mentions login/signin/subscribe/paywall; `ok` with
 *   "title - meta description" otherwise; `unreachable` when anything throws
 *   (including an unparseable final URL).
 */
export async function enrichLink(url: string, fetchPage: FetchPage): Promise<EnrichmentResult> {
  try {
    const page = await fetchPage(url);
    if (page.status >= 400) {
      return { status: 'dead', error: String(page.status) };
    }

    const landed = new URL(page.finalUrl);
    const looksGated = /login|signin|subscribe|paywall/i.test(landed.pathname + landed.search);
    if (looksGated) {
      return { status: 'paywall', titleMeta: '[paywall]' };
    }

    const $ = cheerio.load(page.html);
    const title = $('title').first().text().trim();
    const meta = $('meta[name="description"]').attr('content')?.trim() ?? '';
    // Empty parts are dropped so a page without a description yields just the title.
    const parts = [title, meta].filter(Boolean);
    return { status: 'ok', titleMeta: parts.join(' - ') };
  } catch (error) {
    const message = error instanceof Error ? error.message : 'network_error';
    return { status: 'unreachable', titleMeta: `[unreachable: ${message}]`, error: message };
  }
}
|
||||
@@ -0,0 +1,59 @@
|
||||
import { createServer } from 'node:http';
|
||||
import { readFile, writeFile, mkdir } from 'node:fs/promises';
|
||||
import { dirname } from 'node:path';
|
||||
import open from 'open';
|
||||
import { google, gmail_v1 } from 'googleapis';
|
||||
import { expandHome } from '../config/config.js';
|
||||
import { NewsletterMessage } from '../parsing/types.js';
|
||||
|
||||
/** OAuth scopes requested by `authorizeGmail`: read-only access to Gmail. */
const gmailScopes = ['https://www.googleapis.com/auth/gmail.readonly'];
|
||||
|
||||
/**
 * Builds an authorised Google OAuth2 client for Gmail.
 *
 * A cached token at `tokenPath` is used when it can be read and parsed.
 * Otherwise the browser consent flow runs and the new token is saved to
 * `tokenPath` for later runs.
 *
 * @param credentialsPath - OAuth client JSON (with an `installed` or `web`
 *   section); `~/` is expanded.
 * @param tokenPath - Where the token is cached; `~/` is expanded.
 * @returns The OAuth2 client with credentials set.
 * @throws When the credentials file is missing, is not JSON, or has no usable
 *   client section; also when the consent flow or token exchange fails.
 */
export async function authorizeGmail(credentialsPath: string, tokenPath: string) {
  const credentials = JSON.parse(await readFile(expandHome(credentialsPath), 'utf8'));
  const clientConfig = credentials?.installed ?? credentials?.web;
  if (!clientConfig?.client_id) {
    // Without this check the failure is an opaque "cannot read properties of undefined".
    throw new Error(
      `${credentialsPath} does not contain an "installed" or "web" OAuth client with a client_id`
    );
  }

  const oauth = new google.auth.OAuth2(
    clientConfig.client_id,
    clientConfig.client_secret,
    'http://127.0.0.1:53682/oauth2callback'
  );

  try {
    oauth.setCredentials(JSON.parse(await readFile(expandHome(tokenPath), 'utf8')));
    return oauth;
  } catch {
    // No usable cached token (missing or unparseable): fall through to the
    // interactive flow. This is deliberate best-effort, not an error.
  }

  const url = oauth.generateAuthUrl({ access_type: 'offline', scope: gmailScopes });
  const code = await waitForBrowserCode(url);
  const { tokens } = await oauth.getToken(code);
  oauth.setCredentials(tokens);

  const resolvedTokenPath = expandHome(tokenPath);
  await mkdir(dirname(resolvedTokenPath), { recursive: true });
  // The token file holds OAuth tokens, so create it readable by the owner only.
  // The mode applies when the file is created; an existing file keeps its permissions.
  await writeFile(resolvedTokenPath, `${JSON.stringify(tokens, null, 2)}\n`, { mode: 0o600 });
  return oauth;
}
|
||||
|
||||
/**
 * Opens the consent page in the user's browser and waits for the OAuth
 * redirect on `http://127.0.0.1:53682`.
 *
 * @param url - The authorisation URL to open.
 * @returns The `code` query parameter from the redirect.
 * @throws When the local server cannot start (for example the port is in
 *   use), the browser cannot be opened, or the redirect carries an `error`
 *   parameter instead of a code.
 */
async function waitForBrowserCode(url: string): Promise<string> {
  return new Promise((resolveCode, reject) => {
    const server = createServer((req, res) => {
      const requestUrl = new URL(req.url ?? '/', 'http://127.0.0.1:53682');
      const code = requestUrl.searchParams.get('code');
      const oauthError = requestUrl.searchParams.get('error');

      // Ask the browser not to keep the socket open, so close() can finish promptly.
      res.setHeader('connection', 'close');

      if (code) {
        res.end('Newsletter Link Catalog authorization complete. You can close this tab.');
        server.close();
        resolveCode(code);
        return;
      }

      if (oauthError) {
        // For example the user denied consent: fail instead of waiting forever.
        res.statusCode = 400;
        res.end('Newsletter Link Catalog authorization failed. You can close this tab.');
        server.close();
        reject(new Error(`Gmail authorization failed: ${oauthError}`));
        return;
      }

      // Unrelated requests (favicon and the like) get an answer rather than a hung socket.
      res.statusCode = 404;
      res.end();
    });

    // Without a listener, a listen failure would surface as an uncaught
    // 'error' event and the promise would never settle.
    server.on('error', reject);

    // Bind to loopback only; the redirect URI points at 127.0.0.1.
    server.listen(53682, '127.0.0.1', () => {
      open(url).catch((error: unknown) => {
        server.close();
        reject(error);
      });
    });
  });
}
|
||||
|
||||
/** Thin wrapper around the Gmail API client; currently a placeholder for live traversal. */
export class GmailClient {
  private readonly gmail: gmail_v1.Gmail;

  /** @param gmail - An authorised Gmail API client. */
  public constructor(gmail: gmail_v1.Gmail) {
    this.gmail = gmail;
  }

  /**
   * Lists the account's labels and returns no messages.
   *
   * @param _label - Label to read from; not used by the current implementation.
   * @returns Always an empty array at present.
   */
  public async fetchMessages(_label: string): Promise<NewsletterMessage[]> {
    // Live Gmail traversal is isolated here. The run path accepts injected messages for tests and smoke.
    await this.gmail.users.labels.list({ userId: 'me' });
    const messages: NewsletterMessage[] = [];
    return messages;
  }
}
|
||||
@@ -0,0 +1,9 @@
|
||||
import { createProgram } from './cli/program.js';
|
||||
|
||||
/** Prints a one-line `nlc: ...` diagnostic on stderr and marks the process as failed. */
function reportFailure(error: unknown): void {
  const message = error instanceof Error ? error.message : String(error);
  console.error(`nlc: ${message}`);
  process.exitCode = 1;
}

// Entry point: parse argv, run the selected command, report any rejection.
createProgram().parseAsync(process.argv).catch(reportFailure);
|
||||
@@ -0,0 +1,28 @@
|
||||
import { ExtractedLink } from '../parsing/types.js';
|
||||
|
||||
/** Social-network domains whose footer links are treated as noise. */
const socialHosts = ['twitter.com', 'x.com', 'facebook.com', 'linkedin.com', 'instagram.com'];

/**
 * True when the link text is a "view this email in the browser" style mirror link.
 *
 * @param link - Only the (untrimmed) title is inspected.
 */
export function isMirrorLink(link: Pick<ExtractedLink, 'title'>): boolean {
  return /^(view in browser|view online|read online)$/i.test(link.title.trim());
}

/**
 * Returns the hostname of an http(s) URL without a leading `www.`, or `''`
 * when the value is not an http(s) URL or cannot be parsed.
 */
function hostnameOf(url: string): string {
  if (!/^https?:/i.test(url)) {
    return '';
  }
  try {
    return new URL(url).hostname.replace(/^www\./, '');
  } catch {
    // A malformed href must not abort processing of the whole message.
    return '';
  }
}

/** True when `host` is one of the social domains or a subdomain of one. */
function isSocialHost(host: string): boolean {
  // A plain endsWith(site) also matched unrelated domains such as
  // netflix.com or dropbox.com (both end in "x.com").
  return socialHosts.some((site) => host === site || host.endsWith(`.${site}`));
}

/**
 * True for links that carry no catalogue value: unsubscribe links, share or
 * forward prompts, mirror links, and social-network links whose context
 * mentions "footer".
 *
 * @param link - Any subset of an extracted link; missing fields count as empty.
 */
export function isNoiseLink(link: Partial<ExtractedLink>): boolean {
  const text = `${link.title ?? ''} ${link.context ?? ''}`.toLowerCase();
  const url = link.url ?? '';
  const inFooter = (link.context ?? '').toLowerCase().includes('footer');

  return (
    /unsubscribe/.test(text) ||
    // Case-insensitive: URL paths are not lower-cased the way `text` is.
    /unsubscribe/i.test(url) ||
    /share this newsletter|forward to a friend/.test(text) ||
    isMirrorLink({ title: link.title ?? '' }) ||
    (inFooter && isSocialHost(hostnameOf(url)))
  );
}
|
||||
|
||||
/**
 * True when the link's section, context or title mentions sponsorship,
 * advertising or a partner (case-insensitive substring match).
 *
 * @param link - Any subset of an extracted link; missing fields count as empty.
 */
export function isSponsorLink(link: Partial<ExtractedLink>): boolean {
  const searchable = [link.section ?? '', link.context ?? '', link.title ?? ''].join(' ');
  const sponsorPattern = /sponsor|sponsored|advertisement|partner/i;
  return sponsorPattern.test(searchable);
}
|
||||
@@ -0,0 +1,57 @@
|
||||
import { ExtractedLink } from '../parsing/types.js';
|
||||
|
||||
/** Options for `cleanupUrl`. */
export interface CleanupOptions {
  /**
   * Query-parameter names to remove. A trailing `*` makes the entry a prefix
   * match (`utm_*`); anything else must match the name exactly.
   */
  trackingParams: string[];
  /** When truthy, redirect-wrapper URLs are replaced by their destination first. */
  unwrapRedirects?: boolean;
}
|
||||
|
||||
/**
 * Tests a query-parameter name against one tracking pattern.
 * A pattern ending in `*` is a prefix match; otherwise the name must be equal.
 */
function matchesParam(name: string, pattern: string): boolean {
  return pattern.endsWith('*') ? name.startsWith(pattern.slice(0, -1)) : name === pattern;
}

/**
 * Returns the destination of a redirect-wrapper URL, or the URL itself.
 *
 * The first of the query parameters `url`, `u`, `target`, `redirect`,
 * `redirect_url` that holds a parseable http(s) URL wins. A value that merely
 * starts with "http" but is not a valid URL is skipped instead of throwing,
 * so one malformed tracking link cannot abort the caller.
 */
function unwrapProviderRedirect(url: URL): URL {
  for (const key of ['url', 'u', 'target', 'redirect', 'redirect_url']) {
    const destination = url.searchParams.get(key);
    if (!destination || !/^https?:/i.test(destination)) {
      continue;
    }
    try {
      return new URL(destination);
    } catch {
      // Not a usable URL; try the next candidate parameter.
    }
  }
  return url;
}

/**
 * Normalises a link URL: optionally unwraps redirect wrappers, removes
 * tracking parameters, drops the fragment and any dangling `?`.
 *
 * @param rawUrl - An absolute URL.
 * @param options - Tracking patterns and the unwrap switch.
 * @returns The normalised URL string.
 * @throws TypeError when `rawUrl` itself is not a valid absolute URL.
 */
export function cleanupUrl(rawUrl: string, options: CleanupOptions): string {
  let url = new URL(rawUrl);
  if (options.unwrapRedirects) {
    url = unwrapProviderRedirect(url);
  }

  // Snapshot the keys first: deleting while iterating the live view skips entries.
  for (const key of [...url.searchParams.keys()]) {
    if (options.trackingParams.some((pattern) => matchesParam(key, pattern))) {
      url.searchParams.delete(key);
    }
  }

  url.hash = '';
  const result = url.toString();
  return result.endsWith('?') ? result.slice(0, -1) : result;
}
|
||||
|
||||
/**
 * Folds "read more" style links into the entry that precedes them.
 *
 * A link is folded when its trimmed title matches `readMorePattern` and its
 * `normalizedUrl` equals the (non-empty) `normalizedUrl` of the previously
 * kept link. The kept entry then takes over the folded link's `url` and
 * `normalizedUrl`. Input objects are never mutated; kept entries are copies.
 *
 * @param links - Links in document order.
 * @param readMorePattern - Pattern identifying "read more" link text.
 * @returns A new array without the folded links.
 */
export function mergeReadMoreLinks(
  links: ExtractedLink[],
  readMorePattern: RegExp
): ExtractedLink[] {
  return links.reduce<ExtractedLink[]>((kept, link) => {
    const last = kept[kept.length - 1];
    if (
      last &&
      last.normalizedUrl &&
      last.normalizedUrl === link.normalizedUrl &&
      readMorePattern.test(link.title.trim())
    ) {
      last.url = link.url;
      last.normalizedUrl = link.normalizedUrl;
    } else {
      kept.push({ ...link });
    }
    return kept;
  }, []);
}
|
||||
@@ -0,0 +1,79 @@
|
||||
import { ExtractedLink } from '../parsing/types.js';
|
||||
|
||||
/** A language-model backend that can pick a category for a link. */
export interface LlmProvider {
  /**
   * @param link - The link to classify.
   * @param categories - Category names offered to the model.
   * @returns The model's trimmed answer, or `undefined` when the response has no text.
   */
  categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined>;
}
|
||||
|
||||
/** Connection settings shared by the provider classes in this module. */
interface ProviderOptions {
  /** API key; when absent the OpenAI-style request is sent without an authorization header. */
  apiKey?: string;
  /** Overrides the provider's default API root when set. */
  baseUrl?: string | null;
  /** Model identifier sent with every request. */
  model: string;
}
|
||||
|
||||
/**
 * POSTs a JSON body and returns the parsed JSON response.
 *
 * @param url - Endpoint to call.
 * @param apiKey - Sent as a Bearer token when truthy; omitted otherwise.
 * @param body - Value serialised with `JSON.stringify`.
 * @returns The parsed response body.
 * @throws Error `LLM request failed: <status>` for a non-OK response.
 */
async function postJson(url: string, apiKey: string | undefined, body: unknown): Promise<any> {
  const headers: Record<string, string> = { 'content-type': 'application/json' };
  if (apiKey) {
    headers.authorization = `Bearer ${apiKey}`;
  }

  const response = await fetch(url, { method: 'POST', headers, body: JSON.stringify(body) });
  if (response.ok) {
    return response.json();
  }
  throw new Error(`LLM request failed: ${response.status}`);
}
|
||||
|
||||
/**
 * Builds the single-sentence classification prompt.
 *
 * @param link - Supplies the title and URL quoted in the prompt.
 * @param categories - Listed comma-separated as the allowed answers.
 */
function prompt(link: ExtractedLink, categories: string[]): string {
  const choices = categories.join(', ');
  const subject = `${link.title} ${link.url}`;
  return `Choose the best newsletter category from ${choices} for: ${subject}. Return only the category.`;
}
|
||||
|
||||
/**
 * Categorises links through an OpenAI-style `/chat/completions` endpoint.
 *
 * The API root defaults to `https://api.openai.com/v1`. A configured
 * `baseUrl` may end in one or more slashes (for example
 * `http://localhost:11434/v1/`); they are stripped so the request path does
 * not contain `//`. An empty or blank `baseUrl` falls back to the default.
 */
export class OpenAiCompatibleProvider implements LlmProvider {
  public constructor(private readonly options: ProviderOptions) {}

  /**
   * @param link - The link to classify.
   * @param categories - Category names offered to the model.
   * @returns The trimmed text of the first choice, or `undefined` when absent.
   * @throws When the request fails or answers with a non-OK status.
   */
  public async categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined> {
    // `||` on purpose: an empty string is as unusable as null/undefined here.
    const configuredRoot = this.options.baseUrl?.trim() || 'https://api.openai.com/v1';
    const apiRoot = configuredRoot.replace(/\/+$/, '');

    const data = await postJson(`${apiRoot}/chat/completions`, this.options.apiKey, {
      model: this.options.model,
      messages: [{ role: 'user', content: prompt(link, categories) }],
      // Temperature 0 requests the least random answer for the same input.
      temperature: 0
    });
    return data.choices?.[0]?.message?.content?.trim();
  }
}
|
||||
|
||||
/** Same behaviour as `OpenAiCompatibleProvider`; this subclass adds no members of its own. */
export class OpenAiProvider extends OpenAiCompatibleProvider {}
|
||||
|
||||
/** Same behaviour as `OpenAiCompatibleProvider`; this subclass adds no members of its own. */
export class LocalProvider extends OpenAiCompatibleProvider {}
|
||||
|
||||
/**
 * Categorises links through the Anthropic `/v1/messages` endpoint.
 *
 * The API root defaults to `https://api.anthropic.com`. Trailing slashes on a
 * configured `baseUrl` are stripped so the request path does not contain
 * `//`; an empty or blank `baseUrl` falls back to the default.
 */
export class AnthropicProvider implements LlmProvider {
  public constructor(private readonly options: ProviderOptions) {}

  /**
   * @param link - The link to classify.
   * @param categories - Category names offered to the model.
   * @returns The trimmed text of the first content block, or `undefined` when absent.
   * @throws Error `Anthropic request failed: <status>` for a non-OK response,
   *   or whatever `fetch` throws.
   */
  public async categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined> {
    // `||` on purpose: an empty string is as unusable as null/undefined here.
    const configuredRoot = this.options.baseUrl?.trim() || 'https://api.anthropic.com';
    const apiRoot = configuredRoot.replace(/\/+$/, '');

    const response = await fetch(`${apiRoot}/v1/messages`, {
      method: 'POST',
      headers: {
        'content-type': 'application/json',
        'x-api-key': this.options.apiKey ?? '',
        'anthropic-version': '2023-06-01'
      },
      body: JSON.stringify({
        model: this.options.model,
        // A category name is short; cap the response size accordingly.
        max_tokens: 64,
        messages: [{ role: 'user', content: prompt(link, categories) }]
      })
    });
    if (!response.ok) {
      throw new Error(`Anthropic request failed: ${response.status}`);
    }

    const data = await response.json();
    return data.content?.[0]?.text?.trim();
  }
}
|
||||
@@ -0,0 +1,32 @@
|
||||
import { mkdir } from 'node:fs/promises';
|
||||
import { dirname } from 'node:path';
|
||||
import XLSX from 'xlsx';
|
||||
import { CatalogPayload, OutputWriter, sanitizeSheetName } from './sheets.js';
|
||||
|
||||
/** Maximum worksheet-name length accepted by Excel workbooks. */
const EXCEL_SHEET_NAME_LIMIT = 31;

/**
 * Produces a worksheet name that fits the Excel length limit and does not
 * collide (case-insensitively) with a name already in `taken`.
 * The chosen name is recorded in `taken`.
 */
function uniqueSheetName(name: string, taken: Set<string>): string {
  const fit = (suffix: string): string =>
    `${name.slice(0, EXCEL_SHEET_NAME_LIMIT - suffix.length).trim()}${suffix}`;

  let candidate = fit('') || 'Newsletter';
  for (let attempt = 2; taken.has(candidate.toLowerCase()); attempt += 1) {
    candidate = fit(` (${attempt})`);
  }
  taken.add(candidate.toLowerCase());
  return candidate;
}

/**
 * Writes a catalog to an `.xlsx` workbook: one sheet per source newsletter,
 * followed by "Sponsored Links" and "Dead Links".
 */
export class ExcelWriter implements OutputWriter {
  /** @param path - Destination file; parent directories are created on write. */
  public constructor(private readonly path: string) {}

  /**
   * Builds the workbook in memory and writes it to `path`, replacing any
   * existing file.
   *
   * @param payload - Rows grouped by their `Source Newsletter` value, plus
   *   sponsor and dead-link rows.
   */
  public async write(payload: CatalogPayload): Promise<void> {
    const workbook = XLSX.utils.book_new();

    const grouped = new Map<string, Record<string, unknown>[]>();
    for (const row of payload.rows) {
      const key = sanitizeSheetName(String(row['Source Newsletter'] ?? 'Newsletter'));
      const bucket = grouped.get(key);
      if (bucket) {
        // Append in place; re-spreading the array per row is quadratic.
        bucket.push(row);
      } else {
        grouped.set(key, [row]);
      }
    }

    // Reserve the two fixed sheet names up front so a newsletter that happens
    // to be called "Dead Links" gets a suffixed name instead of clashing.
    const taken = new Set(['sponsored links', 'dead links']);
    for (const [name, rows] of grouped) {
      XLSX.utils.book_append_sheet(
        workbook,
        XLSX.utils.json_to_sheet(rows),
        uniqueSheetName(name, taken)
      );
    }

    XLSX.utils.book_append_sheet(
      workbook,
      XLSX.utils.json_to_sheet(payload.sponsors),
      'Sponsored Links'
    );
    XLSX.utils.book_append_sheet(
      workbook,
      XLSX.utils.json_to_sheet(payload.deadLinks),
      'Dead Links'
    );

    await mkdir(dirname(this.path), { recursive: true });
    XLSX.writeFile(workbook, this.path);
  }
}
|
||||
@@ -0,0 +1,15 @@
|
||||
import { google } from 'googleapis';
|
||||
import { CatalogPayload, OutputWriter } from './sheets.js';
|
||||
|
||||
/** Output writer targeting a Google Sheets spreadsheet; currently only checks that it is reachable. */
export class GoogleSheetsWriter implements OutputWriter {
  private readonly spreadsheetId: string;
  private readonly auth: Parameters<typeof google.sheets>[0]['auth'];

  /**
   * @param spreadsheetId - Id of the target spreadsheet.
   * @param auth - Auth value handed to the Sheets API client.
   */
  public constructor(
    spreadsheetId: string,
    auth: Parameters<typeof google.sheets>[0]['auth']
  ) {
    this.spreadsheetId = spreadsheetId;
    this.auth = auth;
  }

  /**
   * Fetches the spreadsheet metadata; no rows are written yet.
   *
   * @param _payload - Catalog data; not used by the current implementation.
   */
  public async write(_payload: CatalogPayload): Promise<void> {
    const api = google.sheets({ version: 'v4', auth: this.auth });
    await api.spreadsheets.get({ spreadsheetId: this.spreadsheetId });
    // Real row append calls are intentionally centralized here; tests use a fake writer.
  }
}
|
||||
@@ -0,0 +1,23 @@
|
||||
/** Characters replaced in sheet names: `:` `/` `\` `?` `*` `[` `]`. */
const invalidSheetCharacters = /[:/\\?*[\]]/g;

/**
 * Turns arbitrary text into a sheet name: invalid characters become spaces,
 * whitespace runs collapse to one space, the result is trimmed and cut to
 * 100 characters. An empty result becomes `Newsletter`.
 *
 * @param input - Raw name, typically a newsletter title.
 */
export function sanitizeSheetName(input: string): string {
  const withoutInvalid = input.replace(invalidSheetCharacters, ' ');
  const collapsed = withoutInvalid.replace(/\s+/g, ' ').trim();
  const name = collapsed || 'Newsletter';
  return name.slice(0, 100);
}
|
||||
|
||||
/**
 * Neutralises spreadsheet formula injection for a single cell value.
 *
 * A string starting with `=`, `+`, `-`, `@`, TAB or carriage return is
 * prefixed with an apostrophe so spreadsheet applications show it as text
 * instead of evaluating it. TAB and CR are included because they are also
 * commonly listed as formula-trigger prefixes.
 *
 * @param value - Any cell value; non-strings are returned untouched.
 * @returns The original value, or the apostrophe-prefixed string.
 */
export function escapeCell(value: unknown): unknown {
  if (typeof value !== 'string') {
    return value;
  }
  return /^[=+\-@\t\r]/.test(value) ? `'${value}` : value;
}
|
||||
|
||||
/** Everything an output writer receives for one run; each entry maps column name to value. */
export interface CatalogPayload {
  /** Regular catalog rows. */
  rows: Record<string, unknown>[];
  /** Rows for links classified as sponsored. */
  sponsors: Record<string, unknown>[];
  /** Rows for links found to be dead. */
  deadLinks: Record<string, unknown>[];
}
|
||||
|
||||
/** A destination that can persist a catalog payload. */
export interface OutputWriter {
  /**
   * @param payload - The rows to persist.
   * @returns A promise that settles once the write is done; its value is not specified.
   */
  write(payload: CatalogPayload): Promise<unknown>;
}
|
||||
@@ -0,0 +1,42 @@
|
||||
import * as cheerio from 'cheerio';
|
||||
import { ExtractedLink, ParserInput, ParserPlugin } from './types.js';
|
||||
|
||||
/**
 * Looks for section text preceding a link.
 *
 * First choice is the text of the first heading / `strong` / `b` element that
 * `prevAll` returns among the element's own preceding siblings. Failing that,
 * the same lookup runs on the parent's preceding siblings, this time
 * accepting headings, `p` and `tr`.
 *
 * @param $ - The loaded document.
 * @param element - The anchor element.
 * @returns The trimmed text found, or `undefined` when both lookups are empty.
 */
function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined {
  const ownHeading = $(element).prevAll('h1,h2,h3,h4,h5,h6,strong,b').first().text().trim();
  if (ownHeading) {
    return ownHeading;
  }

  const parentSiblings = $(element).parent().prevAll('h1,h2,h3,h4,h5,h6,p,tr');
  const parentHeading = parentSiblings.first().text().trim();
  return parentHeading === '' ? undefined : parentHeading;
}
|
||||
|
||||
/**
 * Catch-all parser: matches every input and extracts each `<a href>` with a
 * non-empty href.
 *
 * For every link the title is the whitespace-collapsed anchor text (falling
 * back to `aria-label`), the context is the text of the closest
 * `p`/`li`/`td`/`div`, and the description is that context unless it merely
 * repeats the title.
 */
export const genericParser: ParserPlugin = {
  name: 'generic',
  matches: () => true,
  parse(input: ParserInput): ExtractedLink[] {
    const $ = cheerio.load(input.html);
    const collapse = (text: string): string => text.replace(/\s+/g, ' ').trim();

    const links: ExtractedLink[] = [];
    for (const element of $('a[href]').toArray()) {
      const anchor = $(element);
      const url = anchor.attr('href') ?? '';
      if (!url) {
        continue;
      }

      const title = collapse(anchor.text()) || anchor.attr('aria-label') || '';
      const context = collapse(anchor.closest('p,li,td,div').text());
      links.push({
        url,
        title,
        description: context && context !== title ? context : '',
        sourceText: title,
        section: nearestSection($, element),
        context
      });
    }
    return links;
  }
};
|
||||
@@ -0,0 +1,17 @@
|
||||
import { genericParser } from './generic.js';
|
||||
import { ParserInput, ParserPlugin } from './types.js';
|
||||
|
||||
/**
 * Parser selected for Substack newsletters.
 *
 * A message matches when the `listId` header, the `from` header or the HTML
 * mentions `substack.com`, or the HTML contains a `data-testid="post-preview"`
 * attribute. Extraction itself is delegated to the generic parser.
 */
export const substackParser: ParserPlugin = {
  name: 'substack',
  matches(input: ParserInput) {
    const listId = input.headers?.listId ?? '';
    const sender = input.headers?.from ?? '';
    const haystack = `${listId} ${sender} ${input.html}`;
    const substackMarker = /substack\.com|data-testid="post-preview"/i;
    return substackMarker.test(haystack);
  },
  parse(input: ParserInput) {
    return genericParser.parse(input);
  }
};
|
||||
|
||||
/**
 * Picks the parser for a message: the first of Substack, then generic, whose
 * `matches` accepts the input. The generic parser is also the fallback when
 * neither reports a match.
 *
 * @param input - The message HTML and headers.
 */
export function selectParser(input: ParserInput): ParserPlugin {
  const candidates = [substackParser, genericParser];
  for (const candidate of candidates) {
    if (candidate.matches(input)) {
      return candidate;
    }
  }
  return genericParser;
}
|
||||
@@ -0,0 +1,32 @@
|
||||
/** One hyperlink pulled out of a newsletter, plus the text found around it. */
export interface ExtractedLink {
  /** The link target as extracted. */
  url: string;
  /** Cleaned-up form of `url`; absent until normalisation has run. */
  normalizedUrl?: string;
  /** Link text shown to the reader. */
  title: string;
  /** Surrounding text describing the link, when available. */
  description?: string;
  /** The raw link text the title was derived from. */
  sourceText?: string;
  /** Text of the section the link appears under, when one was found. */
  section?: string;
  /** Text of the element enclosing the link. */
  context?: string;
  /** Sponsor name. NOTE(review): no writer of this field is visible here - confirm usage. */
  sponsor?: string;
  /** Sponsored flag. NOTE(review): no writer of this field is visible here - confirm usage. */
  isSponsor?: boolean;
}
|
||||
|
||||
/** One newsletter e-mail as handed to the catalog run. */
export interface NewsletterMessage {
  /** Identifier of the message. */
  id: string;
  /** Message-ID value; used as the key for "already processed" tracking. */
  messageId: string;
  /** Sender, e.g. `Name <address>`; the display name becomes the newsletter name. */
  from: string;
  /** Date string parseable by `new Date(...)`. */
  date: string;
  /** Subject line, when known. */
  subject?: string;
  /** HTML body of the message. */
  html: string;
  /** Selected headers; parsers read `listId` and `from` from here. */
  headers?: Record<string, string | undefined>;
}
|
||||
|
||||
/** What a parser plugin gets to look at. */
export interface ParserInput {
  /** HTML body of the message. */
  html: string;
  /** Selected message headers, when available. */
  headers?: Record<string, string | undefined>;
}
|
||||
|
||||
/** A newsletter-format-specific link extractor. */
export interface ParserPlugin {
  /** Short identifier of the parser. */
  name: string;
  /** Reports whether this parser wants to handle the given message. */
  matches(input: ParserInput): boolean;
  /** Extracts the links contained in the message. */
  parse(input: ParserInput): ExtractedLink[];
}
|
||||
@@ -0,0 +1,117 @@
|
||||
import { normalizeConfig, PartialConfig } from '../config/config.js';
|
||||
import { Categorizer } from '../categorization/categorizer.js';
|
||||
import { isNoiseLink, isSponsorLink } from '../links/filtering.js';
|
||||
import { cleanupUrl, mergeReadMoreLinks } from '../links/url.js';
|
||||
import { OutputWriter } from '../output/sheets.js';
|
||||
import { selectParser } from '../parsing/plugins.js';
|
||||
import { NewsletterMessage } from '../parsing/types.js';
|
||||
import { StateStore } from '../state/state.js';
|
||||
|
||||
/** Inputs of one catalog run. */
export interface RunOptions {
  /** Raw configuration; validated and defaulted at the start of the run. */
  config: PartialConfig;
  /** Messages to process, in order. */
  messages: NewsletterMessage[];
  /** Destinations that receive the collected rows. */
  writers: OutputWriter[];
  /**
   * Dry-run switch. A number additionally limits how many messages are looked
   * at. No state or output is written during a dry run.
   */
  dryRun?: number | boolean;
  /** Process messages even when they are recorded as already processed. */
  full?: boolean;
  /** NOTE(review): accepted but not read by `runCatalog` as shown here. */
  skipEnrich?: boolean;
  /** NOTE(review): accepted but not read by `runCatalog` as shown here. */
  enrichOnly?: boolean;
  /** Verbose-logging switch. */
  verbose?: boolean;
}
|
||||
|
||||
/** Counters reported at the end of a catalog run. */
export interface RunSummary {
  /** Number of newsletters counted as processed. */
  newslettersProcessed: number;
  /** Number of regular (non-sponsor) catalog rows produced. */
  linksExtracted: number;
  /** Number of sponsor rows produced. */
  sponsors: number;
  /** Number of dead links found. */
  deadLinks: number;
  /** Number of failures encountered during the run. */
  errors: number;
}
|
||||
|
||||
/**
 * Derives a newsletter name from a `From` header value.
 *
 * The text before the first `<` is used when it is non-empty, otherwise the
 * whole value. One leading and one trailing double quote are removed, then
 * the result is trimmed.
 */
function newsletterName(from: string): string {
  const displayName = /^(.*?)\s*</.exec(from)?.[1];
  const chosen = displayName || from;
  return chosen.replace(/^"|"$/g, '').trim();
}
|
||||
|
||||
/**
 * Formats a message date as the first ten characters of its UTC ISO string
 * (`YYYY-MM-DD` for ordinary dates).
 *
 * @throws RangeError when `date` cannot be parsed.
 */
function issueDate(date: string): string {
  const iso = new Date(date).toISOString();
  return iso.substring(0, 10);
}
|
||||
|
||||
/**
 * Runs one catalog pass: parses each message, filters and normalises its
 * links, categorises them, and hands the collected rows to the writers.
 *
 * Behaviour worth knowing:
 * - Any numeric `dryRun` (including `0` or `NaN` from a mistyped CLI value)
 *   counts as a dry run; only a positive number limits the message count.
 * - Unless `full` or a dry run is requested, messages already recorded in the
 *   state file (or already seen earlier in this run) are skipped.
 * - A link whose URL cannot be parsed is skipped on its own rather than
 *   failing the whole message.
 * - Rows from a message are added only when that message was processed
 *   completely, so a failed message leaves no partial rows behind.
 * - Messages are marked processed only after every writer has succeeded, so a
 *   failed write does not hide them from the next run.
 * - Failures are counted in `errors` and, with `verbose`, logged to stderr.
 *
 * @param options - Config, messages, writers and run switches.
 * @returns Counters for the run; `newslettersProcessed` counts the messages
 *   that were processed successfully.
 * @throws When the configuration is invalid or a writer fails.
 */
export async function runCatalog(options: RunOptions): Promise<RunSummary> {
  const config = normalizeConfig(options.config);
  const state = new StateStore(config.stateFile);
  const categorizer = new Categorizer(config.categories.custom);

  // `!options.dryRun` would treat 0 and NaN as "not a dry run" and write output.
  const dryRun = options.dryRun === true || typeof options.dryRun === 'number';
  const limit =
    typeof options.dryRun === 'number' && options.dryRun > 0 ? options.dryRun : undefined;
  const messages = limit ? options.messages.slice(0, limit) : options.messages;
  const skipKnown = !options.full && !dryRun;

  const rows: Record<string, unknown>[] = [];
  const sponsors: Record<string, unknown>[] = [];
  const completedIds: string[] = [];
  const seenThisRun = new Set<string>();
  let errors = 0;

  // stderr keeps diagnostics away from a JSON summary printed on stdout.
  const logFailure = (what: string, error: unknown): void => {
    if (options.verbose) {
      const detail = error instanceof Error ? error.message : String(error);
      console.error(`nlc: ${what}: ${detail}`);
    }
  };

  for (const message of messages) {
    if (
      skipKnown &&
      (seenThisRun.has(message.messageId) || (await state.isProcessed(message.messageId)))
    ) {
      continue;
    }

    try {
      const parser = selectParser({ html: message.html, headers: message.headers });
      const parsed = parser.parse({ html: message.html, headers: message.headers });

      // Filter and normalise link by link: a single malformed href ("#",
      // a relative path, ...) is dropped instead of failing the message.
      const cleaned = parsed.flatMap((link) => {
        try {
          if (isNoiseLink(link)) {
            return [];
          }
          const normalizedUrl = cleanupUrl(link.url, {
            trackingParams: config.links.trackingParams,
            unwrapRedirects: config.links.unwrapRedirects
          });
          return [{ ...link, normalizedUrl }];
        } catch (error) {
          logFailure(`skipping link "${link.url}" in message ${message.messageId}`, error);
          return [];
        }
      });

      const merged = config.links.mergeReadMore
        ? mergeReadMoreLinks(
            cleaned,
            // The configured pattern may carry a "(?i)" prefix, which is not
            // JavaScript syntax; the flag is passed separately instead.
            new RegExp(config.links.readMorePattern.replace('(?i)', ''), 'i')
          )
        : cleaned;
      // One entry per normalised URL; later duplicates replace earlier ones.
      const unique = [...new Map(merged.map((link) => [link.normalizedUrl, link])).values()];

      // Collect locally first so a failure part-way through adds nothing.
      const messageRows: Record<string, unknown>[] = [];
      const messageSponsors: Record<string, unknown>[] = [];
      for (const link of unique) {
        if (isSponsorLink(link)) {
          messageSponsors.push({
            Newsletter: newsletterName(message.from),
            Sponsor: link.title,
            Link: link.normalizedUrl,
            Description: link.description ?? ''
          });
          continue;
        }

        messageRows.push({
          'Issue Date': issueDate(message.date),
          Category: await categorizer.categorize(link),
          'Link URL': link.normalizedUrl,
          Title: link.title,
          Description: link.description ?? '',
          'Page Title + Meta': '',
          'Source Newsletter': newsletterName(message.from),
          'Also In': ''
        });
      }

      rows.push(...messageRows);
      sponsors.push(...messageSponsors);
      completedIds.push(message.messageId);
      seenThisRun.add(message.messageId);
    } catch (error) {
      errors += 1;
      logFailure(`failed to process message ${message.messageId}`, error);
    }
  }

  if (!dryRun) {
    for (const writer of options.writers) {
      await writer.write({ rows, sponsors, deadLinks: [] });
    }
    // Record progress only once the output exists.
    for (const messageId of completedIds) {
      try {
        await state.markProcessed(messageId);
      } catch (error) {
        errors += 1;
        logFailure(`failed to record message ${messageId} as processed`, error);
      }
    }
  }

  return {
    newslettersProcessed: completedIds.length,
    linksExtracted: rows.length,
    sponsors: sponsors.length,
    deadLinks: 0,
    errors
  };
}
|
||||
@@ -0,0 +1,38 @@
|
||||
import { mkdir, readFile, writeFile } from 'node:fs/promises';
|
||||
import { dirname } from 'node:path';
|
||||
import { expandHome } from '../config/config.js';
|
||||
|
||||
/** Shape of the JSON state file. */
interface StateData {
  /** Message ids that have been marked as processed. */
  processedMessageIds: string[];
  /** String-to-string map reserved for enrichment data. NOTE(review): not read or written by the store's methods shown here. */
  enrichment: Record<string, string>;
}
|
||||
|
||||
/**
 * JSON-file-backed record of which messages have been processed.
 *
 * Every call reads the file afresh; `markProcessed` rewrites it when the id
 * is new. A missing or empty file is an empty state. A file that exists but
 * cannot be read or parsed is an error: treating it as empty would make the
 * next `markProcessed` overwrite the whole history.
 */
export class StateStore {
  /** @param path - Location of the state file; a leading `~/` is expanded. */
  public constructor(private readonly path: string) {}

  /**
   * Loads the state file.
   *
   * @throws When the file exists but cannot be read, or is not valid JSON.
   */
  private async read(): Promise<StateData> {
    const location = expandHome(this.path);

    let raw: string;
    try {
      raw = await readFile(location, 'utf8');
    } catch (error: unknown) {
      if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {
        // First run: nothing recorded yet.
        return { processedMessageIds: [], enrichment: {} };
      }
      throw error;
    }

    if (raw.trim() === '') {
      return { processedMessageIds: [], enrichment: {} };
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch (error: unknown) {
      const detail = error instanceof Error ? error.message : String(error);
      throw new Error(`State file ${location} is not valid JSON: ${detail}`);
    }

    // Tolerate missing sections so an older or hand-edited file still loads.
    const data = (parsed && typeof parsed === 'object' ? parsed : {}) as Partial<StateData>;
    return {
      ...data,
      processedMessageIds: Array.isArray(data.processedMessageIds)
        ? data.processedMessageIds
        : [],
      enrichment:
        data.enrichment && typeof data.enrichment === 'object' ? data.enrichment : {}
    };
  }

  /** Writes the state as pretty-printed JSON, creating parent directories as needed. */
  private async write(state: StateData): Promise<void> {
    const path = expandHome(this.path);
    await mkdir(dirname(path), { recursive: true });
    await writeFile(path, `${JSON.stringify(state, null, 2)}\n`);
  }

  /**
   * @param messageId - The id to look up.
   * @returns Whether the id has been marked as processed.
   */
  public async isProcessed(messageId: string): Promise<boolean> {
    return (await this.read()).processedMessageIds.includes(messageId);
  }

  /**
   * Records an id as processed; does nothing when it is already recorded.
   *
   * @param messageId - The id to record.
   */
  public async markProcessed(messageId: string): Promise<void> {
    const state = await this.read();
    if (!state.processedMessageIds.includes(messageId)) {
      state.processedMessageIds.push(messageId);
      await this.write(state);
    }
  }
}
|
||||
Reference in New Issue
Block a user