feature: First push to git

This commit is contained in:
Keith Solomon
2026-05-16 14:02:49 -05:00
commit 265f69d95a
46 changed files with 11551 additions and 0 deletions
+117
View File
@@ -0,0 +1,117 @@
import { normalizeConfig, PartialConfig } from '../config/config.js';
import { Categorizer } from '../categorization/categorizer.js';
import { isNoiseLink, isSponsorLink } from '../links/filtering.js';
import { cleanupUrl, mergeReadMoreLinks } from '../links/url.js';
import { OutputWriter } from '../output/sheets.js';
import { selectParser } from '../parsing/plugins.js';
import { NewsletterMessage } from '../parsing/types.js';
import { StateStore } from '../state/state.js';
/**
 * Inputs accepted by `runCatalog`.
 */
export interface RunOptions {
/** Partial configuration; `runCatalog` passes it through `normalizeConfig` before use. */
config: PartialConfig;
/** Messages to catalogue; `runCatalog` iterates them in array order. */
messages: NewsletterMessage[];
/** Output sinks; unless this is a dry run, `runCatalog` calls `write` on each one, in array order. */
writers: OutputWriter[];
/**
 * Dry-run switch. Any truthy value makes `runCatalog` skip the processed-state
 * check, skip marking messages as processed, and skip the writers. A non-zero
 * number additionally restricts the run to `messages.slice(0, N)`.
 * Note that `0` is falsy, so it behaves like a normal (non-dry) run.
 */
dryRun?: number | boolean;
/** When truthy, `runCatalog` does not consult the state store to skip messages. */
full?: boolean;
/** NOTE(review): not read by `runCatalog`; presumably consumed by an enrichment step elsewhere — confirm. */
skipEnrich?: boolean;
/** NOTE(review): not read by `runCatalog`; presumably consumed by an enrichment step elsewhere — confirm. */
enrichOnly?: boolean;
/** Verbosity flag; presumably enables additional diagnostic output — confirm which steps honor it. */
verbose?: boolean;
}
/**
 * Counters returned by `runCatalog` describing one run.
 */
export interface RunSummary {
/**
 * Size of the message batch `runCatalog` iterated (after any numeric dry-run
 * limit). As computed there, this also counts messages that were skipped or
 * whose processing threw.
 */
newslettersProcessed: number;
/** Number of catalogue rows collected, i.e. links not classified as sponsor links. */
linksExtracted: number;
/** Number of sponsor entries collected. */
sponsors: number;
/** Dead-link count; `runCatalog` currently always reports `0`. */
deadLinks: number;
/** Number of messages whose processing threw and was caught. */
errors: number;
}
/**
 * Derives a display name from a `From` header value.
 *
 * The text in front of the first `<` is used; when there is no `<`, or nothing
 * precedes it, the whole header value is used instead. A double quote at the
 * very start and at the very end is then removed and the result is trimmed.
 *
 * @param from Raw `From` header value, e.g. `"Name" <user@example.com>`.
 * @returns The extracted display name.
 */
function newsletterName(from: string): string {
  const angleMatch = /^(.*?)\s*</.exec(from);
  const displayPart = angleMatch?.[1];
  // An empty capture (header starts with `<`) falls back to the full header.
  const candidate = displayPart ? displayPart : from;
  const unquoted = candidate.replace(/^"|"$/g, '');
  return unquoted.trim();
}
/**
 * Converts a date string to its UTC calendar day.
 *
 * The input is parsed with the `Date` constructor and rendered with
 * `toISOString`, so the day is expressed in UTC and may differ from the day
 * written in the input when it carries a timezone offset.
 *
 * @param date Date string understood by the `Date` constructor.
 * @returns The first ten characters of the ISO timestamp (`YYYY-MM-DD`).
 * @throws RangeError when `date` cannot be parsed.
 */
function issueDate(date: string): string {
  const isoTimestamp = new Date(date).toISOString();
  return isoTimestamp.substring(0, 10);
}
/**
 * Runs one cataloguing pass over a batch of newsletter messages.
 *
 * For every message that is not skipped, the message is parsed into links,
 * noise links are dropped, URLs are normalized, "read more" links are
 * optionally merged, and links are de-duplicated by normalized URL. Sponsor
 * links go to the sponsor list; all others are categorized and become
 * catalogue rows. Unless this is a dry run, each successfully handled message
 * is marked as processed and the collected data is handed to every writer.
 *
 * A message's rows and sponsors are committed only when the whole message was
 * handled without an error. A message that throws is counted in `errors`, is
 * not marked as processed, and contributes nothing to the output, so a later
 * run can retry it without producing duplicate rows.
 *
 * @param options Run inputs. A truthy `dryRun` disables the state check, state
 *   updates, and writers; a non-zero numeric `dryRun` also limits the batch to
 *   the first N messages. `full` disables the already-processed skip.
 *   `verbose` makes per-message failures get logged to `console.error`.
 * @returns Counters for the run. `newslettersProcessed` is the size of the
 *   iterated batch (including skipped and failed messages); `deadLinks` is
 *   always `0`.
 * @throws Whatever `normalizeConfig`, the state store's `isProcessed`, or a
 *   writer's `write` throws; only errors raised while handling an individual
 *   message are caught.
 */
export async function runCatalog(options: RunOptions): Promise<RunSummary> {
  const config = normalizeConfig(options.config);
  const state = new StateStore(config.stateFile);
  const categorizer = new Categorizer(config.categories.custom);
  // A numeric dryRun doubles as a cap on the number of messages examined.
  // Note: 0 is falsy, so it neither caps the batch nor counts as a dry run.
  const limit = typeof options.dryRun === 'number' ? options.dryRun : undefined;
  const messages = limit ? options.messages.slice(0, limit) : options.messages;
  const rows: Record<string, unknown>[] = [];
  const sponsors: Record<string, unknown>[] = [];
  let errors = 0;

  for (const message of messages) {
    if (!options.full && !options.dryRun && (await state.isProcessed(message.messageId))) {
      continue;
    }

    // Buffer this message's output so that a failure part-way through does not
    // leave partial rows behind for a message that will be retried later.
    const messageRows: Record<string, unknown>[] = [];
    const messageSponsors: Record<string, unknown>[] = [];

    try {
      const parser = selectParser({ html: message.html, headers: message.headers });
      const parsed = parser.parse({ html: message.html, headers: message.headers });
      const cleaned = parsed
        .filter((link) => !isNoiseLink(link))
        .map((link) => ({
          ...link,
          normalizedUrl: cleanupUrl(link.url, {
            trackingParams: config.links.trackingParams,
            unwrapRedirects: config.links.unwrapRedirects
          })
        }));
      // The configured pattern may carry a `(?i)` marker; it is removed (first
      // occurrence only) and case-insensitivity is applied through the `i` flag.
      const merged = config.links.mergeReadMore
        ? mergeReadMoreLinks(
            cleaned,
            new RegExp(config.links.readMorePattern.replace('(?i)', ''), 'i')
          )
        : cleaned;
      // De-duplicate by normalized URL: a later duplicate replaces the stored
      // link but keeps the position of the first occurrence.
      const unique = [...new Map(merged.map((link) => [link.normalizedUrl, link])).values()];

      for (const link of unique) {
        if (isSponsorLink(link)) {
          messageSponsors.push({
            Newsletter: newsletterName(message.from),
            Sponsor: link.title,
            Link: link.normalizedUrl,
            Description: link.description ?? ''
          });
          continue;
        }
        messageRows.push({
          'Issue Date': issueDate(message.date),
          Category: await categorizer.categorize(link),
          'Link URL': link.normalizedUrl,
          Title: link.title,
          Description: link.description ?? '',
          'Page Title + Meta': '',
          'Source Newsletter': newsletterName(message.from),
          'Also In': ''
        });
      }

      if (!options.dryRun) {
        await state.markProcessed(message.messageId);
      }

      // Commit only after every step for this message has succeeded.
      rows.push(...messageRows);
      sponsors.push(...messageSponsors);
    } catch (error) {
      errors += 1;
      if (options.verbose) {
        console.error(`Failed to process message ${message.messageId}:`, error);
      }
    }
  }

  if (!options.dryRun) {
    // NOTE(review): messages were already marked processed above; if a writer
    // throws here, their rows are presumably lost on the next non-full run.
    // Confirm whether state should instead be updated after a successful write.
    for (const writer of options.writers) {
      await writer.write({ rows, sponsors, deadLinks: [] });
    }
  }

  return {
    newslettersProcessed: messages.length,
    linksExtracted: rows.length,
    sponsors: sponsors.length,
    deadLinks: 0,
    errors
  };
}