✨feature: First push to git
This commit is contained in:
@@ -0,0 +1,117 @@
|
||||
import { normalizeConfig, PartialConfig } from '../config/config.js';
|
||||
import { Categorizer } from '../categorization/categorizer.js';
|
||||
import { isNoiseLink, isSponsorLink } from '../links/filtering.js';
|
||||
import { cleanupUrl, mergeReadMoreLinks } from '../links/url.js';
|
||||
import { OutputWriter } from '../output/sheets.js';
|
||||
import { selectParser } from '../parsing/plugins.js';
|
||||
import { NewsletterMessage } from '../parsing/types.js';
|
||||
import { StateStore } from '../state/state.js';
|
||||
|
||||
/**
 * Inputs for a single {@link runCatalog} invocation.
 */
export interface RunOptions {
  /** Raw configuration; `runCatalog` passes it through `normalizeConfig` before use. */
  config: PartialConfig;
  /** Newsletter messages to parse and catalogue. */
  messages: NewsletterMessage[];
  /** Output sinks; each one receives the collected rows and sponsors unless the run is a dry run. */
  writers: OutputWriter[];
  /**
   * Dry-run switch. With `true` or a positive number, `runCatalog` neither
   * consults nor updates the processed-message state and does not call any
   * writer. A positive number additionally restricts the run to the first
   * N messages.
   */
  dryRun?: number | boolean;
  /** When set, messages are processed even if the state store reports them as already processed. */
  full?: boolean;
  /**
   * NOTE(review): not read by `runCatalog` in this file; presumably skips a
   * link-enrichment step implemented elsewhere. Confirm against the caller.
   */
  skipEnrich?: boolean;
  /**
   * NOTE(review): not read by `runCatalog` in this file; presumably runs only
   * a link-enrichment step implemented elsewhere. Confirm against the caller.
   */
  enrichOnly?: boolean;
  /** Presumably requests more detailed diagnostic output; confirm against the code that reads it. */
  verbose?: boolean;
}
|
||||
|
||||
/**
 * Counters reported by {@link runCatalog} once a run has finished.
 */
export interface RunSummary {
  /** Number of newsletter messages the run counted as processed. */
  newslettersProcessed: number;
  /** Number of catalogue rows produced, i.e. links that were not classified as sponsor links. */
  linksExtracted: number;
  /** Number of links classified as sponsor links. */
  sponsors: number;
  /** Number of dead links found; `runCatalog` as written in this file always reports 0. */
  deadLinks: number;
  /** Number of failures caught while processing individual messages. */
  errors: number;
}
|
||||
|
||||
/**
 * Derives a display name for a newsletter from an email `From` header value.
 *
 * For the `Display Name <address>` form, the text before the first `<` is
 * used. When there is no `<`, or nothing precedes it, the whole header value
 * is used instead. Outer whitespace and a leading and/or trailing double
 * quote are then removed.
 *
 * @param from - Raw `From` header value, e.g. `"Example Weekly" <news@example.com>`.
 * @returns The cleaned-up display name.
 */
function newsletterName(from: string): string {
  const match = from.match(/^(.*?)\s*</);
  // `||` (not `??`) is deliberate: an empty display name must fall back to
  // the full header value as well.
  const displayName = match?.[1] || from;
  // Trim BEFORE stripping quotes. The quote patterns are anchored to the very
  // start/end of the string, so surrounding whitespace would otherwise shield
  // a quote from removal and leave it in the result.
  return displayName.trim().replace(/^"|"$/g, '').trim();
}
|
||||
|
||||
/**
 * Converts a message date string into a `YYYY-MM-DD` calendar date.
 *
 * The date is taken from the ISO-8601 representation of the parsed instant,
 * so it is the UTC calendar date, which can differ from the date in the
 * sender's own time zone.
 *
 * @param date - Date string understood by the `Date` constructor (e.g. an email `Date` header).
 * @returns The first ten characters of the ISO timestamp, i.e. `YYYY-MM-DD`.
 * @throws RangeError If `date` cannot be parsed into a valid date.
 */
function issueDate(date: string): string {
  const parsed = new Date(date);
  // `toISOString()` would throw a bare "Invalid time value" here; raise the
  // same error type, but say which input was rejected.
  if (Number.isNaN(parsed.getTime())) {
    throw new RangeError(`Invalid issue date: ${JSON.stringify(date)}`);
  }
  return parsed.toISOString().slice(0, 10);
}
|
||||
|
||||
/**
 * Runs one cataloguing pass over a batch of newsletter messages.
 *
 * For every message the matching parser extracts its links. Noise links are
 * dropped, URLs are cleaned up, "read more" links are optionally merged, and
 * the result is de-duplicated by normalized URL. Sponsor links are collected
 * separately from regular links, which are categorized and turned into
 * catalogue rows. Unless the run is a dry run, the collected rows are handed
 * to every writer and the messages are then recorded as processed.
 *
 * Guarantees:
 * - A message contributes either all of its rows or none: if processing it
 *   throws, nothing from that message is emitted and it is counted in
 *   `errors`, so a later retry cannot duplicate partial output.
 * - Messages are recorded as processed only after every writer has
 *   succeeded, so a failed write never causes messages to be skipped on the
 *   next run.
 * - Any numeric `dryRun` value (including `0`) and `true` are dry runs: no
 *   state is read or written and no writer is called. A positive number also
 *   limits the run to the first N messages.
 *
 * @param options - Configuration, input messages, writers and run flags.
 * @returns Counters describing the run. `newslettersProcessed` counts the
 *   messages that were parsed successfully (skipped and failed messages are
 *   excluded); `deadLinks` is always 0 because no link checking happens here.
 * @throws Whatever `normalizeConfig`, the state store's `isProcessed`, or a
 *   writer's `write` throws; per-message processing failures are caught and
 *   counted instead.
 */
export async function runCatalog(options: RunOptions): Promise<RunSummary> {
  const config = normalizeConfig(options.config);
  const state = new StateStore(config.stateFile);
  const categorizer = new Categorizer(config.categories.custom);

  // A numeric dryRun of 0 is falsy, so a plain truthiness check would turn it
  // into a real run with side effects; decide dry-run mode by type instead.
  const isDryRun = options.dryRun === true || typeof options.dryRun === 'number';
  const limit = typeof options.dryRun === 'number' ? options.dryRun : undefined;
  const messages = limit ? options.messages.slice(0, limit) : options.messages;

  const rows: Record<string, unknown>[] = [];
  const sponsors: Record<string, unknown>[] = [];
  // Ids of messages handled successfully in this run; persisted after writing.
  const completedIds = new Set<NewsletterMessage['messageId']>();
  let processed = 0;
  let errors = 0;

  // Counts a failure and, in verbose mode, reports it instead of dropping it.
  const reportFailure = (context: string, error: unknown): void => {
    errors += 1;
    if (options.verbose) {
      console.error(`runCatalog: ${context}`, error);
    }
  };

  // The read-more pattern is loop-invariant: compile it once, on first use.
  // It stays lazy so that an invalid pattern is still reported per message
  // rather than aborting the whole run.
  let readMorePattern: RegExp | undefined;
  const getReadMorePattern = (): RegExp =>
    (readMorePattern ??= new RegExp(config.links.readMorePattern.replace('(?i)', ''), 'i'));

  for (const message of messages) {
    if (
      !options.full &&
      !isDryRun &&
      // Marking is deferred until after the writers ran, so also skip ids
      // that were already handled earlier in this same run.
      (completedIds.has(message.messageId) || (await state.isProcessed(message.messageId)))
    ) {
      continue;
    }

    try {
      const parser = selectParser({ html: message.html, headers: message.headers });
      const parsed = parser.parse({ html: message.html, headers: message.headers });
      const cleaned = parsed
        .filter((link) => !isNoiseLink(link))
        .map((link) => ({
          ...link,
          normalizedUrl: cleanupUrl(link.url, {
            trackingParams: config.links.trackingParams,
            unwrapRedirects: config.links.unwrapRedirects
          })
        }));
      const merged = config.links.mergeReadMore
        ? mergeReadMoreLinks(cleaned, getReadMorePattern())
        : cleaned;
      // De-duplicate by normalized URL; for duplicates the last link wins.
      const unique = [...new Map(merged.map((link) => [link.normalizedUrl, link])).values()];

      // Buffer this message's output so that a failure halfway through does
      // not leave partial rows in the shared result arrays.
      const messageRows: Record<string, unknown>[] = [];
      const messageSponsors: Record<string, unknown>[] = [];

      for (const link of unique) {
        if (isSponsorLink(link)) {
          messageSponsors.push({
            Newsletter: newsletterName(message.from),
            Sponsor: link.title,
            Link: link.normalizedUrl,
            Description: link.description ?? ''
          });
          continue;
        }

        messageRows.push({
          'Issue Date': issueDate(message.date),
          Category: await categorizer.categorize(link),
          'Link URL': link.normalizedUrl,
          Title: link.title,
          Description: link.description ?? '',
          'Page Title + Meta': '',
          'Source Newsletter': newsletterName(message.from),
          'Also In': ''
        });
      }

      rows.push(...messageRows);
      sponsors.push(...messageSponsors);
      completedIds.add(message.messageId);
      processed += 1;
    } catch (error: unknown) {
      reportFailure(`failed to process message ${String(message.messageId)}`, error);
    }
  }

  if (!isDryRun) {
    for (const writer of options.writers) {
      await writer.write({ rows, sponsors, deadLinks: [] });
    }

    // Persist progress only now that the output has been written; doing it
    // earlier would lose these messages for good if a writer failed.
    for (const messageId of completedIds) {
      try {
        await state.markProcessed(messageId);
      } catch (error: unknown) {
        reportFailure(`failed to mark message ${String(messageId)} as processed`, error);
      }
    }
  }

  return {
    newslettersProcessed: processed,
    linksExtracted: rows.length,
    sponsors: sponsors.length,
    deadLinks: 0,
    errors
  };
}
|
||||
Reference in New Issue
Block a user