feature: First push to git

This commit is contained in:
Keith Solomon
2026-05-16 14:02:49 -05:00
commit 265f69d95a
46 changed files with 11551 additions and 0 deletions
+18
View File
@@ -0,0 +1,18 @@
import { describe, expect, it } from 'vitest';
import { Categorizer } from '../src/categorization/categorizer.js';
/**
 * Categorizer checks: each case hands one link-like object to
 * `Categorizer#categorize` and asserts the value its promise resolves to.
 */
describe('categorization', () => {
  it('prefers newsletter section headers', async () => {
    // The input carries an explicit section, and that same value is the expected category.
    const sectionedLink = { title: 'Anything', url: 'https://x.test', section: 'Rust' };
    const category = new Categorizer().categorize(sectionedLink);

    await expect(category).resolves.toBe('Rust');
  });

  it('falls back to URL and keyword rules', async () => {
    // No section is supplied here; only the title and URL are available to the categorizer.
    const unsectionedLink = {
      title: 'Kubernetes security guide',
      url: 'https://example.com/k8s'
    };
    const category = new Categorizer().categorize(unsectionedLink);

    await expect(category).resolves.toBe('DevOps');
  });
});
+27
View File
@@ -0,0 +1,27 @@
import { describe, expect, it } from 'vitest';
import { validateDateFilters } from '../src/cli/flags.js';
import { loadConfigFromString } from '../src/config/config.js';
/**
 * Config and CLI flag validation checks.
 *
 * - `loadConfigFromString` is given a minimal YAML document and must return the
 *   supplied values plus values for keys the document does not mention.
 * - `validateDateFilters` must throw when a relative (`last`) and an absolute
 *   (`from`) filter are passed together.
 */
describe('config validation', () => {
  it('loads a valid YAML config with defaults', () => {
    // YAML nesting is expressed purely through indentation, so the fixture is
    // assembled line by line with the indentation spelled out. With every key
    // at column 0 the document would be a flat mapping and `gmail.folder`
    // would not exist.
    const yaml = [
      '',
      'gmail:',
      '  folder: Newsletters',
      'output:',
      '  name: Newsletter Link Catalog',
      '  excel:',
      '    enabled: true',
      '    path: ./output/catalog.xlsx',
      ''
    ].join('\n');

    const config = loadConfigFromString(yaml);

    // Value taken directly from the document.
    expect(config.gmail.folder).toBe('Newsletters');
    // `links` and `enrichment` are absent from the document, so these values
    // have to come from the loader rather than from the input.
    expect(config.links.trackingParams).toContain('utm_*');
    expect(config.enrichment.concurrency).toBe(3);
  });

  it('rejects conflicting relative and absolute date filters', () => {
    const conflictingFilters = { last: '30d', from: '2026-01-01' };

    expect(() => validateDateFilters(conflictingFilters)).toThrow(/cannot be combined/i);
  });
});
+28
View File
@@ -0,0 +1,28 @@
import { describe, expect, it } from 'vitest';
import { enrichLink } from '../src/enrichment/enricher.js';
/**
 * Link enrichment status checks. `enrichLink` receives a URL and a stub fetcher;
 * each stub simulates one outcome and the resolved result's `status` is asserted.
 */
describe('enrichment', () => {
  it('marks dead, paywall, and unreachable links', async () => {
    // Stub: the fetch completes with HTTP 404 and an empty body.
    const fetchNotFound = async () => ({
      status: 404,
      finalUrl: 'https://x.test/dead',
      html: ''
    });
    // Stub: the fetch completes with HTTP 200 but lands on a /login URL with a "Login" title.
    const fetchLoginPage = async () => ({
      status: 200,
      finalUrl: 'https://x.test/login',
      html: '<title>Login</title>'
    });
    // Stub: the fetch itself rejects.
    const fetchTimesOut = async () => {
      throw new Error('timeout');
    };

    const dead = enrichLink('https://x.test/dead', fetchNotFound);
    await expect(dead).resolves.toMatchObject({ status: 'dead' });

    const paywalled = enrichLink('https://x.test/a', fetchLoginPage);
    await expect(paywalled).resolves.toMatchObject({ status: 'paywall' });

    // A rejecting fetcher must still yield a resolved result, not a rejection.
    const unreachable = enrichLink('https://x.test/a', fetchTimesOut);
    await expect(unreachable).resolves.toMatchObject({ status: 'unreachable' });
  });
});
+26
View File
@@ -0,0 +1,26 @@
import { describe, expect, it } from 'vitest';
import { isNoiseLink, isSponsorLink } from '../src/links/filtering.js';
/**
 * Noise-link checks: every sample below must be reported as noise by `isNoiseLink`.
 */
describe('noise filtering', () => {
  it('filters unsubscribe, footer social, share, and mirror links', () => {
    // Shared assertion so each sample reads as a single line.
    const expectNoise = (link: Parameters<typeof isNoiseLink>[0]): void => {
      expect(isNoiseLink(link)).toBe(true);
    };

    // Unsubscribe link.
    expectNoise({ url: 'https://x.test/unsubscribe', title: 'Unsubscribe' });
    // Social profile link whose context is the footer.
    expectNoise({ url: 'https://twitter.com/me', title: 'Twitter', context: 'footer' });
    // Share prompt.
    expectNoise({ url: 'https://x.test/share', title: 'Share this newsletter' });
    // "View in browser" mirror link.
    expectNoise({ url: 'https://x.test/view', title: 'View in browser' });
  });
});
/**
 * Sponsor-link check: a link whose section is "Sponsored" and whose context is
 * "Partner message" must be reported as a sponsor link by `isSponsorLink`.
 */
describe('sponsor detection', () => {
  it('detects sponsor links from section and surrounding text', () => {
    const sponsoredLink = {
      url: 'https://sponsor.example',
      title: 'Acme',
      section: 'Sponsored',
      context: 'Partner message'
    };

    const flagged = isSponsorLink(sponsoredLink);

    expect(flagged).toBe(true);
  });
});
+13
View File
@@ -0,0 +1,13 @@
import { describe, expect, it } from 'vitest';
import { selectParser } from '../src/parsing/plugins.js';
/**
 * Parser selection check: `selectParser` is called with a message's headers and
 * HTML, and the `name` of the parser it returns is asserted.
 */
describe('parser plugin selection', () => {
  it('selects Substack for Substack headers and generic otherwise', () => {
    // A list id on a substack.com host should pick the Substack parser.
    const forSubstack = selectParser({ headers: { listId: 'thing.substack.com' }, html: '' });
    expect(forSubstack.name).toBe('substack');

    // With no headers at all, the generic parser is expected.
    const forAnythingElse = selectParser({
      headers: {},
      html: '<a href="https://example.com">Example</a>'
    });
    expect(forAnythingElse.name).toBe('generic');
  });
});
+44
View File
@@ -0,0 +1,44 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { runCatalog } from '../src/run/runCatalog.js';
// Scratch directory for the current test; reassigned before every test runs.
let dir = '';

// Create a fresh, uniquely named directory under the OS temp dir.
beforeEach(async () => {
  const prefix = join(tmpdir(), 'nlc-run-');
  dir = await mkdtemp(prefix);
});

// Remove the scratch directory and everything inside it once the test is done.
afterEach(async () => {
  await rm(dir, { recursive: true, force: true });
});
/**
 * End-to-end check of `runCatalog` in dry-run mode.
 *
 * One in-memory message containing a single link is fed through the run with a
 * recording writer. The run must report the extracted link while the writer
 * receives nothing.
 */
describe('run orchestration', () => {
  it('does not write output or state during dry run', async () => {
    const stateFile = join(dir, 'state.json');
    // Every payload handed to the stub writer is collected here.
    const writes: unknown[] = [];

    const result = await runCatalog({
      // Boolean flag, matching `skipEnrich` below (was the number 1).
      dryRun: true,
      skipEnrich: true,
      config: {
        gmail: { folder: 'Newsletters' },
        output: { name: 'Catalog', excel: { enabled: true, path: join(dir, 'out.xlsx') } },
        stateFile
      },
      messages: [
        {
          id: 'msg-1',
          messageId: '<msg-1>',
          from: 'A <a@example.com>',
          date: '2026-05-16T00:00:00.000Z',
          html: '<h2>Python</h2><p><a href="https://example.com?utm_source=x">Article</a></p>'
        }
      ],
      writers: [
        {
          // Record the payload and resolve with nothing, rather than leaking
          // the array length that `push` returns.
          write: async (payload) => {
            writes.push(payload);
          }
        }
      ]
    });

    // The link in the message body is still extracted during a dry run...
    expect(result.linksExtracted).toBe(1);
    // ...but no writer is invoked.
    expect(writes).toHaveLength(0);
    // NOTE(review): the test title also promises that no state is written, yet
    // nothing here checks that `stateFile` is absent afterwards — consider adding that.
  });
});
+15
View File
@@ -0,0 +1,15 @@
import { describe, expect, it } from 'vitest';
import { escapeCell, sanitizeSheetName } from '../src/output/sheets.js';
/**
 * Checks for the sheet helpers `sanitizeSheetName` and `escapeCell`.
 */
describe('sheet output helpers', () => {
  it('sanitizes and truncates sheet names', () => {
    // Twelve repetitions of a 17-character fragment: 204 characters in total.
    const rawName = 'Bad:/\\\\?*[] name '.repeat(12);
    // Characters that must not survive sanitizing.
    const forbiddenChars = /[:/\\?*[\]]/;

    const name = sanitizeSheetName(rawName);

    expect(name).not.toMatch(forbiddenChars);
    expect(name.length).toBeLessThanOrEqual(100);
  });

  it('escapes formula-like cell values', () => {
    const formula = '=IMPORTXML("http://bad")';

    const escaped = escapeCell(formula);

    // Expected output is the input prefixed with a single quote.
    expect(escaped).toBe('\'=IMPORTXML("http://bad")');
  });
});
+25
View File
@@ -0,0 +1,25 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { StateStore } from '../src/state/state.js';
// Scratch directory for the current test; reassigned before every test runs.
let dir = '';

// Create a fresh, uniquely named directory under the OS temp dir.
beforeEach(async () => {
  const prefix = join(tmpdir(), 'nlc-');
  dir = await mkdtemp(prefix);
});

// Remove the scratch directory and everything inside it once the test is done.
afterEach(async () => {
  await rm(dir, { recursive: true, force: true });
});
/**
 * StateStore check: a message id is unprocessed until `markProcessed` is called
 * for it, and processed afterwards.
 */
describe('state persistence', () => {
  it('tracks processed messages incrementally', async () => {
    const messageId = 'msg-1';
    // The store is pointed at a file inside the per-test scratch directory.
    const store = new StateStore(join(dir, 'state.json'));

    const seenBefore = await store.isProcessed(messageId);
    expect(seenBefore).toBe(false);

    await store.markProcessed(messageId);

    const seenAfter = await store.isProcessed(messageId);
    expect(seenAfter).toBe(true);
  });
});
+38
View File
@@ -0,0 +1,38 @@
import { describe, expect, it } from 'vitest';
import { cleanupUrl, mergeReadMoreLinks } from '../src/links/url.js';
import { ExtractedLink } from '../src/parsing/types.js';
/**
 * `cleanupUrl` check: a redirect wrapper whose `url` parameter holds an encoded
 * target must come back as the target itself, minus the tracking parameters.
 */
describe('URL cleanup', () => {
  it('strips tracking parameters and unwraps supported redirect URLs', () => {
    // Outer URL: redirect wrapper carrying `mc_cid`.
    // Encoded inner URL: https://example.com/post?utm_source=x&id=1
    const wrappedUrl =
      'https://newsletter.example/redirect?url=https%3A%2F%2Fexample.com%2Fpost%3Futm_source%3Dx%26id%3D1&mc_cid=abc';

    const result = cleanupUrl(wrappedUrl, {
      trackingParams: ['utm_*', 'mc_cid'],
      unwrapRedirects: true
    });

    // Only the non-tracking `id` parameter of the inner URL remains.
    expect(result).toBe('https://example.com/post?id=1');
  });
});
/**
 * `mergeReadMoreLinks` check: two consecutive links sharing one normalized URL,
 * the second titled "Read more", must collapse into a single entry.
 */
describe('read-more merging', () => {
  it('merges a read-more link into the preceding link with the same normalized URL', () => {
    // The real article link, with title, description and section.
    const article: ExtractedLink = {
      url: 'https://example.com/a',
      normalizedUrl: 'https://example.com/a',
      title: 'Great article',
      description: 'A useful summary',
      sourceText: 'Great article',
      section: 'Python'
    };
    // A trailing "Read more" link: different raw URL, same normalized URL.
    const readMore: ExtractedLink = {
      url: 'https://example.com/a?utm_source=x',
      normalizedUrl: 'https://example.com/a',
      title: 'Read more',
      description: '',
      sourceText: 'Read more'
    };

    const merged = mergeReadMoreLinks([article, readMore], /^(read more)$/i);

    expect(merged).toHaveLength(1);
  });
});