✨feature: First push to git

2026-05-16 14:02:49 -05:00
commit 265f69d95a
46 changed files with 11551 additions and 0 deletions
@@ -0,0 +1,18 @@
+import { describe, expect, it } from 'vitest';
+import { Categorizer } from '../src/categorization/categorizer.js';
+
+/**
+ * Categorizer expectations: a link that carries a newsletter section resolves
+ * to that section, and a link without one resolves via URL/keyword rules.
+ */
+describe('categorization', () => {
+  it('prefers newsletter section headers', async () => {
+    // The title and URL are deliberately generic; only `section` should matter.
+    const pending = new Categorizer().categorize({
+      title: 'Anything',
+      url: 'https://x.test',
+      section: 'Rust'
+    });
+
+    await expect(pending).resolves.toBe('Rust');
+  });
+
+  it('falls back to URL and keyword rules', async () => {
+    // No `section` is supplied here, so the category must come from the link itself.
+    const pending = new Categorizer().categorize({
+      title: 'Kubernetes security guide',
+      url: 'https://example.com/k8s'
+    });
+
+    await expect(pending).resolves.toBe('DevOps');
+  });
+});
@@ -0,0 +1,27 @@
+import { describe, expect, it } from 'vitest';
+import { validateDateFilters } from '../src/cli/flags.js';
+import { loadConfigFromString } from '../src/config/config.js';
+
+/**
+ * Config loading and CLI date-filter validation.
+ */
+describe('config validation', () => {
+  it('loads a valid YAML config with defaults', () => {
+    // Minimal document: `links` and `enrichment` are intentionally omitted so
+    // the assertions on them exercise the loader's default values.
+    const yaml = `
+gmail:
+  folder: Newsletters
+output:
+  name: Newsletter Link Catalog
+  excel:
+    enabled: true
+    path: ./output/catalog.xlsx
+`;
+    const { gmail, links, enrichment } = loadConfigFromString(yaml);
+
+    expect(gmail.folder).toBe('Newsletters');
+    expect(links.trackingParams).toContain('utm_*');
+    expect(enrichment.concurrency).toBe(3);
+  });
+
+  it('rejects conflicting relative and absolute date filters', () => {
+    // `last` (relative) and `from` (absolute) are supplied together on purpose.
+    const validateConflicting = () => validateDateFilters({ last: '30d', from: '2026-01-01' });
+
+    expect(validateConflicting).toThrow(/cannot be combined/i);
+  });
+});
@@ -0,0 +1,28 @@
+import { describe, expect, it } from 'vitest';
+import { enrichLink } from '../src/enrichment/enricher.js';
+
+/**
+ * Status classification performed by `enrichLink`, exercised with stubbed
+ * fetch callbacks passed as the second argument.
+ */
+describe('enrichment', () => {
+  it('marks dead, paywall, and unreachable links', async () => {
+    // Stub answers with HTTP 404 -> expected status 'dead'.
+    const notFound = enrichLink('https://x.test/dead', async () => ({
+      status: 404,
+      finalUrl: 'https://x.test/dead',
+      html: ''
+    }));
+    await expect(notFound).resolves.toMatchObject({
+      status: 'dead'
+    });
+
+    // Stub answers with HTTP 200 but lands on a login page -> expected 'paywall'.
+    const landedOnLogin = enrichLink('https://x.test/a', async () => ({
+      status: 200,
+      finalUrl: 'https://x.test/login',
+      html: '<title>Login</title>'
+    }));
+    await expect(landedOnLogin).resolves.toMatchObject({ status: 'paywall' });
+
+    // Stub rejects -> enrichLink is expected to resolve with 'unreachable'
+    // instead of propagating the rejection.
+    const fetchFailed = enrichLink('https://x.test/a', async () => {
+      throw new Error('timeout');
+    });
+    await expect(fetchFailed).resolves.toMatchObject({
+      status: 'unreachable'
+    });
+  });
+});
@@ -0,0 +1,26 @@
+import { describe, expect, it } from 'vitest';
+import { isNoiseLink, isSponsorLink } from '../src/links/filtering.js';
+
+/**
+ * Links that `isNoiseLink` is expected to flag: unsubscribe, footer social,
+ * share, and "view in browser" mirror links.
+ */
+describe('noise filtering', () => {
+  it('filters unsubscribe, footer social, share, and mirror links', () => {
+    const unsubscribe = isNoiseLink({ url: 'https://x.test/unsubscribe', title: 'Unsubscribe' });
+    expect(unsubscribe).toBe(true);
+
+    // Social link marked with the 'footer' context.
+    const footerSocial = isNoiseLink({
+      url: 'https://twitter.com/me',
+      title: 'Twitter',
+      context: 'footer'
+    });
+    expect(footerSocial).toBe(true);
+
+    const share = isNoiseLink({ url: 'https://x.test/share', title: 'Share this newsletter' });
+    expect(share).toBe(true);
+
+    const mirror = isNoiseLink({ url: 'https://x.test/view', title: 'View in browser' });
+    expect(mirror).toBe(true);
+  });
+});
+
+/**
+ * `isSponsorLink` should flag a link whose section and surrounding text mark
+ * it as sponsored content.
+ */
+describe('sponsor detection', () => {
+  it('detects sponsor links from section and surrounding text', () => {
+    const flagged = isSponsorLink({
+      url: 'https://sponsor.example',
+      title: 'Acme',
+      section: 'Sponsored',
+      context: 'Partner message'
+    });
+
+    expect(flagged).toBe(true);
+  });
+});
@@ -0,0 +1,13 @@
+import { describe, expect, it } from 'vitest';
+import { selectParser } from '../src/parsing/plugins.js';
+
+/**
+ * `selectParser` should pick the 'substack' parser when the list id header
+ * points at substack.com and the 'generic' parser when no headers are present.
+ */
+describe('parser plugin selection', () => {
+  it('selects Substack for Substack headers and generic otherwise', () => {
+    const forSubstack = selectParser({ headers: { listId: 'thing.substack.com' }, html: '' });
+    expect(forSubstack.name).toBe('substack');
+
+    // No identifying headers at all, only plain HTML.
+    const forPlainHtml = selectParser({
+      headers: {},
+      html: '<a href="https://example.com">Example</a>'
+    });
+    expect(forPlainHtml.name).toBe('generic');
+  });
+});
@@ -0,0 +1,44 @@
+import { mkdtemp, rm } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { afterEach, beforeEach, describe, expect, it } from 'vitest';
+import { runCatalog } from '../src/run/runCatalog.js';
+
+// Scratch directory for the test currently running; reassigned before every test.
+let dir = '';
+
+// Create a fresh temporary directory under the OS temp dir for each test.
+beforeEach(async () => {
+  const prefix = join(tmpdir(), 'nlc-run-');
+  dir = await mkdtemp(prefix);
+});
+
+// Remove the scratch directory and everything in it once the test finishes.
+afterEach(async () => {
+  await rm(dir, { recursive: true, force: true });
+});
+
+/**
+ * End-to-end run of `runCatalog` in dry-run mode with an in-memory message
+ * and a recording writer.
+ */
+describe('run orchestration', () => {
+  it('does not write output or state during dry run', async () => {
+    const stateFile = join(dir, 'state.json');
+    // Every payload handed to the stub writer is recorded here.
+    const writes: unknown[] = [];
+    const result = await runCatalog({
+      // Boolean flag, matching `skipEnrich` below (was the number 1).
+      dryRun: true,
+      skipEnrich: true,
+      config: {
+        gmail: { folder: 'Newsletters' },
+        output: { name: 'Catalog', excel: { enabled: true, path: join(dir, 'out.xlsx') } },
+        stateFile
+      },
+      messages: [
+        {
+          id: 'msg-1',
+          messageId: '<msg-1>',
+          from: 'A <a@example.com>',
+          date: '2026-05-16T00:00:00.000Z',
+          html: '<h2>Python</h2><p><a href="https://example.com?utm_source=x">Article</a></p>'
+        }
+      ],
+      writers: [{ write: async (payload) => writes.push(payload) }]
+    });
+
+    // The single anchor in the message must still be extracted...
+    expect(result.linksExtracted).toBe(1);
+    // ...but nothing may reach the writers in dry-run mode.
+    expect(writes).toHaveLength(0);
+    // NOTE(review): the title also promises that no state is written, but the
+    // absence of `stateFile` on disk is not asserted here — consider adding it.
+  });
+});
@@ -0,0 +1,15 @@
+import { describe, expect, it } from 'vitest';
+import { escapeCell, sanitizeSheetName } from '../src/output/sheets.js';
+
+/**
+ * Helpers that make sheet names and cell values safe for spreadsheet output.
+ */
+describe('sheet output helpers', () => {
+  it('sanitizes and truncates sheet names', () => {
+    // Twelve repetitions of a name full of forbidden characters, long enough
+    // to exceed the length limit asserted below.
+    const rawName = 'Bad:/\\\\?*[] name '.repeat(12);
+    const forbidden = /[:/\\?*[\]]/;
+
+    const name = sanitizeSheetName(rawName);
+
+    expect(name).not.toMatch(forbidden);
+    expect(name.length).toBeLessThanOrEqual(100);
+  });
+
+  it('escapes formula-like cell values', () => {
+    // A leading '=' must be prefixed with a single quote in the escaped value.
+    const escaped = escapeCell('=IMPORTXML("http://bad")');
+
+    expect(escaped).toBe('\'=IMPORTXML("http://bad")');
+  });
+});
@@ -0,0 +1,25 @@
+import { mkdtemp, rm } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { afterEach, beforeEach, describe, expect, it } from 'vitest';
+import { StateStore } from '../src/state/state.js';
+
+// Scratch directory for the test currently running; reassigned before every test.
+let dir = '';
+
+// Create a fresh temporary directory under the OS temp dir for each test.
+beforeEach(async () => {
+  const prefix = join(tmpdir(), 'nlc-');
+  dir = await mkdtemp(prefix);
+});
+
+// Remove the scratch directory and everything in it once the test finishes.
+afterEach(async () => {
+  await rm(dir, { recursive: true, force: true });
+});
+
+/**
+ * `StateStore` backed by a JSON file path inside the per-test scratch directory.
+ */
+describe('state persistence', () => {
+  it('tracks processed messages incrementally', async () => {
+    const statePath = join(dir, 'state.json');
+    const store = new StateStore(statePath);
+
+    // Unknown before it has been marked...
+    const seenBefore = await store.isProcessed('msg-1');
+    expect(seenBefore).toBe(false);
+
+    await store.markProcessed('msg-1');
+
+    // ...and reported as processed afterwards.
+    const seenAfter = await store.isProcessed('msg-1');
+    expect(seenAfter).toBe(true);
+  });
+});
@@ -0,0 +1,38 @@
+import { describe, expect, it } from 'vitest';
+import { cleanupUrl, mergeReadMoreLinks } from '../src/links/url.js';
+import { ExtractedLink } from '../src/parsing/types.js';
+
+/**
+ * `cleanupUrl` should unwrap a redirect wrapper and drop tracking parameters
+ * while keeping ordinary query parameters.
+ */
+describe('URL cleanup', () => {
+  it('strips tracking parameters and unwraps supported redirect URLs', () => {
+    // Redirect wrapper whose `url` parameter holds the percent-encoded target
+    // (https://example.com/post?utm_source=x&id=1), plus an outer `mc_cid`.
+    const wrapped =
+      'https://newsletter.example/redirect?url=https%3A%2F%2Fexample.com%2Fpost%3Futm_source%3Dx%26id%3D1&mc_cid=abc';
+
+    const cleaned = cleanupUrl(wrapped, {
+      trackingParams: ['utm_*', 'mc_cid'],
+      unwrapRedirects: true
+    });
+
+    expect(cleaned).toBe('https://example.com/post?id=1');
+  });
+});
+
+/**
+ * `mergeReadMoreLinks` should collapse a trailing "Read more" link into the
+ * preceding link when both share the same normalized URL.
+ */
+describe('read-more merging', () => {
+  it('merges a read-more link into the preceding link with the same normalized URL', () => {
+    const article: ExtractedLink = {
+      url: 'https://example.com/a',
+      normalizedUrl: 'https://example.com/a',
+      title: 'Great article',
+      description: 'A useful summary',
+      sourceText: 'Great article',
+      section: 'Python'
+    };
+    // Same normalized URL as the article; the raw URL differs only by a tracking parameter.
+    const readMore: ExtractedLink = {
+      url: 'https://example.com/a?utm_source=x',
+      normalizedUrl: 'https://example.com/a',
+      title: 'Read more',
+      description: '',
+      sourceText: 'Read more'
+    };
+
+    const merged = mergeReadMoreLinks([article, readMore], /^(read more)$/i);
+
+    expect(merged).toHaveLength(1);
+  });
+});