✨feature: First push to git
This commit is contained in:
@@ -0,0 +1,18 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { Categorizer } from '../src/categorization/categorizer.js';
|
||||
|
||||
// Exercises Categorizer.categorize and checks the category it resolves to.
describe('categorization', () => {
  it('prefers newsletter section headers', async () => {
    const categorizer = new Categorizer();

    // The link carries an explicit section; the expected category equals it.
    const pending = categorizer.categorize({
      title: 'Anything',
      url: 'https://x.test',
      section: 'Rust'
    });

    await expect(pending).resolves.toBe('Rust');
  });

  it('falls back to URL and keyword rules', async () => {
    const categorizer = new Categorizer();

    // No section is supplied here, only a title and a URL.
    const pending = categorizer.categorize({
      title: 'Kubernetes security guide',
      url: 'https://example.com/k8s'
    });

    await expect(pending).resolves.toBe('DevOps');
  });
});
|
||||
@@ -0,0 +1,27 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { validateDateFilters } from '../src/cli/flags.js';
|
||||
import { loadConfigFromString } from '../src/config/config.js';
|
||||
|
||||
// Covers YAML config loading (with defaults) and CLI date-filter validation.
describe('config validation', () => {
  it('loads a valid YAML config with defaults', () => {
    // The YAML is assembled from explicit lines so that the nesting
    // (gmail.folder, output.name, output.excel.*) is carried by spaces inside
    // the string literals and cannot be lost to source re-indentation.
    const yaml = [
      'gmail:',
      '  folder: Newsletters',
      'output:',
      '  name: Newsletter Link Catalog',
      '  excel:',
      '    enabled: true',
      '    path: ./output/catalog.xlsx'
    ].join('\n');

    const config = loadConfigFromString(yaml);

    // Value taken directly from the YAML above.
    expect(config.gmail.folder).toBe('Newsletters');
    // Neither `links` nor `enrichment` appears in the YAML, so these two
    // expectations can only be met by values filled in by the loader.
    expect(config.links.trackingParams).toContain('utm_*');
    expect(config.enrichment.concurrency).toBe(3);
  });

  it('rejects conflicting relative and absolute date filters', () => {
    // `last` (relative) and `from` (absolute) are supplied together.
    const validateConflicting = (): unknown =>
      validateDateFilters({ last: '30d', from: '2026-01-01' });

    expect(validateConflicting).toThrow(/cannot be combined/i);
  });
});
|
||||
@@ -0,0 +1,28 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { enrichLink } from '../src/enrichment/enricher.js';
|
||||
|
||||
// Drives enrichLink with stub fetchers and checks the resulting link status.
describe('enrichment', () => {
  it('marks dead, paywall, and unreachable links', async () => {
    // Fetcher reports HTTP 404 for the requested URL.
    const deadLink = enrichLink('https://x.test/dead', async () => ({
      status: 404,
      finalUrl: 'https://x.test/dead',
      html: ''
    }));
    await expect(deadLink).resolves.toMatchObject({ status: 'dead' });

    // Fetcher reports HTTP 200 but ends on a /login URL with a "Login" title.
    const paywalledLink = enrichLink('https://x.test/a', async () => ({
      status: 200,
      finalUrl: 'https://x.test/login',
      html: '<title>Login</title>'
    }));
    await expect(paywalledLink).resolves.toMatchObject({ status: 'paywall' });

    // Fetcher rejects; enrichLink is still expected to resolve.
    const unreachableLink = enrichLink('https://x.test/a', async () => {
      throw new Error('timeout');
    });
    await expect(unreachableLink).resolves.toMatchObject({ status: 'unreachable' });
  });
});
|
||||
@@ -0,0 +1,26 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { isNoiseLink, isSponsorLink } from '../src/links/filtering.js';
|
||||
|
||||
// Each sample link below is expected to be classified as noise.
describe('noise filtering', () => {
  it('filters unsubscribe, footer social, share, and mirror links', () => {
    const unsubscribe = isNoiseLink({ url: 'https://x.test/unsubscribe', title: 'Unsubscribe' });
    expect(unsubscribe).toBe(true);

    // Social link that additionally carries a 'footer' context.
    const footerSocial = isNoiseLink({
      url: 'https://twitter.com/me',
      title: 'Twitter',
      context: 'footer'
    });
    expect(footerSocial).toBe(true);

    const share = isNoiseLink({ url: 'https://x.test/share', title: 'Share this newsletter' });
    expect(share).toBe(true);

    const mirror = isNoiseLink({ url: 'https://x.test/view', title: 'View in browser' });
    expect(mirror).toBe(true);
  });
});
|
||||
|
||||
// Checks that isSponsorLink flags a link placed in a sponsored section.
describe('sponsor detection', () => {
  it('detects sponsor links from section and surrounding text', () => {
    const flagged = isSponsorLink({
      url: 'https://sponsor.example',
      title: 'Acme',
      section: 'Sponsored',
      context: 'Partner message'
    });

    expect(flagged).toBe(true);
  });
});
|
||||
@@ -0,0 +1,13 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { selectParser } from '../src/parsing/plugins.js';
|
||||
|
||||
// Checks which parser plugin selectParser picks, identified by its `name`.
describe('parser plugin selection', () => {
  it('selects Substack for Substack headers and generic otherwise', () => {
    // A list id on a substack.com host should pick the Substack parser.
    const forSubstack = selectParser({ headers: { listId: 'thing.substack.com' }, html: '' });
    expect(forSubstack.name).toBe('substack');

    // No headers at all: only plain HTML is available.
    const forPlainHtml = selectParser({
      headers: {},
      html: '<a href="https://example.com">Example</a>'
    });
    expect(forPlainHtml.name).toBe('generic');
  });
});
|
||||
@@ -0,0 +1,44 @@
|
||||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { runCatalog } from '../src/run/runCatalog.js';
|
||||
|
||||
// Scratch directory for the current test; reassigned before every test.
let dir = '';

beforeEach(async () => {
  const prefix = join(tmpdir(), 'nlc-run-');
  dir = await mkdtemp(prefix);
});

afterEach(async () => {
  // Remove the scratch directory and everything created inside it.
  await rm(dir, { recursive: true, force: true });
});
|
||||
|
||||
// Runs the catalog pipeline in dry-run mode against one in-memory message.
describe('run orchestration', () => {
  it('does not write output or state during dry run', async () => {
    // Both paths live in the per-test scratch directory.
    const stateFile = join(dir, 'state.json');
    // Every payload handed to the stub writer is recorded here.
    const writes: unknown[] = [];

    const result = await runCatalog({
      // Pass a boolean, consistent with `skipEnrich`, rather than relying on
      // a truthy number.
      dryRun: true,
      skipEnrich: true,
      config: {
        gmail: { folder: 'Newsletters' },
        output: { name: 'Catalog', excel: { enabled: true, path: join(dir, 'out.xlsx') } },
        stateFile
      },
      messages: [
        {
          id: 'msg-1',
          messageId: '<msg-1>',
          from: 'A <a@example.com>',
          date: '2026-05-16T00:00:00.000Z',
          // A single anchor, so exactly one link is expected to be extracted.
          html: '<h2>Python</h2><p><a href="https://example.com?utm_source=x">Article</a></p>'
        }
      ],
      writers: [{ write: async (payload) => writes.push(payload) }]
    });

    expect(result.linksExtracted).toBe(1);
    // The stub writer must never have been invoked.
    expect(writes).toHaveLength(0);
  });
});
|
||||
@@ -0,0 +1,15 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { escapeCell, sanitizeSheetName } from '../src/output/sheets.js';
|
||||
|
||||
// Covers the sheet-name sanitizer and the cell-value escaper.
describe('sheet output helpers', () => {
  it('sanitizes and truncates sheet names', () => {
    // Twelve repetitions make the raw name far longer than the 100-character
    // bound asserted below.
    const rawName = 'Bad:/\\\\?*[] name '.repeat(12);

    const name = sanitizeSheetName(rawName);

    // None of : / \ ? * [ ] may remain in the result.
    expect(name).not.toMatch(/[:/\\?*[\]]/);
    expect(name.length).toBeLessThanOrEqual(100);
  });

  it('escapes formula-like cell values', () => {
    const escaped = escapeCell('=IMPORTXML("http://bad")');

    // The expected output is the input prefixed with a single quote.
    expect(escaped).toBe('\'=IMPORTXML("http://bad")');
  });
});
|
||||
@@ -0,0 +1,25 @@
|
||||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { StateStore } from '../src/state/state.js';
|
||||
|
||||
// Scratch directory for the current test; reassigned before every test.
let dir = '';

beforeEach(async () => {
  const prefix = join(tmpdir(), 'nlc-');
  dir = await mkdtemp(prefix);
});

afterEach(async () => {
  // Remove the scratch directory and everything created inside it.
  await rm(dir, { recursive: true, force: true });
});
|
||||
|
||||
// Checks that StateStore reports a message as processed only after marking it.
describe('state persistence', () => {
  it('tracks processed messages incrementally', async () => {
    const statePath = join(dir, 'state.json');
    const store = new StateStore(statePath);

    const seenBefore = await store.isProcessed('msg-1');
    expect(seenBefore).toBe(false);

    await store.markProcessed('msg-1');

    const seenAfter = await store.isProcessed('msg-1');
    expect(seenAfter).toBe(true);
  });
});
|
||||
@@ -0,0 +1,38 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { cleanupUrl, mergeReadMoreLinks } from '../src/links/url.js';
|
||||
import { ExtractedLink } from '../src/parsing/types.js';
|
||||
|
||||
// Checks cleanupUrl on a redirect wrapper that embeds the real target URL.
describe('URL cleanup', () => {
  it('strips tracking parameters and unwraps supported redirect URLs', () => {
    // The `url` query parameter holds the percent-encoded target
    // https://example.com/post?utm_source=x&id=1; `mc_cid` sits on the wrapper.
    const wrapped =
      'https://newsletter.example/redirect?url=https%3A%2F%2Fexample.com%2Fpost%3Futm_source%3Dx%26id%3D1&mc_cid=abc';

    const result = cleanupUrl(wrapped, {
      trackingParams: ['utm_*', 'mc_cid'],
      unwrapRedirects: true
    });

    expect(result).toBe('https://example.com/post?id=1');
  });
});
|
||||
|
||||
// Checks that two links sharing a normalized URL collapse into one entry.
describe('read-more merging', () => {
  it('merges a read-more link into the preceding link with the same normalized URL', () => {
    const article: ExtractedLink = {
      url: 'https://example.com/a',
      normalizedUrl: 'https://example.com/a',
      title: 'Great article',
      description: 'A useful summary',
      sourceText: 'Great article',
      section: 'Python'
    };
    // Same normalized URL as `article`, but titled "Read more".
    const readMore: ExtractedLink = {
      url: 'https://example.com/a?utm_source=x',
      normalizedUrl: 'https://example.com/a',
      title: 'Read more',
      description: '',
      sourceText: 'Read more'
    };
    const links: ExtractedLink[] = [article, readMore];

    const merged = mergeReadMoreLinks(links, /^(read more)$/i);

    expect(merged).toHaveLength(1);
  });
});
|
||||
Reference in New Issue
Block a user