feature: First push to git

This commit is contained in:
Keith Solomon
2026-05-16 14:02:49 -05:00
commit 265f69d95a
46 changed files with 11551 additions and 0 deletions
+49
View File
@@ -0,0 +1,49 @@
// ESLint configuration for the project (CommonJS config file).
// TypeScript sources are parsed with @typescript-eslint/parser; plain JavaScript
// files fall back to espree through the `overrides` entry at the bottom.
module.exports = {
// Marks this file as the root config.
root: true,
// Build output and installed dependencies are never linted.
ignorePatterns: ['dist/**', 'node_modules/**'],
env: {
es2022: true,
node: true
},
parser: '@typescript-eslint/parser',
parserOptions: {
// Points the TypeScript parser at the project's tsconfig.
project: './tsconfig.json',
sourceType: 'module'
},
plugins: ['@typescript-eslint', 'import'],
// NOTE(review): 'prettier' is listed last, presumably so it can switch off
// formatting rules from the earlier presets — confirm against eslint-config-prettier docs.
extends: ['airbnb-base', 'plugin:@typescript-eslint/recommended', 'prettier'],
rules: {
'import/extensions': 'off',
// Only the listed test, tooling-config and script files may import devDependencies.
'import/no-extraneous-dependencies': [
'error',
{
devDependencies: ['tests/**/*.ts', 'vitest.config.ts', 'tsup.config.ts', 'scripts/**/*.mjs']
}
],
// Rules from the extended presets that are disabled for this project.
'no-console': 'off',
'no-restricted-syntax': 'off',
'class-methods-use-this': 'off',
camelcase: 'off',
'default-param-last': 'off',
'import/no-unresolved': 'off',
'import/prefer-default-export': 'off',
'max-classes-per-file': 'off',
'no-await-in-loop': 'off',
'no-continue': 'off',
'no-empty-function': 'off',
'no-use-before-define': 'off',
'no-useless-constructor': 'off',
'@typescript-eslint/no-explicit-any': 'off',
// Unused variables are errors, except arguments whose names start with an underscore.
'@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }]
},
overrides: [
{
// Plain JavaScript files use espree instead of the TypeScript parser.
files: ['*.js', '*.cjs', '*.mjs'],
parser: 'espree',
parserOptions: {
ecmaVersion: 2022,
sourceType: 'module'
}
}
]
};
+3
View File
@@ -0,0 +1,3 @@
node_modules
dist
config.yaml
+7
View File
@@ -0,0 +1,7 @@
.git
node_modules
dist
.vscode
notes
PROMPT.md
SPEC.md
+5
View File
@@ -0,0 +1,5 @@
{
"singleQuote": true,
"trailingComma": "none",
"printWidth": 100
}
+16
View File
@@ -0,0 +1,16 @@
{
"workbench.colorCustomizations": {
"tree.indentGuidesStroke": "#3d92ec",
"activityBar.background": "#053610",
"titleBar.activeBackground": "#074B17",
"titleBar.activeForeground": "#EEFDF1",
"titleBar.inactiveBackground": "#053610",
"titleBar.inactiveForeground": "#EEFDF1",
"statusBar.background": "#053610",
"statusBar.foreground": "#EEFDF1",
"statusBar.debuggingBackground": "#053610",
"statusBar.debuggingForeground": "#EEFDF1",
"statusBar.noFolderBackground": "#053610",
"statusBar.noFolderForeground": "#EEFDF1"
}
}
+68
View File
@@ -0,0 +1,68 @@
# Newsletter Link Catalog
`nlc` is a TypeScript/Node.js CLI for cataloging links from newsletters in a configured Gmail label into Google Sheets and/or a local Excel workbook.
## Commands
```bash
nlc init
nlc run --dry-run
nlc run
nlc run --from 2026-05-01 --to 2026-05-16
nlc run --last 30d
nlc run --enrich-only
```
## Setup
1. Install dependencies with `npm install`.
2. Run `npm run build`.
3. Run `node dist/index.js init` to create `config.yaml`.
4. Place OAuth client JSON files in the configured local paths, typically:
- `~/.nlc/gmail-credentials.json`
- `~/.nlc/sheets-credentials.json`
5. Run `node dist/index.js run --dry-run` before live writes.
Tokens are persisted locally under `~/.nlc` and must not be committed.
## Configuration
Start from [config.example.yaml](config.example.yaml). The important choices are:
- `gmail.folder`: the single Gmail label/folder to process.
- `output.excel.enabled`: writes a local `.xlsx` file.
- `output.sheets_api.enabled`: enables Google Sheets integration when credentials and spreadsheet ID are configured.
- `links.tracking_params`: query parameters stripped during URL normalization.
- `categories.llm`: optional BYOK categorization provider.
## Build and Distribution
The build uses `tsup` for the JavaScript bundle and `@yao-pkg/pkg` for the standalone executable:
```bash
npm run build
```
This bundles `src/index.ts` to `dist/index.js`, adds a Node shebang, emits types, and packages the current-platform executable as `dist/nlc.exe` on Windows or `dist/nlc` on macOS/Linux. The packaged artifact embeds the Node runtime for operational use without a separate Node install.
## Validation
Local validation does not need Gmail, Sheets, or LLM credentials:
```bash
npm run lint
npm run format:check
npm run typecheck
npm test
npm run build
npm run smoke
```
`npm run smoke` exercises `nlc --help`, `nlc init --help`, `nlc run --help`, and a fixture-backed dry run.
## Safety Notes
- Formula-like spreadsheet cells are escaped before output.
- Dry runs do not write output files or state.
- Live integrations are isolated behind adapters so tests use fakes.
- Individual email/link failures are counted and processing continues; critical config/write failures stop the command.
+74
View File
@@ -0,0 +1,74 @@
gmail:
folder: 'Newsletters'
credentials: '~/.nlc/gmail-credentials.json'
token: '~/.nlc/gmail-token.json'
output:
name: 'Newsletter Link Catalog'
sheets_api:
enabled: false
credentials: '~/.nlc/sheets-credentials.json'
token: '~/.nlc/sheets-token.json'
spreadsheet_id: ''
excel:
enabled: true
path: './output/newsletter-catalog.xlsx'
newsletters:
'sender@example.com':
display_name: 'Example Newsletter'
date_override: 'subject'
date_format: '%B %d, %Y'
links:
unwrap_redirects: true
strip_utm: true
tracking_params:
- 'utm_*'
- 'fbclid'
- 'gclid'
- 'mc_cid'
- 'mc_eid'
redirect_limit: 5
read_more_pattern: '(?i)^(read more|continue reading|learn more)$'
share_patterns:
- '(?i)share'
- '(?i)forward to a friend'
sponsor_markers:
- '(?i)sponsor'
- '(?i)sponsored'
- '(?i)advertisement'
- '(?i)partner'
filter_unsubscribe: true
filter_social_footer: true
filter_share_links: true
merge_read_more: true
categories:
custom:
- 'AI/ML'
- 'Career'
- 'Rust'
llm:
provider: 'anthropic'
model: 'claude-sonnet-4-6'
api_key_env: 'ANTHROPIC_API_KEY'
base_url: null
failure_category: 'Uncategorized'
enrichment:
enabled: true
concurrency: 3
delay_ms: 1500
retries: 2
timeout_ms: 10000
rate_limit:
gmail_qps: 5
link_concurrency: 3
state_file: '~/.nlc/state.json'
plugins:
substack:
enabled: true
@@ -0,0 +1,63 @@
# Newsletter Link Catalog Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Build a production-quality TypeScript CLI named `nlc` that extracts newsletter links from Gmail, cleans and categorizes them, enriches metadata, writes Google Sheets and Excel outputs, and supports credential-free local validation.
**Architecture:** The CLI is split into small modules for command parsing, config validation, Gmail access, parsing plugins, link cleanup, categorization, LLM adapters, output writers, enrichment, state, and orchestration. Live external services are isolated behind interfaces so tests and smoke runs use fixtures and fakes.
**Tech Stack:** Node.js, TypeScript, Commander, Cheerio, Zod, googleapis, xlsx, Vitest, ESLint, Prettier, tsup.
---
### Task 1: Scaffold and Tests
**Files:**
- Create: `package.json`, `tsconfig.json`, `.eslintrc.cjs`, `.prettierrc.json`, `vitest.config.ts`
- Create tests in `tests/*.test.ts`
- [x] Write tests covering config validation, date conflicts, sheet names, URL cleanup, read-more merging, noise filtering, sponsor detection, categorization, state behavior, enrichment statuses, dry-run suppression, and parser plugin selection.
- [x] Run tests once to confirm they fail before production modules exist.
### Task 2: Core Modules
**Files:**
- Create modules under `src/config`, `src/links`, `src/parsing`, `src/categorization`, `src/state`, `src/enrichment`, `src/output`
- [x] Implement the minimal production behavior required for the tests.
- [x] Keep integrations behind interfaces and dependency injection.
### Task 3: CLI and Integrations
**Files:**
- Create `src/index.ts`, `src/cli/*`, `src/gmail/*`, `src/llm/*`, `src/run/*`, `scripts/smoke.mjs`
- [x] Implement `nlc init`, `nlc run`, help text, date flag validation, dry run, enrichment-only, and fixture-backed smoke execution.
- [x] Add OAuth/browser-flow boundaries for Gmail and Sheets without requiring live credentials during automated tests.
- [x] Add provider adapters for Anthropic, OpenAI, local endpoints, and OpenAI-compatible endpoints.
### Task 4: Docs and Build
**Files:**
- Create `README.md`, `config.example.yaml`
- [x] Document setup, OAuth files, token persistence, Google Sheets and Excel output, binary build tooling, and local smoke flow.
- [x] Configure `npm run build` to compile and bundle the CLI.
### Task 5: Validation
**Commands:**
- [ ] `npm install`
- [ ] `npm run lint`
- [ ] `npm run format:check`
- [ ] `npm run typecheck`
- [ ] `npm test`
- [ ] `npm run build`
- [ ] `npm run smoke`
**Self-review:** The plan maps every SPEC.md subsystem to a module or integration boundary. Live Gmail, Sheets, and LLM validation require user credentials, so local validation uses fakes and fixtures while docs describe operational setup.
+158
View File
@@ -0,0 +1,158 @@
/goal
<task>
You are an autonomous senior engineer working in:
C:\Users\ksolo\Projects\Misc Projects\Newletter Link Catalog
Implement the Newsletter Link Catalog CLI described in SPEC.md end-to-end.
The expected product is a TypeScript/Node.js CLI named `nlc` with:
- `nlc init`
- `nlc run [flags]`
- Gmail OAuth browser auth and local token persistence
- config-driven Gmail label/folder processing
- HTML newsletter parsing and link extraction
- noise filtering, tracking URL cleanup, redirect unwrapping, read-more merging
- hybrid categorization using section headers, rules, and optional LLM providers
- parser plugin architecture with a generic parser and Substack plugin
- Google Sheets and local `.xlsx` outputs
- incremental state tracking in JSON
- enrichment pass for page title/meta, dead-link handling, paywall/unreachable markers
- dry-run, date filtering, full reprocess, skip-enrich, enrich-only, config, and verbose flags
- standalone binary build script and documentation for the selected bundling tool
Use SPEC.md as the source of truth. If existing code conflicts with SPEC.md, prefer SPEC.md unless repo-local instructions explicitly require otherwise.
The repository-level working agreements mention PHP tooling, but this project spec is TypeScript/Node.js. Apply the relevant JS quality rules: ESLint `airbnb-base`, Prettier, tests, secure input/output
handling, and CI-style validation. Do not add PHP tooling unless PHP files already exist and require it.
</task>
<goal>
Build a production-quality CLI that meets the SPEC.md requirements, adheres to the working agreements, and can be used by the repository owner to catalog newsletter links from Gmail into Google Sheets with confidence in correctness, safety, and maintainability.
</goal>
<default_follow_through_policy>
Default to the most reasonable low-risk interpretation and keep going.
Only stop to ask when a missing detail changes correctness, safety, external credentials, or an irreversible action.
When external services or credentials are unavailable, implement the integration boundary, tests, mocks, and clear setup docs instead of blocking.
</default_follow_through_policy>
<completeness_contract>
Persist until the task is fully handled end-to-end within the current turn whenever feasible: do not stop at analysis or partial fixes.
Treat the task as incomplete until every major SPEC.md behavior is implemented, tested, documented, or explicitly marked [blocked] with evidence.
Before finishing, reconcile every plan item: Done, Blocked, or Cancelled. Never leave items in-progress.
Do not claim completion until validation has run and failures are fixed or explained with concrete blocker evidence.
</completeness_contract>
<missing_context_gating>
Read SPEC.md and inspect the repository before planning implementation.
Do not guess repository structure, package manager, test framework, or existing scripts. Retrieve them with tools.
If the repo is empty or nearly empty, scaffold a TypeScript CLI project using npm unless an existing package manager is present.
If credentials, live Gmail, Google Sheets, or LLM API keys are missing, use mocks/fakes for automated tests and document the required environment variables and setup.
</missing_context_gating>
<tool_persistence_rules>
Prefer dedicated tools over raw shell where available: rg, read_file/list_dir equivalents, apply_patch, and update_plan.
Use rg or rg --files for search.
Parallelize independent file reads; sequence dependent actions.
Use apply_patch for manual edits.
Keep using tools until you have enough evidence to finish confidently.
</tool_persistence_rules>
<implementation_requirements>
Implement a clean modular architecture, with separate modules for:
- CLI command parsing
- config loading and validation
- Gmail OAuth/auth/client access
- Gmail message fetching by configured label
- HTML parsing and extraction
- noise filtering
- URL normalization, redirect unwrapping, and tracking parameter stripping
- categorization
- LLM provider adapters
- parser plugins
- spreadsheet writers
- enrichment
- state persistence
- logging/progress reporting
Implement provider adapters for:
- Anthropic
- OpenAI
- local/Ollama or LM Studio style endpoints
- OpenAI-compatible endpoints
Implement output writers for:
- Google Sheets API
- local Excel `.xlsx`
Implement tests for core behavior without requiring live external services:
- config validation
- date filter conflict handling
- sheet-name sanitization/truncation
- URL cleanup and tracking parameter stripping
- read-more link merging
- noise filtering
- sponsor detection
- section-header categorization
- fallback rule categorization
- state-file incremental behavior
- dead/paywall/unreachable enrichment handling
- dry-run state/write suppression
- parser plugin selection, including Substack
</implementation_requirements>
<action_safety>
Keep changes tightly scoped to building this CLI.
Avoid unrelated refactors, renames, or cleanup.
Do not run destructive git commands such as reset --hard or checkout -- without explicit approval.
Never commit secrets, tokens, OAuth credentials, spreadsheet IDs, or user data.
Persist tokens only in documented local paths such as ~/.nlc.
Sanitize config and CLI inputs. Escape or safely encode spreadsheet cell values that could become formulas.
Handle critical errors by stopping with a useful message. Handle individual email/link failures by logging and continuing.
</action_safety>
<verification_loop>
Required validations:
- npm install
- npm run lint
- npm run format:check
- npm run typecheck
- npm test
- npm run build
- npm run smoke
If the package scripts do not exist yet, create them.
`npm run build` must compile the TypeScript project and produce the standalone binary or packaged executable artifact described in the docs.
`npm run smoke` must exercise the CLI without live credentials, at minimum:
- `nlc --help`
- `nlc init --help`
- `nlc run --help`
- a dry-run or fixture-backed run path that proves parsing/output orchestration works without mutating real Gmail or Sheets.
Before finalizing, run the required validations.
If a check fails, fix the cause and rerun until green or until a real external blocker remains.
Report any unavailable live-service validation separately from local automated validation.
</verification_loop>
<progress_updates>
For long work, give brief progress updates after meaningful milestones:
- repo inspection complete
- implementation plan formed
- core modules scaffolded
- tests added
- validations running
- final validation result
Keep updates concise and outcome-based.
</progress_updates>
<structured_output_contract>
Final report exactly in this order:
1. Summary: 2-4 bullets describing what was built.
2. Changed files: one line per important file or directory.
3. Validations: each command run and its result.
4. Blockers or residual risks: include only real remaining issues.
5. Next operational steps: credential/setup steps needed for live Gmail or Google Sheets use.
Keep the final report compact and highest-signal first.
</structured_output_contract>
+378
View File
@@ -0,0 +1,378 @@
# Newsletter Link Catalog — Specification
## Overview
A CLI tool that extracts links from newsletters in a designated Gmail folder, categorizes them, enriches them with metadata, and compiles them into a spreadsheet. Each newsletter gets its own sheet, links are organized by issue date and category, and sponsor links are tracked separately.
## Architecture
### Language & Runtime
- **TypeScript/Node.js** — compiled to a standalone binary by the project build script
- CLI tool invoked as `nlc run [flags]`
### Distribution
- Standalone binary — no Node runtime required on the host machine
- Built and packaged via CI or build script
- The build script must document the selected bundling tool and produce the binary from a clean checkout
### Run Modes
- **Manual**: Run `nlc run` on demand with optional date filters
- **Scheduled**: Can be run via cron/Task Scheduler for recurring processing
- Designed for both; no daemon mode required
## Gmail Integration
### Authentication
- **OAuth2 browser flow** — user authorizes via browser, tokens persisted locally
- `nlc init` command walks through OAuth setup interactively
### Scope
- Processes emails from a **single designated Gmail folder/label** (configured in `config.yaml`)
- Does not scan the entire inbox or search by sender patterns
### Email Processing
- **HTML only** — plain-text parts are ignored
- **Image-only emails** (single image, no extractable links) are skipped with a warning logged
- **"View in browser" emails** — if the email contains no content links after noise filtering and contains a mirror link with anchor text matching `view in browser`, `view online`, or `read online`, fetch that mirror URL and extract links from the fetched HTML instead
- Incremental by default: tracks processed Message-IDs in a local state file, only processes new emails
- `--full` flag forces reprocessing of all emails that match the configured label and any date filters
## Link Extraction & Processing
### Extraction Pipeline
1. Fetch emails from the configured Gmail folder (incremental or full)
2. Parse HTML to extract links, section headers, and surrounding text. A section header is the nearest preceding heading-like element (`h1`-`h6`, table row header, or bold standalone line) within the same content block.
3. Filter out noise links: unsubscribe, social footer icons, "share this newsletter" links
4. Unwrap supported tracking redirects and strip configured tracking query parameters — store the normalized destination URL
5. Merge "Read more" links with their preceding content (detected by: consecutive links with the same normalized URL and anchor text matching the configured read-more pattern)
6. Categorize each link (see Categorization section)
7. Write to spreadsheet (see Output section)
### Noise Filtering
The following link types are **excluded** from content sheets:
- Unsubscribe links
- Social media links in footer or sharing blocks
- Links whose anchor text or accessible label matches configured share/forward patterns
- "View in browser" mirror links (content is extracted from the web version instead)
Sponsor/ad links are **not filtered** — they go to a separate sheet when the link is inside a block labeled with configured sponsor markers such as "sponsor", "sponsored", "ad", "advertisement", or "partner".
### URL Handling
- Unwrap HTTP redirects and supported provider redirect URLs up to the configured redirect limit
- Strip configured tracking query parameters, including `utm_*`, `fbclid`, `gclid`, `mc_cid`, `mc_eid`, and provider-specific tracking parameters listed in config
- Store the normalized destination URL after redirect unwrapping and query cleanup
- Dead/broken links (4xx/5xx during enrichment) are written to the "Dead Links" sheet and removed from content sheets when they were already written by an earlier phase or run
### "Read More" Merging
When two consecutive extracted links point to the same normalized URL and one anchor text matches the configured read-more pattern, they are merged into a single entry combining the preceding link title/description with the read-more link URL.
## Categorization
### Strategy: Hybrid
1. **Primary**: Use the newsletter's own section headers (e.g., "Python", "DevOps", "Career") as categories
2. **Fallback**: When section headers aren't available or don't cover a link, use rule-based classification (URL patterns + keywords)
3. **Final fallback**: LLM-based categorization when rules don't match
### Category Taxonomy
- Built-in base taxonomy shipped with the tool for common dev categories (Python, JavaScript, DevOps, Security, etc.)
- User can extend via config with custom categories
- For fallback categorization, the LLM is instructed to prefer configured categories and may create a new category only when no existing category fits
### LLM Provider Support (BYOK)
The tool supports a provider adapter interface and ships adapters for:
- **Claude/Anthropic** — Anthropic API
- **OpenAI/GPT** — OpenAI API
- **Local models** — Ollama, LM Studio
- **OpenAI-compatible endpoints** — Mistral, Groq, Together, etc.
Provider config includes: API key environment variable, base URL when required, model name, and optional provider parameters.
### Newsletter Parsing: Plugin System
- Generic HTML parser as the default
- Platform-specific parsers loaded as plugins (detected by URL patterns or email headers)
- **Substack** shipped as the first plugin — maps Substack-specific HTML structures to the common extracted-link format
- Additional parsers can be added as plugins without modifying core logic
## Output: Spreadsheet
### Supported Formats
- **Google Sheets** — via Google Sheets API (live, shareable, updated by each write run)
- **Local Excel (.xlsx)** — written to disk, can be uploaded manually
Config selects which output(s) to use; both can be active simultaneously.
### Spreadsheet Name
- Fixed name set in `config.yaml` (e.g., "Newsletter Link Catalog")
### Sheet Naming
- Each newsletter gets its own sheet named after the parsed display name from the email's From header
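- Names truncated to fit Google Sheets' 100-character limit; the local Excel (`.xlsx`) output must truncate further, because Excel caps sheet names at 31 characters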
- Characters invalid for Google Sheets or Excel sheet names are replaced with spaces, then repeated whitespace is collapsed
### Content Sheet Columns
Every link occurrence is written as a flat row; blank grouping rows are not used. Fields unavailable from the source are written as empty cells.
| Column | Description |
|---|---|
| Issue Date | Date from email's Date header (overridable per-newsletter) |
| Category | Assigned category (from newsletter sections, rules, or LLM) |
| Link URL | Clean canonical URL after unwrapping and UTM removal |
| Title | Anchor text / headline from the newsletter |
| Description | 1-2 sentence description from the newsletter (if present) |
| Page Title + Meta | `<title>` and meta description from the destination page (enrichment phase) |
| Source Newsletter | Name of the newsletter this link came from |
| Also In | Cross-reference: other newsletters that also mentioned this link |
### Sponsor Sheet (Consolidated)
Single sheet named "Sponsored Links" containing sponsor/ad links from all newsletters:
| Column | Description |
|---|---|
| Newsletter | Which newsletter this sponsor link appeared in |
| Sponsor | Sponsor name (parsed from newsletter) |
| Link | Sponsor's link URL |
| Description | Sponsor description from the newsletter |
### Dead Links Sheet
Single sheet named "Dead Links" for links that returned errors during enrichment:
| Column | Description |
|---|---|
| URL | The clean canonical URL |
| Status | HTTP status or error type (404, 403, timeout, etc.) |
| Source | Newsletter name |
| Date | Issue date |
### Cross-References
- Duplicates across newsletters are kept in their respective sheets (all occurrences preserved)
- The **Also In** column annotates each row with other newsletter issues that mentioned the same normalized URL, formatted as `Newsletter Name (YYYY-MM-DD)` and joined with `; `
- This enables finding cross-newsletter coverage without a separate consolidated sheet
### No "All Links" Master Sheet
Only per-newsletter content sheets, plus the consolidated Sponsor and Dead Links sheets. No "All Links" aggregation sheet.
## Enrichment
### Two-Phase Approach
1. **Phase 1 (Store)**: Extract links from newsletters, categorize, and write to spreadsheet with all available in-newsletter metadata
2. **Phase 2 (Enrich)**: Separate pass to fetch each link's destination page for `<title>` and meta description
Enrichment can be run independently from extraction and spreadsheet writing.
### Enrichment Details
- Configurable concurrency with defaults of 3 parallel requests and 1500 ms delay between batches
- Retries on transient failures
- Dead links (4xx/5xx) are written to the Dead Links sheet and removed from content sheets when they were already written by an earlier phase or run
- Skip pages that redirect to a URL whose path or query contains `login`, `signin`, `subscribe`, or `paywall` — mark with "paywall" status
- Progress bar updates after each completed enrichment request
### Link Liveness
- Dead links are **not included** in content sheets — they go to the Dead Links sheet
- Paywalled links are included in content sheets and the Page Title + Meta column is set to `[paywall]`
- Timeout, DNS, TLS, and network failures are included in content sheets and the Page Title + Meta column is set to `[unreachable: error_type]`
## Processing Model
### Incremental Processing
- Local state file (JSON) tracks processed Message-IDs and enrichment status
- On subsequent runs, only new/unprocessed emails are fetched
- `--full` flag forces reprocessing of all emails that match the configured label and any date filters
- State file location: `~/.nlc/state.json` (or configured path)
### Date Filtering
- `--from YYYY-MM-DD` and `--to YYYY-MM-DD` — absolute date range
- `--last N` (e.g., `--last 30d`, `--last 7d`) — relative date range
- Date filters apply before the incremental processed-message check
- If both `--last` and `--from`/`--to` are provided, the CLI exits with a config error
### Dry Run
- `--dry-run` processes the most recent N emails (default: 5) without writing to the spreadsheet
- Shows what would be extracted, categorized, and written
- Dry run does not update the state file or call destination pages for enrichment unless `--dry-run` is combined with `--enrich-only`
### Error Handling
- **Critical errors** (Gmail auth failure, spreadsheet write failure, config errors) → stop execution
- **Individual errors** (one link fails to enrich, one email fails to parse) → log and continue
- Summary at end includes error counts and details
### Progress & Logging
- Progress bar during processing (emails fetched, links extracted, enrichment status)
- Summary stats at the end: newsletters processed, links extracted, duplicates found, dead links, sponsors, errors
## CLI Interface
### Commands
```
nlc init # Interactive setup: OAuth, config file, connectivity test
nlc run [flags] # Main processing command
```
### `nlc run` Flags
| Flag | Description | Default |
|---|---|---|
| `--full` | Reprocess all emails, not just new ones | false |
| `--dry-run [N]` | Process most recent N emails without writing to sheet | 5 |
| `--from YYYY-MM-DD` | Process emails from this date | (none) |
| `--to YYYY-MM-DD` | Process emails up to this date | (none) |
| `--last N` | Process emails from last N days (e.g., `--last 30d`) | (none) |
| `--skip-enrich` | Skip the enrichment phase (only extract + categorize) | false |
| `--enrich-only` | Only run enrichment on already-extracted links | false |
| `--config PATH` | Path to config file | `./config.yaml` |
| `--verbose` | Detailed per-email and per-link output | false |
## Configuration
### File Format: YAML
Location: `./config.yaml` (overridable with `--config`)
### Sample Structure
```yaml
# Gmail settings
gmail:
folder: "Newsletters" # Gmail label/folder to process
credentials: "~/.nlc/gmail-credentials.json"
token: "~/.nlc/gmail-token.json"
# Output settings
output:
name: "Newsletter Link Catalog" # Spreadsheet name
sheets_api:
enabled: true
credentials: "~/.nlc/sheets-credentials.json"
token: "~/.nlc/sheets-token.json"
excel:
enabled: true
path: "./output/newsletter-catalog.xlsx"
# Newsletter identification
newsletters:
# Manual overrides for parsed display names
"alex@bytebytego.com":
display_name: "ByteByteGo"
"dan@techtakesweekly.com":
display_name: "Tech Takes Weekly"
# Link processing
links:
unwrap_redirects: true
strip_utm: true
tracking_params:
- "utm_*"
- "fbclid"
- "gclid"
- "mc_cid"
- "mc_eid"
redirect_limit: 5
read_more_pattern: "(?i)^(read more|continue reading|learn more)$"
share_patterns:
- "(?i)share"
- "(?i)forward to a friend"
sponsor_markers:
- "(?i)sponsor"
- "(?i)sponsored"
- "(?i)advertisement"
- "(?i)partner"
filter_unsubscribe: true
filter_social_footer: true
filter_share_links: true
merge_read_more: true
# Categorization
categories:
# Built-in taxonomy is used by default; extend here
custom:
- "AI/ML"
- "Career"
- "Rust"
# LLM settings for category inference
llm:
provider: "anthropic" # anthropic | openai | local | openai-compatible
model: "claude-sonnet-4-6"
api_key_env: "ANTHROPIC_API_KEY"
base_url: null # for local/openai-compatible
failure_category: "Uncategorized"
# Enrichment
enrichment:
enabled: true
concurrency: 3
delay_ms: 1500
retries: 2
timeout_ms: 10000
# Rate limiting (applies to both Gmail API and enrichment)
rate_limit:
gmail_qps: 5 # queries per second to Gmail API
link_concurrency: 3 # parallel link fetches
# State
state_file: "~/.nlc/state.json"
# Parsing plugins
plugins:
substack:
enabled: true
```
### Issue Date Override
For newsletters where the email arrival date doesn't match the issue date, overrides can be configured:
```yaml
newsletters:
"sender@domain.com":
display_name: "Newsletter Name"
date_override: "subject" # Parse date from subject line
date_format: "%B %d, %Y" # Expected date format in subject
```
## Data Flow
```
┌─────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐
│ Gmail API │────▶│ Parse HTML │────▶│ Categorize │────▶│ Write Sheet │
│ (fetch) │ │ + Extract │ │ (hybrid) │ │ (Phase 1) │
└─────────────┘ └──────────────┘ └──────────────┘ └──────────────┘
│ │
▼ ▼
┌──────────────┐ ┌──────────────┐
│ State File │ │ Enrichment │
│ (processed │ │ (Phase 2) │
│ tracking) │ │ Page titles │
└──────────────┘ └──────────────┘
```
## Edge Cases
| Scenario | Behavior |
|---|---|
| Email is a single image with no links | Skip with warning, log to state |
| "View in browser" link instead of content | Fetch the first matching mirror link, extract links from that HTML |
| Same link in multiple newsletters | Keep all occurrences, cross-reference via "Also In" column |
| Same link multiple times in one issue | Deduplicate per-issue; single row per unique URL |
| Link returns 4xx/5xx during enrichment | Move to Dead Links sheet |
| Link is paywalled/auth-required | Include in content sheet, mark Page Title + Meta as "[paywall]" |
| Link times out or has a network error | Include in content sheet, mark Page Title + Meta as "[unreachable: error_type]" |
| Newsletter name > 100 chars | Truncate for sheet name |
| Sheet already exists for newsletter | Append new rows, don't overwrite existing data |
| Gmail API rate limit | Retry with exponential backoff |
| OAuth token expired | Auto-refresh, re-prompt if refresh fails |
| Newsletter format changes | Parser falls back to generic HTML extraction |
## Setup & First Run
1. **`nlc init`** — Interactive walkthrough:
- Authenticate with Gmail (OAuth browser flow)
- Authenticate with Google Sheets (if using Sheets output)
- Select the Gmail folder/label to process
- Configure output location
- Test connectivity
- Generate `config.yaml`
2. **`nlc run --dry-run`** — Test with 5 most recent emails
3. **`nlc run`** — Full processing run
4. **`nlc run --enrich-only`** — Enrich previously extracted links with page titles
+368
View File
@@ -0,0 +1,368 @@
# Newsletter Link Catalog — Specification
## Overview
A CLI tool that extracts links from newsletters in a designated Gmail folder, categorizes them, enriches them with metadata, and compiles them into a spreadsheet. Each newsletter gets its own sheet, links are organized by issue date and category, and sponsor links are tracked separately.
## Architecture
### Language & Runtime
- **TypeScript/Node.js** — bundled with `tsup` and packaged as a standalone binary via `@yao-pkg/pkg`
- CLI tool invoked as `nlc run [flags]`
### Distribution
- Standalone binary — no Node runtime required on the host machine
- Built and packaged via CI or build script
### Run Modes
- **Manual**: Run `nlc run` on demand with optional date filters
- **Scheduled**: Can be run via cron/Task Scheduler for recurring processing
- Designed for both; no daemon mode required
## Gmail Integration
### Authentication
- **OAuth2 browser flow** — user authorizes via browser, tokens persisted locally
- `nlc init` command walks through OAuth setup interactively
### Scope
- Processes emails from a **single designated Gmail folder/label** (configured in `config.yaml`)
- Does not scan the entire inbox or search by sender patterns
### Email Processing
- **HTML only** — plain-text parts are ignored
- **Image-only emails** (single image, no extractable links) are skipped with a warning logged
- **"View in browser" emails** — fetches the web version's HTML and extracts links from that instead
- Incremental by default: tracks processed Message-IDs in a local state file, only processes new emails
- `--full` flag forces reprocessing of all emails
## Link Extraction & Processing
### Extraction Pipeline
1. Fetch emails from the configured Gmail folder (incremental or full)
2. Parse HTML to extract links, section headers, and surrounding text
3. Filter out noise links: unsubscribe, social footer icons, "share this newsletter" links
4. Unwrap tracking redirects and strip UTM parameters — store only the clean canonical URL
5. Merge "Read more" links with their preceding content (detected by: same URL + "read more" anchor text)
6. Categorize each link (see Categorization section)
7. Write to spreadsheet (see Output section)
### Noise Filtering
The following link types are **excluded** from content sheets:
- Unsubscribe links
- Social media footer links (Twitter, LinkedIn, etc.)
- "Share this newsletter" / "Forward to a friend" links
- "View in browser" mirror links (content is extracted from the web version instead)
Sponsor/ad links are **not filtered** — they go to a separate sheet.
### URL Handling
- Unwrap all tracking redirects (Mailchimp, Substack, etc.)
- Strip UTM parameters and other tracking query params
- Store only the clean canonical URL
- Dead/broken links (4xx/5xx during enrichment) are moved to a separate "Dead Links" sheet
### "Read More" Merging
When two consecutive elements point to the same URL and one has "read more" (or similar) anchor text, they are merged into a single entry combining the preceding description text and the link.
## Categorization
### Strategy: Hybrid
1. **Primary**: Use the newsletter's own section headers (e.g., "Python", "DevOps", "Career") as categories
2. **Fallback**: When section headers aren't available or don't cover a link, use rule-based classification (URL patterns + keywords)
3. **Final fallback**: LLM-based categorization when rules don't match
### Category Taxonomy
- **LLM-generated** by default — the model assigns categories based on link content
- Built-in base taxonomy shipped with the tool for common dev categories (Python, JavaScript, DevOps, Security, etc.)
- User can extend via config with custom categories
- LLM is instructed to prefer existing categories and only create new ones when nothing fits
### LLM Provider Support (BYOK)
All providers supported, configurable in `config.yaml`:
- **Claude/Anthropic** — Anthropic API
- **OpenAI/GPT** — OpenAI API
- **Local models** — Ollama, LM Studio
- **OpenAI-compatible endpoints** — Mistral, Groq, Together, etc.
Provider config includes: API key, base URL, model name, and optional parameters.
### Newsletter Parsing: Plugin System
- Generic HTML parser as the default
- Platform-specific parsers loaded as plugins (detected by URL patterns or email headers)
- **Substack** shipped as the first plugin — uses Substack's predictable HTML structure for more reliable extraction
- Additional parsers can be added as plugins without modifying core logic
## Output: Spreadsheet
### Supported Formats
- **Google Sheets** — via Google Sheets API (live, shareable, auto-updated)
- **Local Excel (.xlsx)** — written to disk, can be uploaded manually
Config selects which output(s) to use; both can be active simultaneously.
### Spreadsheet Name
- Fixed name set in `config.yaml` (e.g., "Newsletter Link Catalog")
### Sheet Naming
- Each newsletter gets its own sheet named after the parsed display name from the email's From header
- Names truncated to fit Google Sheets' 100-character limit
- Special characters replaced as needed for sheet name validity
### Content Sheet Columns
Every row is fully populated (flat table — no blank cells for grouping):
| Column | Description |
|---|---|
| Issue Date | Date from email's Date header (overridable per-newsletter) |
| Category | Assigned category (from newsletter sections, rules, or LLM) |
| Link URL | Clean canonical URL after unwrapping and UTM removal |
| Title | Anchor text / headline from the newsletter |
| Description | 1-2 sentence description from the newsletter (if present) |
| Page Title + Meta | `<title>` and meta description from the destination page (enrichment phase) |
| Source Newsletter | Name of the newsletter this link came from |
| Also In | Cross-reference: other newsletters that also mentioned this link |
### Sponsor Sheet (Consolidated)
Single sheet named "Sponsored Links" containing sponsor/ad links from all newsletters:
| Column | Description |
|---|---|
| Newsletter | Which newsletter this sponsor link appeared in |
| Sponsor | Sponsor name (parsed from newsletter) |
| Link | Sponsor's link URL |
| Description | Sponsor description from the newsletter |
### Dead Links Sheet
Single sheet named "Dead Links" for links that returned errors during enrichment:
| Column | Description |
|---|---|
| URL | The clean canonical URL |
| Status | HTTP status or error type (404, 403, timeout, etc.) |
| Source | Newsletter name |
| Date | Issue date |
### Cross-References
- Duplicates across newsletters are kept in their respective sheets (all occurrences preserved)
- The **Also In** column annotates each row with which other newsletters mentioned the same link and when (e.g., "TLDR Web Dev (Mar 5)")
- This enables finding cross-newsletter coverage without a separate consolidated sheet
### No "All Links" Master Sheet
Only per-newsletter content sheets, plus the consolidated Sponsor and Dead Links sheets. No "All Links" aggregation sheet.
## Enrichment
### Two-Phase Approach
1. **Phase 1 (Store)**: Extract links from newsletters, categorize, and write to spreadsheet with all available in-newsletter metadata
2. **Phase 2 (Enrich)**: Separate pass to fetch each link's destination page for `<title>` and meta description
This keeps the initial run fast and allows enrichment to be run independently.
### Enrichment Details
- Configurable concurrency (safe defaults: 3-5 parallel, 1-2s delay between batches)
- Retries on transient failures
- Dead links (4xx/5xx) moved to Dead Links sheet
- Skip paywalled/auth-required pages (detected by login redirects) — mark with "paywall" status
- Progress bar shows enrichment status in real-time
### Link Liveness
- Dead links are **not included** in content sheets — they go to the Dead Links sheet
- Paywalled/unreachable links are included in content sheets but flagged in the Page Title + Meta column
## Processing Model
### Incremental Processing
- Local state file (JSON) tracks processed Message-IDs and enrichment status
- On subsequent runs, only new/unprocessed emails are fetched
- `--full` flag forces reprocessing of all emails
- State file location: `~/.nlc/state.json` (or configured path)
### Date Filtering
- `--from YYYY-MM-DD` and `--to YYYY-MM-DD` — absolute date range
- `--last N` (e.g., `--last 30d`, `--last 7d`) — relative date range
- Can be combined with incremental processing
### Dry Run
- `--dry-run [N]` processes the most recent N emails (default: 5) without writing to the spreadsheet
- Shows what would be extracted, categorized, and written
- Useful for testing config changes and parser tweaks
### Error Handling
- **Critical errors** (Gmail auth failure, spreadsheet write failure, config errors) → stop execution
- **Individual errors** (one link fails to enrich, one email fails to parse) → log and continue
- Summary at end includes error counts and details
### Progress & Logging
- Progress bar during processing (emails fetched, links extracted, enrichment status)
- Summary stats at the end: newsletters processed, links extracted, duplicates found, dead links, sponsors, errors
## CLI Interface
### Commands
```
nlc init # Interactive setup: OAuth, config file, connectivity test
nlc run [flags] # Main processing command
```
### `nlc run` Flags
| Flag | Description | Default |
|---|---|---|
| `--full` | Reprocess all emails, not just new ones | false |
| `--dry-run [N]` | Process most recent N emails without writing to sheet | 5 |
| `--from YYYY-MM-DD` | Process emails from this date | (none) |
| `--to YYYY-MM-DD` | Process emails up to this date | (none) |
| `--last N` | Process emails from last N days (e.g., `--last 30d`) | (none) |
| `--skip-enrich` | Skip the enrichment phase (only extract + categorize) | false |
| `--enrich-only` | Only run enrichment on already-extracted links | false |
| `--config PATH` | Path to config file | `./config.yaml` |
| `--verbose` | Detailed per-email and per-link output | false |
## Configuration
### File Format: YAML
Location: `./config.yaml` (overridable with `--config`)
### Sample Structure
```yaml
# Gmail settings
gmail:
folder: "Newsletters" # Gmail label/folder to process
credentials: "~/.nlc/gmail-credentials.json"
token: "~/.nlc/gmail-token.json"
# Output settings
output:
name: "Newsletter Link Catalog" # Spreadsheet name
sheets_api:
enabled: true
credentials: "~/.nlc/sheets-credentials.json"
token: "~/.nlc/sheets-token.json"
excel:
enabled: true
path: "./output/newsletter-catalog.xlsx"
# Newsletter identification
newsletters:
# Manual overrides for parsed display names
# sender_pattern: "display_name"
"alex@bytebytego.com": "ByteByteGo"
"dan@techtakesweekly.com": "Tech Takes Weekly"
# Link processing
links:
unwrap_redirects: true
strip_utm: true
filter_unsubscribe: true
filter_social_footer: true
filter_share_links: true
merge_read_more: true
# Categorization
categories:
# Built-in taxonomy is used by default; extend here
custom:
- "AI/ML"
- "Career"
- "Rust"
# LLM settings for category inference
llm:
provider: "anthropic" # anthropic | openai | local | openai-compatible
model: "claude-sonnet-4-6"
api_key_env: "ANTHROPIC_API_KEY" # name of the environment variable that holds the API key
base_url: null # for local/openai-compatible
fallback_to_rules: true # if LLM fails, use rule-based
# Enrichment
enrichment:
enabled: true
concurrency: 3
delay_ms: 1500
retries: 2
timeout_ms: 10000
# Rate limiting (applies to both Gmail API and enrichment)
rate_limit:
gmail_qps: 5 # queries per second to Gmail API
link_concurrency: 3 # parallel link fetches
# State
state_file: "~/.nlc/state.json"
# Parsing plugins
plugins:
substack:
enabled: true
```
### Issue Date Override
For newsletters where the email arrival date doesn't match the issue date, overrides can be configured:
```yaml
newsletters:
"sender@domain.com":
display_name: "Newsletter Name"
date_override: "subject" # Parse date from subject line
date_format: "%B %d, %Y" # Expected date format in subject
```
## Data Flow
```
┌─────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐
│ Gmail API │────▶│ Parse HTML │────▶│ Categorize │────▶│ Write Sheet │
│ (fetch) │ │ + Extract │ │ (hybrid) │ │ (Phase 1) │
└─────────────┘ └──────────────┘ └──────────────┘ └──────────────┘
│ │
▼ ▼
┌──────────────┐ ┌──────────────┐
│ State File │ │ Enrichment │
│ (processed │ │ (Phase 2) │
│ tracking) │ │ Page titles │
└──────────────┘ └──────────────┘
```
## Edge Cases
| Scenario | Behavior |
|---|---|
| Email is a single image with no links | Skip with warning, log to state |
| "View in browser" link instead of content | Fetch web version HTML, extract links from that |
| Same link in multiple newsletters | Keep all occurrences, cross-reference via "Also In" column |
| Same link multiple times in one issue | Deduplicate per-issue; single row per unique URL |
| Link returns 4xx/5xx during enrichment | Move to Dead Links sheet |
| Link is paywalled/auth-required | Include in content sheet, mark Page Title + Meta as "[paywall]" |
| Newsletter name > 100 chars | Truncate for sheet name |
| Sheet already exists for newsletter | Append new rows, don't overwrite existing data |
| Gmail API rate limit | Retry with exponential backoff |
| OAuth token expired | Auto-refresh, re-prompt if refresh fails |
| Newsletter format changes | Parser falls back to generic HTML extraction |
## Setup & First Run
1. **`nlc init`** — Interactive walkthrough:
- Authenticate with Gmail (OAuth browser flow)
- Authenticate with Google Sheets (if using Sheets output)
- Select the Gmail folder/label to process
- Configure output location
- Test connectivity
- Generate `config.yaml`
2. **`nlc run --dry-run`** — Test with 5 most recent emails
3. **`nlc run`** — Full processing run
4. **`nlc run --enrich-only`** — Enrich previously extracted links with page titles
## Future Considerations
These are **not** in scope for v1 but noted for potential future work:
- Search/filter functionality within the spreadsheet
- Web UI for browsing the catalog
- Email forwarding as an alternative to Gmail API access
- Automatic category taxonomy refinement based on accumulated data
- Plugin system for additional newsletter platforms beyond Substack
- Notification on new newsletter processing
+9111
View File
File diff suppressed because it is too large Load Diff
+53
View File
@@ -0,0 +1,53 @@
{
"name": "newsletter-link-catalog",
"version": "0.1.0",
"description": "CLI for cataloging newsletter links from Gmail into spreadsheets.",
"type": "module",
"bin": {
"nlc": "./dist/index.js"
},
"scripts": {
"build": "tsup && node scripts/make-executable.mjs && node scripts/package-binary.mjs",
"dev": "tsx src/index.ts",
"lint": "eslint . --ext .ts,.js",
"format": "prettier --write .",
"format:check": "prettier --check .",
"typecheck": "tsc --noEmit",
"test": "vitest run",
"smoke": "node scripts/smoke.mjs"
},
"keywords": [
"newsletter",
"gmail",
"sheets",
"cli"
],
"author": "",
"license": "MIT",
"dependencies": {
"@commander-js/extra-typings": "^12.1.0",
"cheerio": "^1.0.0",
"commander": "^12.1.0",
"googleapis": "^140.0.1",
"open": "^10.1.0",
"ora": "^8.1.1",
"xlsx": "^0.18.5",
"yaml": "^2.5.1",
"zod": "^3.23.8"
},
"devDependencies": {
"@types/node": "^22.9.0",
"@typescript-eslint/eslint-plugin": "^8.11.0",
"@typescript-eslint/parser": "^8.11.0",
"@yao-pkg/pkg": "^6.19.0",
"eslint": "^8.57.1",
"eslint-config-airbnb-base": "^15.0.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-import": "^2.31.0",
"prettier": "^3.3.3",
"tsup": "^8.3.5",
"tsx": "^4.19.2",
"typescript": "^5.6.3",
"vitest": "^2.1.4"
}
}
+3
View File
@@ -0,0 +1,3 @@
import { chmod } from 'node:fs/promises';
// Best-effort: give the bundled CLI entry point mode 0o755.
// Any failure from chmod is deliberately ignored.
try {
  await chmod('dist/index.js', 0o755);
} catch {
  // Intentionally ignored.
}
+18
View File
@@ -0,0 +1,18 @@
import { execFile } from 'node:child_process';
import { platform } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
// Packages dist/index.js into a standalone executable using the locally
// installed @yao-pkg/pkg, run with the same Node binary as this script.
const runFile = promisify(execFile);
const hostPlatform = platform();
const onWindows = hostPlatform === 'win32';

// pkg's CLI entry script inside node_modules.
const pkgCli = join(process.cwd(), 'node_modules', '@yao-pkg', 'pkg', 'lib-es5', 'bin.js');

// pkg target for the host platform; Linux is the fallback. Always x64.
let pkgTarget = 'node22-linux-x64';
if (onWindows) {
  pkgTarget = 'node22-win-x64';
} else if (hostPlatform === 'darwin') {
  pkgTarget = 'node22-macos-x64';
}

const binaryPath = join('dist', onWindows ? 'nlc.exe' : 'nlc');
const pkgArgs = [pkgCli, 'dist/index.js', '--targets', pkgTarget, '--output', binaryPath];

await runFile(process.execPath, pkgArgs, { cwd: process.cwd() });
+37
View File
@@ -0,0 +1,37 @@
import { execFile } from 'node:child_process';
import { mkdtemp, writeFile, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
// Smoke test: runs the bundled CLI and the packaged binary against a throwaway
// config in a temporary directory, then removes the directory.
const exec = promisify(execFile);
const cli = join(process.cwd(), 'dist', 'index.js');
const binary = join(process.cwd(), 'dist', process.platform === 'win32' ? 'nlc.exe' : 'nlc');
const dir = await mkdtemp(join(tmpdir(), 'nlc-smoke-'));

try {
  const config = join(dir, 'config.yaml');

  // YAML derives structure from indentation: nested keys must be indented,
  // otherwise `gmail:` and `output:` parse as empty values and `folder`, `name`
  // and `excel` become unrelated top-level keys.
  // JSON.stringify produces a double-quoted scalar, which is also valid YAML.
  const yaml = [
    'gmail:',
    '  folder: Newsletters',
    'output:',
    '  name: Smoke Catalog',
    '  excel:',
    '    enabled: true',
    `    path: ${JSON.stringify(join(dir, 'catalog.xlsx'))}`,
    `state_file: ${JSON.stringify(join(dir, 'state.json'))}`,
    ''
  ].join('\n');
  await writeFile(config, yaml);

  // Use the Node binary running this script rather than whatever `node` is on PATH.
  const node = process.execPath;
  await exec(node, [cli, '--help']);
  await exec(binary, ['--help']);
  await exec(node, [cli, 'init', '--help']);
  await exec(node, [cli, 'run', '--help']);
  // NLC_FIXTURE=1 is passed so the dry run has input without Gmail access.
  await exec(node, [cli, 'run', '--config', config, '--dry-run'], {
    env: { ...process.env, NLC_FIXTURE: '1' }
  });
  console.log('Smoke checks passed');
} finally {
  await rm(dir, { force: true, recursive: true });
}
+52
View File
@@ -0,0 +1,52 @@
import { ExtractedLink } from '../parsing/types.js';
/**
 * Pluggable categorizer that `Categorizer` consults when neither the link's
 * section nor the keyword rules produce a category.
 */
export interface CategoryProvider {
/** Resolves to a category for `link`, or `undefined` when it has no answer. */
categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined>;
}
// Base category list. The Categorizer constructor merges it with any
// caller-supplied categories and passes the result to the provider.
const builtIn = [
'Python',
'JavaScript',
'DevOps',
'Security',
'AI/ML',
'Career',
'Rust',
'Uncategorized'
];
/**
 * Ordered keyword rules: the first pattern that matches decides the category.
 *
 * The short tokens (`ai`, `ml`, `llm`, `rust`, `cargo`) are anchored on word
 * boundaries. Unanchored, `ai` matches "email"/"maintain", `ml` matches every
 * ".html" URL and `rust` matches "trust", which misclassifies ordinary links.
 */
const rules: Array<[RegExp, string]> = [
  [/python|django|flask/i, 'Python'],
  [/javascript|typescript|node|react/i, 'JavaScript'],
  [/kubernetes|k8s|docker|devops|terraform/i, 'DevOps'],
  [/security|vulnerability|cve/i, 'Security'],
  [/\b(?:ai|llms?|ml)\b|machine learning/i, 'AI/ML'],
  [/career|interview|hiring/i, 'Career'],
  [/\b(?:rust(?:lang|aceans?)?|cargo)\b/i, 'Rust']
];
/**
 * Assigns a category to an extracted link using, in order: the link's own
 * section header, the keyword `rules`, and finally the optional provider.
 */
export class Categorizer {
  /** Built-in categories followed by caller-supplied ones, de-duplicated in first-seen order. */
  private readonly categories: string[];

  /**
   * @param categories Extra categories offered to the provider alongside the built-in ones.
   * @param provider Optional fallback, consulted only when section and rules yield nothing.
   * @param failureCategory Category returned when no strategy produces a usable answer.
   */
  public constructor(
    categories: string[] = [],
    private readonly provider?: CategoryProvider,
    private readonly failureCategory = 'Uncategorized'
  ) {
    this.categories = [...new Set([...builtIn, ...categories])];
  }

  /**
   * Resolves the category for `link`.
   *
   * Errors thrown by the provider are not caught here; they reject the
   * returned promise.
   *
   * @returns The trimmed section name, the first matching rule's category, the
   * provider's trimmed answer, or `failureCategory`.
   */
  public async categorize(link: ExtractedLink): Promise<string> {
    const section = link.section?.trim();
    if (section) {
      return section;
    }

    const haystack = `${link.title} ${link.description ?? ''} ${link.url}`;
    const matched = rules.find(([pattern]) => pattern.test(haystack));
    if (matched) {
      return matched[1];
    }

    // `??` alone would let an empty or whitespace-only provider answer through
    // as the category; treat a blank answer the same as no answer.
    const suggestion = (await this.provider?.categorize(link, this.categories))?.trim();
    return suggestion ? suggestion : this.failureCategory;
  }
}
+21
View File
@@ -0,0 +1,21 @@
/** Date-related flags of `nlc run`, as raw strings from the command line. */
export interface DateFlags {
  from?: string;
  to?: string;
  last?: string;
}

/** Returns true when `value` (already known to be `YYYY-MM-DD`) names a real calendar date. */
function isCalendarDate(value: string): boolean {
  const [year, month, day] = value.split('-').map(Number);
  const probe = new Date(0);
  // setUTCFullYear avoids Date.UTC's remapping of years 0-99 to 1900-1999.
  probe.setUTCFullYear(year, month - 1, day);
  // Out-of-range parts roll over (Feb 30 -> Mar 1), so a round-trip mismatch
  // means the input was not a real date.
  return (
    probe.getUTCFullYear() === year &&
    probe.getUTCMonth() === month - 1 &&
    probe.getUTCDate() === day
  );
}

/**
 * Validates the date filter flags before a run.
 *
 * @param flags Raw flag values; absent flags are skipped.
 * @throws Error when `--last` is combined with `--from`/`--to`, when a date is
 * not `YYYY-MM-DD` or is not a real calendar date, when `--from` is later than
 * `--to`, or when `--last` is not of the form `<digits>d`.
 */
export function validateDateFilters(flags: DateFlags): void {
  if (flags.last && (flags.from || flags.to)) {
    throw new Error('--last cannot be combined with --from or --to');
  }

  for (const [name, value] of Object.entries({ from: flags.from, to: flags.to })) {
    if (!value) {
      continue;
    }
    if (!/^\d{4}-\d{2}-\d{2}$/.test(value)) {
      throw new Error(`--${name} must use YYYY-MM-DD`);
    }
    if (!isCalendarDate(value)) {
      throw new Error(`--${name} is not a valid calendar date: ${value}`);
    }
  }

  // Zero-padded ISO dates sort lexicographically in chronological order.
  if (flags.from && flags.to && flags.from > flags.to) {
    throw new Error('--from must not be later than --to');
  }

  if (flags.last && !/^\d+d$/.test(flags.last)) {
    throw new Error('--last must look like 30d');
  }
}
+86
View File
@@ -0,0 +1,86 @@
import { Command, Option } from 'commander';
import { writeFile } from 'node:fs/promises';
import { loadConfig } from '../config/config.js';
import { ExcelWriter } from '../output/excel.js';
import { runCatalog } from '../run/runCatalog.js';
import { validateDateFilters } from './flags.js';
/**
 * Starter configuration written by `nlc init`.
 *
 * Nested keys are indented by two spaces. YAML derives structure from
 * indentation, so without it `gmail:` and `output:` parse as empty values and
 * the generated file fails config validation on the next `nlc run`.
 */
const sampleConfig = `gmail:
  folder: Newsletters
output:
  name: Newsletter Link Catalog
  excel:
    enabled: true
    path: ./output/newsletter-catalog.xlsx
`;
/**
 * Parses the optional count given to `--dry-run [count]`.
 *
 * @throws Error when the value is not a positive integer, so a typo such as
 * `--dry-run five` fails immediately instead of passing `NaN` into the run.
 */
function parseDryRunCount(value: string): number {
  const count = Number(value);
  if (!Number.isInteger(count) || count <= 0) {
    throw new Error(`--dry-run count must be a positive integer, got "${value}"`);
  }
  return count;
}

/**
 * Builds the `nlc` command-line program with its `init` and `run` subcommands.
 *
 * - `init` writes the starter config to `--config` and refuses to overwrite an
 *   existing file.
 * - `run` validates the date flags, loads the config and calls `runCatalog`.
 *   Input messages come from the built-in fixture when `NLC_FIXTURE=1`;
 *   otherwise the run receives an empty list.
 */
export function createProgram(): Command {
  const program = new Command();
  program.name('nlc').description('Newsletter Link Catalog').version('0.1.0');

  program
    .command('init')
    .description('Create a starter config and document OAuth credential paths')
    .option('--config <path>', 'Path to write config', './config.yaml')
    .action(async (options) => {
      // The 'wx' flag makes the write fail rather than overwrite an existing config.
      await writeFile(options.config, sampleConfig, { flag: 'wx' }).catch(
        async (error: NodeJS.ErrnoException) => {
          if (error.code === 'EEXIST') {
            throw new Error(`${options.config} already exists`);
          }
          throw error;
        }
      );
      console.log(`Wrote ${options.config}. Add OAuth JSON files under ~/.nlc before live runs.`);
    });

  program
    .command('run')
    .description('Process configured Gmail newsletter folder')
    .option('--full', 'Reprocess matching messages')
    .addOption(
      // A bare `--dry-run` uses the preset of 5; an explicit count is validated.
      new Option('--dry-run [count]', 'Process without writing state or output')
        .argParser(parseDryRunCount)
        .preset(5)
    )
    .option('--from <date>', 'Process from YYYY-MM-DD')
    .option('--to <date>', 'Process to YYYY-MM-DD')
    .option('--last <range>', 'Process last range such as 30d')
    .option('--skip-enrich', 'Skip enrichment')
    .option('--enrich-only', 'Only run enrichment on stored links')
    .option('--config <path>', 'Config path', './config.yaml')
    .option('--verbose', 'Verbose logging')
    .action(async (options) => {
      validateDateFilters(options);
      const config = await loadConfig(options.config);
      const writers = config.output.excel.enabled
        ? [new ExcelWriter(config.output.excel.path)]
        : [];
      const messages = process.env.NLC_FIXTURE === '1' ? fixtureMessages() : [];
      const summary = await runCatalog({
        config,
        messages,
        writers,
        dryRun: options.dryRun,
        full: options.full,
        skipEnrich: options.skipEnrich,
        enrichOnly: options.enrichOnly,
        verbose: options.verbose
      });
      console.log(JSON.stringify(summary, null, 2));
    });

  return program;
}
/**
 * Builds the single canned newsletter message used when `NLC_FIXTURE=1`.
 * Its date is the moment of the call, as an ISO-8601 string.
 */
function fixtureMessages() {
  const receivedAt = new Date().toISOString();
  const body =
    '<h2>JavaScript</h2><p><a href="https://example.com/post?utm_source=fixture">Fixture article</a></p>';
  const fixture = {
    id: 'fixture-1',
    messageId: '<fixture-1>',
    from: 'Fixture Weekly <fixture@example.com>',
    date: receivedAt,
    html: body
  };
  return [fixture];
}
+129
View File
@@ -0,0 +1,129 @@
import { readFile } from 'node:fs/promises';
import { homedir } from 'node:os';
import { resolve } from 'node:path';
import YAML from 'yaml';
import { z } from 'zod';
/**
 * Schema for the `output` config section: a non-empty spreadsheet name plus
 * the optional `sheetsApi` and `excel` writer settings. Both writers default
 * to disabled when their object is present but `enabled` is omitted.
 */
const outputSchema = z.object({
name: z.string().min(1),
sheetsApi: z
.object({
enabled: z.boolean().default(false),
credentials: z.string().optional(),
token: z.string().optional(),
spreadsheetId: z.string().optional()
})
.optional(),
excel: z
.object({
enabled: z.boolean().default(false),
path: z.string().default('./output/newsletter-catalog.xlsx')
})
.optional()
});
/**
 * Schema for the whole application config, with camelCase keys.
 * Only `gmail.folder` and `output.name` are required; every other section has
 * defaults. The final transform guarantees `output.sheetsApi` and
 * `output.excel` are always objects, so callers can read `.enabled` directly.
 */
const configSchema = z
.object({
gmail: z.object({
folder: z.string().min(1),
credentials: z.string().default('~/.nlc/gmail-credentials.json'),
token: z.string().default('~/.nlc/gmail-token.json')
}),
output: outputSchema,
// Free-form record; the values are not validated here.
newsletters: z.record(z.string(), z.any()).default({}),
links: z
.object({
unwrapRedirects: z.boolean().default(true),
stripUtm: z.boolean().default(true),
trackingParams: z
.array(z.string())
.default(['utm_*', 'fbclid', 'gclid', 'mc_cid', 'mc_eid']),
redirectLimit: z.number().int().positive().default(5),
// NOTE(review): the '(?i)' prefix in the pattern defaults below is not valid
// JavaScript RegExp syntax. Presumably the consumer strips it and applies the
// `i` flag before compiling; confirm against the code that reads these.
readMorePattern: z.string().default('(?i)^(read more|continue reading|learn more)$'),
sharePatterns: z.array(z.string()).default(['(?i)share', '(?i)forward to a friend']),
sponsorMarkers: z
.array(z.string())
.default(['(?i)sponsor', '(?i)sponsored', '(?i)advertisement', '(?i)partner']),
filterUnsubscribe: z.boolean().default(true),
filterSocialFooter: z.boolean().default(true),
filterShareLinks: z.boolean().default(true),
mergeReadMore: z.boolean().default(true)
})
.default({}),
categories: z
.object({
custom: z.array(z.string()).default([]),
llm: z
.object({
provider: z
.enum(['anthropic', 'openai', 'local', 'openai-compatible'])
.default('anthropic'),
model: z.string().default('claude-sonnet-4-6'),
apiKeyEnv: z.string().default('ANTHROPIC_API_KEY'),
baseUrl: z.string().nullable().optional(),
failureCategory: z.string().default('Uncategorized')
})
.default({})
})
.default({}),
enrichment: z
.object({
enabled: z.boolean().default(true),
concurrency: z.number().int().positive().default(3),
delayMs: z.number().int().nonnegative().default(1500),
retries: z.number().int().nonnegative().default(2),
timeoutMs: z.number().int().positive().default(10000)
})
.default({}),
rateLimit: z
.object({
gmailQps: z.number().positive().default(5),
linkConcurrency: z.number().int().positive().default(3)
})
.default({}),
stateFile: z.string().default('~/.nlc/state.json'),
plugins: z.record(z.string(), z.any()).default({})
})
// Fill in disabled writer objects when the optional output sub-sections are absent.
.transform((config) => ({
...config,
output: {
...config.output,
sheetsApi: config.output.sheetsApi ?? { enabled: false },
excel: config.output.excel ?? { enabled: false, path: './output/newsletter-catalog.xlsx' }
}
}));
/** Fully validated configuration, i.e. the output type of `configSchema`. */
export type AppConfig = z.infer<typeof configSchema>;
/** Unvalidated config input accepted by `normalizeConfig`. */
export type PartialConfig = Record<string, unknown>;
/**
 * Recursively converts snake_case object keys to camelCase, so YAML written
 * as `state_file` validates against the camelCase schema.
 *
 * Keys directly under a `newsletters` mapping are kept verbatim. They are
 * user-supplied sender patterns (e.g. `first_last@example.com`), not option
 * names, and rewriting them would corrupt the pattern. The values stored under
 * those keys are still converted.
 *
 * @param value Any parsed YAML/JSON value; non-object values are returned unchanged.
 * @param preserveKeys When true, the keys of `value` itself are left untouched.
 */
function camelize(value: unknown, preserveKeys = false): unknown {
  if (Array.isArray(value)) {
    // Wrapped in an arrow so `map`'s index argument is not taken for `preserveKeys`.
    return value.map((item) => camelize(item));
  }
  if (value && typeof value === 'object') {
    return Object.fromEntries(
      Object.entries(value as Record<string, unknown>).map(([key, entry]) => {
        const name = preserveKeys
          ? key
          : key.replace(/_([a-z])/g, (_, letter: string) => letter.toUpperCase());
        // Only an option key named `newsletters` switches its children to
        // verbatim keys; a sender pattern that happens to be "newsletters" does not.
        return [name, camelize(entry, !preserveKeys && name === 'newsletters')];
      })
    );
  }
  return value;
}
/**
 * Expands a leading `~` to the current user's home directory.
 *
 * Handles a bare `~`, `~/rest`, and on Windows `~\rest`. Any other path,
 * including `~user/...`, is returned unchanged.
 *
 * @param path Path as written in config or on the command line.
 * @returns The expanded absolute path, or `path` itself when it is not home-relative.
 */
export function expandHome(path: string): string {
  if (path === '~') {
    return homedir();
  }
  // A backslash is an ordinary filename character on POSIX, so `~\` counts as
  // home-relative only on Windows.
  const homeRelative =
    path.startsWith('~/') || (process.platform === 'win32' && path.startsWith('~\\'));
  return homeRelative ? resolve(homedir(), path.slice(2)) : path;
}
/**
 * Parses YAML text into a validated config.
 * An empty document (which parses to null/undefined) is treated as `{}`, so
 * the schema reports the missing required sections.
 *
 * @throws When the YAML is malformed or the result fails schema validation.
 */
export function loadConfigFromString(source: string): AppConfig {
  const document: unknown = YAML.parse(source);
  const camelCased = camelize(document ?? {});
  return configSchema.parse(camelCased);
}
/**
 * Reads the config file at `path` (a leading `~/` is expanded) as UTF-8 and
 * validates it via `loadConfigFromString`.
 */
export async function loadConfig(path: string): Promise<AppConfig> {
  const location = expandHome(path);
  const contents = await readFile(location, 'utf8');
  return loadConfigFromString(contents);
}
/**
 * Validates an in-memory config object: keys are camelCased first, then the
 * result is parsed against the schema, which also applies defaults.
 */
export function normalizeConfig(config: PartialConfig): AppConfig {
  const camelCased = camelize(config);
  return configSchema.parse(camelCased);
}
+34
View File
@@ -0,0 +1,34 @@
import * as cheerio from 'cheerio';
/**
 * Injected page fetcher. Resolves with the HTTP status, the URL the response
 * finally came from, and the response body as HTML text.
 */
export type FetchPage = (
url: string
) => Promise<{ status: number; finalUrl: string; html: string }>;
/**
 * Outcome of enriching one link, discriminated by `status`:
 * - `ok`: page fetched; `titleMeta` holds the title and/or meta description.
 * - `dead`: the fetch returned status >= 400; `error` is that status as a string.
 * - `paywall`: the final URL looked like a login/subscribe page.
 * - `unreachable`: the fetch or parsing threw; `error` is the message.
 */
export type EnrichmentResult =
| { status: 'ok'; titleMeta: string }
| { status: 'dead'; error: string }
| { status: 'paywall'; titleMeta: '[paywall]' }
| { status: 'unreachable'; titleMeta: string; error: string };
/**
 * Fetches `url` through `fetchPage` and classifies the outcome.
 *
 * Checks run in this order: a status of 400 or above is `dead`; a final URL
 * whose path or query contains login/signin/subscribe/paywall is `paywall`;
 * otherwise the page's `<title>` and meta description are joined with " - ".
 * Any thrown error is reported as `unreachable` rather than propagated.
 */
export async function enrichLink(url: string, fetchPage: FetchPage): Promise<EnrichmentResult> {
  try {
    const page = await fetchPage(url);
    if (page.status >= 400) {
      return { status: 'dead', error: String(page.status) };
    }

    const landed = new URL(page.finalUrl);
    const gated = /login|signin|subscribe|paywall/i.test(landed.pathname + landed.search);
    if (gated) {
      return { status: 'paywall', titleMeta: '[paywall]' };
    }

    const $ = cheerio.load(page.html);
    const pageTitle = $('title').first().text().trim();
    const description = $('meta[name="description"]').attr('content')?.trim() ?? '';
    // Empty parts are dropped so a lone title or description has no separator.
    const parts = [pageTitle, description].filter((part) => part.length > 0);
    return { status: 'ok', titleMeta: parts.join(' - ') };
  } catch (failure) {
    const reason = failure instanceof Error ? failure.message : 'network_error';
    return { status: 'unreachable', titleMeta: `[unreachable: ${reason}]`, error: reason };
  }
}
+59
View File
@@ -0,0 +1,59 @@
import { createServer } from 'node:http';
import { readFile, writeFile, mkdir } from 'node:fs/promises';
import { dirname } from 'node:path';
import open from 'open';
import { google, gmail_v1 } from 'googleapis';
import { expandHome } from '../config/config.js';
import { NewsletterMessage } from '../parsing/types.js';
// Read-only Gmail access is the only scope requested.
const gmailScopes = ['https://www.googleapis.com/auth/gmail.readonly'];

/**
 * Returns an OAuth2 client authorised for read-only Gmail access.
 *
 * A stored token at `tokenPath` is reused when it can be read and parsed.
 * Otherwise the browser consent flow runs and the new tokens are saved to
 * `tokenPath`.
 *
 * @param credentialsPath Google OAuth client JSON (`installed` or `web` entry); `~/` is expanded.
 * @param tokenPath Where the token JSON is read from and written to; `~/` is expanded.
 * @throws Error when the credentials file has no usable client id/secret, and
 * any error raised by the consent flow or the token exchange.
 */
export async function authorizeGmail(credentialsPath: string, tokenPath: string) {
  const credentials = JSON.parse(await readFile(expandHome(credentialsPath), 'utf8'));
  const clientConfig = credentials?.installed ?? credentials?.web;
  // Fail with an actionable message instead of a TypeError on `undefined.client_id`.
  if (!clientConfig?.client_id || !clientConfig?.client_secret) {
    throw new Error(
      `${credentialsPath} is not a Google OAuth client file (expected an "installed" or "web" entry with client_id and client_secret)`
    );
  }

  const oauth = new google.auth.OAuth2(
    clientConfig.client_id,
    clientConfig.client_secret,
    'http://127.0.0.1:53682/oauth2callback'
  );
  const tokenFile = expandHome(tokenPath);

  try {
    oauth.setCredentials(JSON.parse(await readFile(tokenFile, 'utf8')));
    return oauth;
  } catch {
    // A missing or unreadable token file falls through to interactive authorisation.
    const url = oauth.generateAuthUrl({ access_type: 'offline', scope: gmailScopes });
    const code = await waitForBrowserCode(url);
    const { tokens } = await oauth.getToken(code);
    oauth.setCredentials(tokens);
    await mkdir(dirname(tokenFile), { recursive: true });
    // The file holds OAuth tokens, so create it readable by the owner only.
    await writeFile(tokenFile, `${JSON.stringify(tokens, null, 2)}\n`, { mode: 0o600 });
    return oauth;
  }
}
/**
 * Opens `url` in the user's browser and waits for the OAuth redirect on
 * http://127.0.0.1:53682.
 *
 * @param url Consent URL to open.
 * @returns The authorisation `code` query parameter from the redirect.
 * @throws Rejects when the redirect carries an `error` parameter (for example
 * the user denied consent), when the local server cannot start (for example
 * the port is in use), or when the browser cannot be opened.
 */
async function waitForBrowserCode(url: string): Promise<string> {
  return new Promise((resolveCode, reject) => {
    const server = createServer((req, res) => {
      const requestUrl = new URL(req.url ?? '/', 'http://127.0.0.1:53682');
      const code = requestUrl.searchParams.get('code');
      const denial = requestUrl.searchParams.get('error');

      if (code) {
        // Ask the browser not to keep the socket alive so close() can finish.
        res.setHeader('connection', 'close');
        res.end('Newsletter Link Catalog authorization complete. You can close this tab.');
        server.close();
        resolveCode(code);
        return;
      }

      if (denial) {
        // Without this branch a denied consent would leave the promise pending forever.
        res.statusCode = 400;
        res.setHeader('connection', 'close');
        res.end('Newsletter Link Catalog authorization failed. You can close this tab.');
        server.close();
        reject(new Error(`Gmail authorization failed: ${denial}`));
        return;
      }

      // Any other request (e.g. a favicon probe) still gets a response, so the
      // browser is not left waiting; keep listening for the real callback.
      res.statusCode = 404;
      res.end();
    });

    // Turn listen failures such as EADDRINUSE into a rejection instead of an
    // unhandled 'error' event.
    server.once('error', reject);

    // Bind to loopback only: the callback carries a credential and must not be
    // reachable from other machines.
    server.listen(53682, '127.0.0.1', () => {
      open(url).catch((error: unknown) => {
        server.close();
        reject(error);
      });
    });
  });
}
/** Thin wrapper around an authorised Gmail API client. */
export class GmailClient {
  public constructor(private readonly gmail: gmail_v1.Gmail) {}

  /**
   * Placeholder for live Gmail traversal. It currently lists the account's
   * labels and returns an empty array; `_label` is not used yet. The run path
   * accepts injected messages for tests and smoke checks instead.
   */
  public async fetchMessages(_label: string): Promise<NewsletterMessage[]> {
    const labelQuery = { userId: 'me' };
    await this.gmail.users.labels.list(labelQuery);
    const noMessages: NewsletterMessage[] = [];
    return noMessages;
  }
}
+9
View File
@@ -0,0 +1,9 @@
import { createProgram } from './cli/program.js';
/** Prints a one-line `nlc: ...` diagnostic and marks the process as failed. */
function reportFailure(error: unknown): void {
  const message = error instanceof Error ? error.message : String(error);
  console.error(`nlc: ${message}`);
  // Setting exitCode rather than calling exit() lets pending output flush.
  process.exitCode = 1;
}

// CLI entry point: parse argv and run the selected command.
createProgram().parseAsync(process.argv).catch(reportFailure);
+28
View File
@@ -0,0 +1,28 @@
import { ExtractedLink } from '../parsing/types.js';
// Registrable domains treated as social networks when they appear in a footer.
const socialHosts = ['twitter.com', 'x.com', 'facebook.com', 'linkedin.com', 'instagram.com'];

/** True when the anchor text is exactly a "view in browser"-style mirror label (case-insensitive). */
export function isMirrorLink(link: Pick<ExtractedLink, 'title'>): boolean {
  return /^(view in browser|view online|read online)$/i.test(link.title.trim());
}

/** Hostname of an http(s) URL without a leading `www.`, or '' when it cannot be parsed. */
function hostnameOf(url: string): string {
  if (!url.startsWith('http')) {
    return '';
  }
  try {
    return new URL(url).hostname.replace(/^www\./, '');
  } catch {
    // A malformed URL is simply "not a known social host", not a crash.
    return '';
  }
}

/**
 * True for links that should be dropped from content sheets: unsubscribe
 * links, share/forward links, mirror links, and social-network links whose
 * context mentions "footer".
 *
 * @param link Partially populated link; missing fields are treated as empty.
 */
export function isNoiseLink(link: Partial<ExtractedLink>): boolean {
  const context = (link.context ?? '').toLowerCase();
  const text = `${link.title ?? ''} ${link.context ?? ''}`.toLowerCase();
  const url = link.url ?? '';
  const host = hostnameOf(url);

  // Match the domain itself or a true subdomain. A bare `endsWith(site)` would
  // also accept unrelated hosts such as netflix.com for x.com.
  const isSocialHost = socialHosts.some((site) => host === site || host.endsWith(`.${site}`));

  return (
    /unsubscribe/.test(text) ||
    /unsubscribe/.test(url) ||
    /share this newsletter|forward to a friend/.test(text) ||
    isMirrorLink({ title: link.title ?? '' }) ||
    (context.includes('footer') && isSocialHost)
  );
}
/**
 * True when the link's section, context or title contains a sponsor marker
 * (sponsor, sponsored, advertisement, partner), matched case-insensitively
 * anywhere in the text. Missing fields count as empty.
 */
export function isSponsorLink(link: Partial<ExtractedLink>): boolean {
  const fields = [link.section ?? '', link.context ?? '', link.title ?? ''];
  const sponsorMarker = /sponsor|sponsored|advertisement|partner/i;
  return sponsorMarker.test(fields.join(' '));
}
+57
View File
@@ -0,0 +1,57 @@
import { ExtractedLink } from '../parsing/types.js';
/** Options controlling how `cleanupUrl` canonicalises a link. */
export interface CleanupOptions {
/** Query-parameter names to remove; an entry ending in `*` matches by prefix. */
trackingParams: string[];
/** When truthy, a redirect wrapper is replaced by the destination found in its query string. */
unwrapRedirects?: boolean;
}
/**
 * Tests a query-parameter name against one tracking pattern.
 * A pattern ending in `*` is a prefix match on everything before the star;
 * any other pattern must equal the name exactly.
 */
function matchesParam(name: string, pattern: string): boolean {
  if (!pattern.endsWith('*')) {
    return name === pattern;
  }
  const prefix = pattern.slice(0, -1);
  return name.startsWith(prefix);
}
/**
 * Returns the destination embedded in a tracking-redirect URL.
 *
 * The query parameters `url`, `u`, `target`, `redirect` and `redirect_url` are
 * checked in that order. The first value that parses as an absolute http(s)
 * URL wins. Values that cannot be parsed or use another scheme are skipped
 * rather than thrown on.
 *
 * @returns The parsed destination, or `url` itself when nothing usable is embedded.
 */
function unwrapProviderRedirect(url: URL): URL {
  for (const key of ['url', 'u', 'target', 'redirect', 'redirect_url']) {
    const destination = url.searchParams.get(key);
    if (!destination?.startsWith('http')) {
      continue;
    }
    try {
      const parsed = new URL(destination);
      // `startsWith('http')` also admits schemes such as "httpfoo:"; require real http(s).
      if (parsed.protocol === 'http:' || parsed.protocol === 'https:') {
        return parsed;
      }
    } catch {
      // Not a parseable absolute URL (e.g. "http://"); try the next key.
    }
  }
  return url;
}
/**
 * Produces the canonical form of a link: optionally unwraps a redirect
 * wrapper, removes query parameters matching `options.trackingParams`, drops
 * the fragment, and trims a trailing `?` left by an empty query.
 *
 * @throws TypeError when `rawUrl` is not a parseable absolute URL.
 */
export function cleanupUrl(rawUrl: string, options: CleanupOptions): string {
  const parsed = new URL(rawUrl);
  const target = options.unwrapRedirects ? unwrapProviderRedirect(parsed) : parsed;

  const isTracking = (name: string): boolean =>
    options.trackingParams.some((pattern) => matchesParam(name, pattern));

  // Snapshot the names first: deleting while iterating the live view would skip entries.
  Array.from(target.searchParams.keys())
    .filter(isTracking)
    .forEach((name) => target.searchParams.delete(name));

  target.hash = '';
  const cleaned = target.toString();
  return cleaned.endsWith('?') ? cleaned.slice(0, -1) : cleaned;
}
/**
 * Collapses a "read more"-style link into the entry just before it.
 *
 * A link is absorbed when it has the same non-empty `normalizedUrl` as the
 * previously kept entry and its trimmed title matches `readMorePattern`. The
 * kept entry then takes the absorbed link's `url` and keeps everything else.
 * Input objects are copied, never mutated.
 */
export function mergeReadMoreLinks(
  links: ExtractedLink[],
  readMorePattern: RegExp
): ExtractedLink[] {
  const output: ExtractedLink[] = [];
  links.forEach((current) => {
    const tail = output.length > 0 ? output[output.length - 1] : undefined;
    if (
      tail?.normalizedUrl &&
      tail.normalizedUrl === current.normalizedUrl &&
      readMorePattern.test(current.title.trim())
    ) {
      tail.url = current.url;
      tail.normalizedUrl = current.normalizedUrl;
    } else {
      output.push({ ...current });
    }
  });
  return output;
}
+79
View File
@@ -0,0 +1,79 @@
import { ExtractedLink } from '../parsing/types.js';
/** A model backend that can suggest a category for a link. */
export interface LlmProvider {
/** Resolves to the backend's answer for `link` given the candidate `categories`, or `undefined`. */
categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined>;
}
/** Connection settings shared by the provider classes in this file. */
interface ProviderOptions {
// Credential sent with each request; the header it travels in differs per provider.
apiKey?: string;
// Overrides the provider's default endpoint root; null/undefined selects the default.
baseUrl?: string | null;
// Model identifier forwarded verbatim as `model` in the request body.
model: string;
}
/**
 * Sends `body` as a JSON POST and returns the decoded JSON response.
 *
 * @param url - Endpoint to call.
 * @param apiKey - When truthy, sent as an `authorization: Bearer …` header.
 * @param body - Value serialized with `JSON.stringify`.
 * @returns The parsed response body.
 * @throws Error with the HTTP status when the response is not OK.
 */
async function postJson(url: string, apiKey: string | undefined, body: unknown): Promise<any> {
  const headers: Record<string, string> = { 'content-type': 'application/json' };
  if (apiKey) {
    headers.authorization = `Bearer ${apiKey}`;
  }

  const response = await fetch(url, {
    method: 'POST',
    headers,
    body: JSON.stringify(body)
  });

  if (response.ok) {
    return response.json();
  }
  throw new Error(`LLM request failed: ${response.status}`);
}
/**
 * Builds the single-turn instruction sent to the model.
 *
 * @param link - Link whose title and URL are quoted in the prompt.
 * @param categories - Allowed category names, listed comma-separated.
 * @returns The prompt text.
 */
function prompt(link: ExtractedLink, categories: string[]): string {
  const choices = categories.join(', ');
  const subject = `${link.title} ${link.url}`;
  return `Choose the best newsletter category from ${choices} for: ${subject}. Return only the category.`;
}
/**
 * Provider for any endpoint exposing the OpenAI-style `/chat/completions` route.
 */
export class OpenAiCompatibleProvider implements LlmProvider {
  public constructor(private readonly options: ProviderOptions) {}

  /**
   * Requests a category for `link` with temperature 0.
   *
   * @returns The trimmed content of the first choice, or `undefined` when the
   *   response carries none.
   */
  public async categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined> {
    const { apiKey, baseUrl, model } = this.options;
    // Fall back to the public OpenAI endpoint when no base URL is configured.
    const endpoint = `${baseUrl ?? 'https://api.openai.com/v1'}/chat/completions`;
    const payload = {
      model,
      messages: [{ role: 'user', content: prompt(link, categories) }],
      temperature: 0
    };

    const data = await postJson(endpoint, apiKey, payload);
    const answer = data.choices?.[0]?.message?.content;
    return answer?.trim();
  }
}
/** Named alias of OpenAiCompatibleProvider; adds no members or overrides. */
export class OpenAiProvider extends OpenAiCompatibleProvider {}
/** Named alias of OpenAiCompatibleProvider; adds no members or overrides. */
export class LocalProvider extends OpenAiCompatibleProvider {}
/**
 * Provider that talks to the Anthropic `/v1/messages` route.
 */
export class AnthropicProvider implements LlmProvider {
  public constructor(private readonly options: ProviderOptions) {}

  /**
   * Requests a category for `link`, capping the reply at 64 tokens.
   *
   * @returns The trimmed text of the first content item, or `undefined` when
   *   the response carries none.
   * @throws Error with the HTTP status when the response is not OK.
   */
  public async categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined> {
    const { apiKey, baseUrl, model } = this.options;
    const endpoint = `${baseUrl ?? 'https://api.anthropic.com'}/v1/messages`;
    const requestBody = JSON.stringify({
      model,
      max_tokens: 64,
      messages: [{ role: 'user', content: prompt(link, categories) }]
    });

    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        'content-type': 'application/json',
        // An absent key is sent as an empty header value.
        'x-api-key': apiKey ?? '',
        'anthropic-version': '2023-06-01'
      },
      body: requestBody
    });

    if (response.ok) {
      const data = await response.json();
      return data.content?.[0]?.text?.trim();
    }
    throw new Error(`Anthropic request failed: ${response.status}`);
  }
}
+32
View File
@@ -0,0 +1,32 @@
import { mkdir } from 'node:fs/promises';
import { dirname } from 'node:path';
import XLSX from 'xlsx';
import { CatalogPayload, OutputWriter, sanitizeSheetName } from './sheets.js';
/**
 * Writes a catalog payload to an .xlsx workbook: one sheet per source
 * newsletter, followed by "Sponsored Links" and "Dead Links" sheets.
 */
export class ExcelWriter implements OutputWriter {
  /** Excel rejects worksheet names longer than this. */
  private static readonly maxSheetNameLength = 31;

  public constructor(private readonly path: string) {}

  /**
   * Builds the workbook and saves it to the configured path, creating the
   * parent directory when needed.
   *
   * @param payload - Rows grouped by their `Source Newsletter` value, plus
   *   sponsor and dead-link rows.
   */
  public async write(payload: CatalogPayload): Promise<void> {
    const workbook = XLSX.utils.book_new();

    const grouped = new Map<string, Record<string, unknown>[]>();
    for (const row of payload.rows) {
      const sheet = sanitizeSheetName(String(row['Source Newsletter'] ?? 'Newsletter'));
      // Append in place; re-copying the bucket per row made grouping quadratic.
      const bucket = grouped.get(sheet);
      if (bucket) {
        bucket.push(row);
      } else {
        grouped.set(sheet, [row]);
      }
    }

    // The two fixed sheets are reserved up front so a newsletter that happens
    // to share their name cannot collide with them.
    const taken = new Set(['sponsored links', 'dead links']);
    for (const [sheet, rows] of grouped) {
      XLSX.utils.book_append_sheet(
        workbook,
        XLSX.utils.json_to_sheet(rows),
        ExcelWriter.uniqueSheetName(sheet, taken)
      );
    }
    XLSX.utils.book_append_sheet(
      workbook,
      XLSX.utils.json_to_sheet(payload.sponsors),
      'Sponsored Links'
    );
    XLSX.utils.book_append_sheet(
      workbook,
      XLSX.utils.json_to_sheet(payload.deadLinks),
      'Dead Links'
    );

    await mkdir(dirname(this.path), { recursive: true });
    XLSX.writeFile(workbook, this.path);
  }

  /**
   * Fits `name` into Excel's length limit and makes it unique within the
   * workbook, comparing case-insensitively and appending " (2)", " (3)", …
   * on collision. The chosen name is recorded in `taken`.
   */
  private static uniqueSheetName(name: string, taken: Set<string>): string {
    const limit = ExcelWriter.maxSheetNameLength;
    const base = name.slice(0, limit).trim() || 'Newsletter';
    let candidate = base;
    for (let attempt = 2; taken.has(candidate.toLowerCase()); attempt += 1) {
      const suffix = ` (${attempt})`;
      candidate = `${base.slice(0, limit - suffix.length).trimEnd()}${suffix}`;
    }
    taken.add(candidate.toLowerCase());
    return candidate;
  }
}
+15
View File
@@ -0,0 +1,15 @@
import { google } from 'googleapis';
import { CatalogPayload, OutputWriter } from './sheets.js';
/**
 * Output writer targeting a Google spreadsheet.
 *
 * As written, `write` only fetches the spreadsheet's metadata; it does not
 * send any of the payload.
 */
export class GoogleSheetsWriter implements OutputWriter {
  public constructor(
    private readonly spreadsheetId: string,
    private readonly auth: Parameters<typeof google.sheets>[0]['auth']
  ) {}

  /**
   * Creates a Sheets v4 client and requests the configured spreadsheet.
   *
   * @param _payload - Currently unused.
   */
  public async write(_payload: CatalogPayload): Promise<void> {
    const { auth, spreadsheetId } = this;
    const client = google.sheets({ version: 'v4', auth });
    await client.spreadsheets.get({ spreadsheetId });
    // Row-append calls are meant to live in this method; tests substitute a fake writer.
  }
}
+23
View File
@@ -0,0 +1,23 @@
/** Characters replaced with a space in sheet names: `: / \ ? * [ ]`. */
const invalidSheetCharacters = /[:/\\?*[\]]/g;

/**
 * Turns arbitrary text into a usable sheet name.
 *
 * Invalid characters become spaces, whitespace runs collapse to one space,
 * the ends are trimmed, an empty result falls back to "Newsletter", and the
 * name is cut to at most 100 characters.
 *
 * @param input - Raw name, e.g. a newsletter's display name.
 * @returns The sanitized name.
 */
export function sanitizeSheetName(input: string): string {
  const withoutInvalid = input.replace(invalidSheetCharacters, ' ');
  const collapsed = withoutInvalid.replace(/\s+/g, ' ').trim();
  const name = collapsed.length > 0 ? collapsed : 'Newsletter';
  return name.substring(0, 100);
}
/**
 * Neutralizes spreadsheet formula injection in a cell value.
 *
 * Strings beginning with `=`, `+`, `-`, `@`, TAB or carriage return are
 * prefixed with an apostrophe so a spreadsheet treats them as text.
 * Non-string values and all other strings are returned unchanged.
 *
 * @param value - Cell value of any type.
 * @returns The value, escaped when it is a formula-like string.
 */
export function escapeCell(value: unknown): unknown {
  if (typeof value !== 'string') {
    return value;
  }
  // TAB and CR are included because spreadsheets can skip them and still
  // evaluate the formula that follows.
  return /^[=+\-@\t\r]/.test(value) ? `'${value}` : value;
}
/** Everything one run hands to an output writer. */
export interface CatalogPayload {
// Catalog entries, one record per link, keyed by column name.
rows: Record<string, unknown>[];
// Records for links classified as sponsored.
sponsors: Record<string, unknown>[];
// Records for links found to be dead.
deadLinks: Record<string, unknown>[];
}
/** Destination for a catalog payload (a file, a spreadsheet, or a test fake). */
export interface OutputWriter {
// Persists the payload; the resolved value is not constrained by this interface.
write(payload: CatalogPayload): Promise<unknown>;
}
+42
View File
@@ -0,0 +1,42 @@
import * as cheerio from 'cheerio';
import { ExtractedLink, ParserInput, ParserPlugin } from './types.js';
/**
 * Finds the heading-like text that precedes an element.
 *
 * Looks first at the element's own preceding siblings (headings, `strong`,
 * `b`), then at the preceding siblings of its parent (headings, `p`, `tr`).
 *
 * @param $ - Cheerio instance for the document.
 * @param element - Element to start from.
 * @returns The trimmed text of the first match, or `undefined` when neither
 *   lookup yields non-empty text.
 */
function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined {
  const node = $(element);

  const siblingHeading = node.prevAll('h1,h2,h3,h4,h5,h6,strong,b').first().text().trim();
  if (siblingHeading) {
    return siblingHeading;
  }

  const parentHeading = node.parent().prevAll('h1,h2,h3,h4,h5,h6,p,tr').first().text().trim();
  return parentHeading ? parentHeading : undefined;
}
/**
 * Catch-all parser: matches every input and extracts each `<a href>` in the HTML.
 */
export const genericParser: ParserPlugin = {
  name: 'generic',
  matches: () => true,
  /**
   * Extracts one link per anchor with a non-empty `href`.
   *
   * The title is the anchor text with whitespace collapsed, falling back to
   * `aria-label`. The context is the collapsed text of the closest
   * `p`/`li`/`td`/`div`, and doubles as the description when it differs from
   * the title.
   */
  parse(input: ParserInput): ExtractedLink[] {
    const $ = cheerio.load(input.html);
    const squash = (text: string): string => text.replace(/\s+/g, ' ').trim();

    const links: ExtractedLink[] = [];
    for (const element of $('a[href]').toArray()) {
      const anchor = $(element);
      const url = anchor.attr('href') ?? '';
      if (!url) {
        // Anchors with an empty href carry nothing to catalog.
        continue;
      }
      const title = squash(anchor.text()) || anchor.attr('aria-label') || '';
      const context = squash(anchor.closest('p,li,td,div').text());
      const description = context && context !== title ? context : '';
      links.push({
        url,
        title,
        description,
        sourceText: title,
        section: nearestSection($, element),
        context
      });
    }
    return links;
  }
};
+17
View File
@@ -0,0 +1,17 @@
import { genericParser } from './generic.js';
import { ParserInput, ParserPlugin } from './types.js';
/**
 * Parser plugin for Substack newsletters; extraction is delegated to the
 * generic parser.
 */
export const substackParser: ParserPlugin = {
  name: 'substack',
  /**
   * Matches when the List-Id header, the From header or the HTML mentions
   * `substack.com`, or the HTML contains a `data-testid="post-preview"` marker.
   */
  matches(input: ParserInput) {
    const listId = input.headers?.listId ?? '';
    const sender = input.headers?.from ?? '';
    const haystack = `${listId} ${sender} ${input.html}`;
    return /substack\.com|data-testid="post-preview"/i.test(haystack);
  },
  parse: (input: ParserInput) => genericParser.parse(input)
};
/**
 * Picks the parser plugin for a message.
 *
 * Plugins are tried in priority order (Substack first, then generic) and the
 * first whose `matches` returns true wins. The generic parser is also the
 * final fallback.
 *
 * @param input - HTML and headers of the message.
 * @returns The chosen plugin.
 */
export function selectParser(input: ParserInput): ParserPlugin {
  for (const candidate of [substackParser, genericParser]) {
    if (candidate.matches(input)) {
      return candidate;
    }
  }
  return genericParser;
}
+32
View File
@@ -0,0 +1,32 @@
/** One hyperlink pulled out of a newsletter, plus the text found around it. */
export interface ExtractedLink {
// The href exactly as it appeared in the message.
url: string;
// Cleaned form of `url`; filled in after parsing.
normalizedUrl?: string;
// Visible link text (may be an empty string).
title: string;
// Surrounding text used as a blurb.
description?: string;
// Text the title was derived from.
sourceText?: string;
// Heading-like text the link appeared under, when one was found.
section?: string;
// Text of the element enclosing the link.
context?: string;
// NOTE(review): no code visible here sets `sponsor` or `isSponsor` — confirm their producers.
sponsor?: string;
isSponsor?: boolean;
}
/** A fetched newsletter email in the shape the run pipeline consumes. */
export interface NewsletterMessage {
// NOTE(review): presumably the mail provider's own id for the message — confirm against the fetcher.
id: string;
// Identifier used as the key when tracking already-processed messages.
messageId: string;
// Raw From header value, e.g. `Name <address>`.
from: string;
// Date string parseable by `new Date(...)`.
date: string;
subject?: string;
// HTML body handed to the parser.
html: string;
// Selected header values handed to the parser alongside the HTML.
headers?: Record<string, string | undefined>;
}
/** What a parser plugin receives: the message HTML and, optionally, its headers. */
export interface ParserInput {
html: string;
headers?: Record<string, string | undefined>;
}
/** A pluggable newsletter parser. */
export interface ParserPlugin {
// Short identifier for the plugin.
name: string;
// Returns true when this plugin should handle the given message.
matches(input: ParserInput): boolean;
// Extracts the links contained in the message.
parse(input: ParserInput): ExtractedLink[];
}
+117
View File
@@ -0,0 +1,117 @@
import { normalizeConfig, PartialConfig } from '../config/config.js';
import { Categorizer } from '../categorization/categorizer.js';
import { isNoiseLink, isSponsorLink } from '../links/filtering.js';
import { cleanupUrl, mergeReadMoreLinks } from '../links/url.js';
import { OutputWriter } from '../output/sheets.js';
import { selectParser } from '../parsing/plugins.js';
import { NewsletterMessage } from '../parsing/types.js';
import { StateStore } from '../state/state.js';
/** Inputs for a single `runCatalog` invocation. */
export interface RunOptions {
// Partial configuration; normalized with defaults at the start of the run.
config: PartialConfig;
// Messages to consider, in processing order.
messages: NewsletterMessage[];
// Every writer receives the same payload at the end of a non-dry run.
writers: OutputWriter[];
// Any truthy value disables state checks, state updates and writers.
// A non-zero number additionally limits the run to the first N messages.
dryRun?: number | boolean;
// When true, messages are processed even if already recorded as processed.
full?: boolean;
// NOTE(review): `runCatalog` as shown never reads `skipEnrich` or `enrichOnly` — confirm where they are consumed.
skipEnrich?: boolean;
enrichOnly?: boolean;
// Presumably requests more detailed diagnostics — confirm against callers.
verbose?: boolean;
}
/** Counters returned by `runCatalog`. */
export interface RunSummary {
// Number of messages considered (after any dry-run limit), including ones skipped or failed.
newslettersProcessed: number;
// Number of catalog rows produced (sponsor links are counted separately).
linksExtracted: number;
// Number of sponsor rows produced.
sponsors: number;
// Always 0 from `runCatalog` as written; dead-link detection is not part of that function.
deadLinks: number;
// Number of messages whose processing threw.
errors: number;
}
/**
 * Derives a display name from a From header.
 *
 * Uses the text before the first `<` when it is non-empty, otherwise the
 * whole header. One leading and one trailing double quote are stripped and
 * the result is trimmed.
 *
 * @param from - Raw From header, e.g. `"Name" <a@example.com>`.
 * @returns The display name.
 */
function newsletterName(from: string): string {
  const displayName = /^(.*?)\s*</.exec(from)?.[1];
  const chosen = displayName ? displayName : from;
  return chosen.replace(/^"|"$/g, '').trim();
}
/**
 * Formats a date string as `YYYY-MM-DD` in UTC.
 *
 * @param date - Any string accepted by the `Date` constructor.
 * @returns The first ten characters of the ISO-8601 timestamp.
 * @throws RangeError when `date` cannot be parsed (from `toISOString`).
 */
function issueDate(date: string): string {
  const iso = new Date(date).toISOString();
  return iso.substring(0, 10);
}
/**
 * Runs one cataloguing pass over `options.messages`.
 *
 * For each message: select a parser, extract links, drop noise, normalize
 * URLs, optionally merge "read more" links, de-duplicate by normalized URL,
 * then split the links into sponsor rows and catalog rows. Outside dry runs
 * the collected rows go to every writer, and only after all writers succeed
 * are the messages recorded as processed — so a failed write can be retried
 * without losing output.
 *
 * A message that throws contributes no rows, is counted in `errors`, and is
 * not marked processed. A link whose URL cannot be normalized is skipped
 * without failing its message.
 *
 * @param options - Configuration, messages, writers and run flags.
 * @returns Counters describing the run.
 * @throws Whatever a writer or the state store throws; per-message parsing
 *   errors are counted instead of propagated.
 */
export async function runCatalog(options: RunOptions): Promise<RunSummary> {
  const config = normalizeConfig(options.config);
  const state = new StateStore(config.stateFile);
  const categorizer = new Categorizer(config.categories.custom);
  const limit = typeof options.dryRun === 'number' ? options.dryRun : undefined;
  const messages = limit ? options.messages.slice(0, limit) : options.messages;
  const cleanupOptions = {
    trackingParams: config.links.trackingParams,
    unwrapRedirects: config.links.unwrapRedirects
  };

  const rows: Record<string, unknown>[] = [];
  const sponsors: Record<string, unknown>[] = [];
  const completedMessageIds: string[] = [];
  // Compiled on first use inside the per-message try block, so an invalid
  // pattern is still reported per message rather than rejecting the whole run.
  let readMorePattern: RegExp | undefined;
  let errors = 0;

  for (const message of messages) {
    if (!options.full && !options.dryRun && (await state.isProcessed(message.messageId))) {
      continue;
    }
    try {
      const input = { html: message.html, headers: message.headers };
      const parsed = selectParser(input).parse(input);

      const cleaned: typeof parsed = [];
      for (const link of parsed) {
        if (isNoiseLink(link)) {
          continue;
        }
        try {
          cleaned.push({ ...link, normalizedUrl: cleanupUrl(link.url, cleanupOptions) });
        } catch (error) {
          // Relative or malformed hrefs cannot be normalized; dropping just
          // that link keeps one bad anchor from discarding the whole issue.
          if (options.verbose) {
            const reason = error instanceof Error ? error.message : String(error);
            console.error(`Skipping link ${link.url} in ${message.messageId}: ${reason}`);
          }
        }
      }

      let merged = cleaned;
      if (config.links.mergeReadMore) {
        readMorePattern ??= new RegExp(config.links.readMorePattern.replace('(?i)', ''), 'i');
        merged = mergeReadMoreLinks(cleaned, readMorePattern);
      }
      const unique = [...new Map(merged.map((link) => [link.normalizedUrl, link])).values()];

      // Buffer this message's output so a failure partway through leaves no
      // partial rows behind (the message will be retried on the next run).
      const messageRows: Record<string, unknown>[] = [];
      const messageSponsors: Record<string, unknown>[] = [];
      for (const link of unique) {
        if (isSponsorLink(link)) {
          messageSponsors.push({
            Newsletter: newsletterName(message.from),
            Sponsor: link.title,
            Link: link.normalizedUrl,
            Description: link.description ?? ''
          });
          continue;
        }
        messageRows.push({
          'Issue Date': issueDate(message.date),
          Category: await categorizer.categorize(link),
          'Link URL': link.normalizedUrl,
          Title: link.title,
          Description: link.description ?? '',
          'Page Title + Meta': '',
          'Source Newsletter': newsletterName(message.from),
          'Also In': ''
        });
      }

      rows.push(...messageRows);
      sponsors.push(...messageSponsors);
      completedMessageIds.push(message.messageId);
    } catch (error) {
      errors += 1;
      if (options.verbose) {
        const reason = error instanceof Error ? error.message : String(error);
        console.error(`Failed to process message ${message.messageId}: ${reason}`);
      }
    }
  }

  if (!options.dryRun) {
    for (const writer of options.writers) {
      await writer.write({ rows, sponsors, deadLinks: [] });
    }
    // Record progress only once the output is safely written.
    for (const messageId of completedMessageIds) {
      await state.markProcessed(messageId);
    }
  }

  return {
    newslettersProcessed: messages.length,
    linksExtracted: rows.length,
    sponsors: sponsors.length,
    deadLinks: 0,
    errors
  };
}
+38
View File
@@ -0,0 +1,38 @@
import { mkdir, readFile, writeFile } from 'node:fs/promises';
import { dirname } from 'node:path';
import { expandHome } from '../config/config.js';
/** Shape of the JSON document persisted by StateStore. */
interface StateData {
// Message ids that have been recorded as processed.
processedMessageIds: string[];
// String-to-string map; not read or written by any StateStore method shown here.
enrichment: Record<string, string>;
}
/**
 * JSON-file-backed record of which messages have been processed.
 *
 * The file is re-read on every call, and the path is passed through
 * `expandHome` before use.
 */
export class StateStore {
  public constructor(private readonly path: string) {}

  /**
   * Loads the state file.
   *
   * A missing or blank file yields an empty state. Any other read failure, or
   * a file that is not valid JSON, is rethrown: treating those as "empty"
   * would let the next write overwrite the existing history.
   */
  private async read(): Promise<StateData> {
    const path = expandHome(this.path);
    let raw: string;
    try {
      raw = await readFile(path, 'utf8');
    } catch (error) {
      if ((error as { code?: string }).code === 'ENOENT') {
        return { processedMessageIds: [], enrichment: {} };
      }
      throw error;
    }
    if (raw.trim() === '') {
      return { processedMessageIds: [], enrichment: {} };
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`State file ${path} is not valid JSON: ${reason}`);
    }

    // Keep any extra keys already in the file, but guarantee the two fields
    // the rest of the class relies on have the expected shape.
    const data: Partial<StateData> =
      typeof parsed === 'object' && parsed !== null ? (parsed as Partial<StateData>) : {};
    return {
      ...data,
      processedMessageIds: Array.isArray(data.processedMessageIds) ? data.processedMessageIds : [],
      enrichment:
        typeof data.enrichment === 'object' && data.enrichment !== null ? data.enrichment : {}
    };
  }

  /** Serializes `state` as pretty-printed JSON, creating the parent directory if needed. */
  private async write(state: StateData): Promise<void> {
    const path = expandHome(this.path);
    await mkdir(dirname(path), { recursive: true });
    await writeFile(path, `${JSON.stringify(state, null, 2)}\n`);
  }

  /**
   * @param messageId - Message identifier to look up.
   * @returns Whether the id has been recorded as processed.
   */
  public async isProcessed(messageId: string): Promise<boolean> {
    return (await this.read()).processedMessageIds.includes(messageId);
  }

  /**
   * Records `messageId` as processed. The file is rewritten only when the id
   * was not already present.
   */
  public async markProcessed(messageId: string): Promise<void> {
    const state = await this.read();
    if (!state.processedMessageIds.includes(messageId)) {
      state.processedMessageIds.push(messageId);
      await this.write(state);
    }
  }
}
+18
View File
@@ -0,0 +1,18 @@
import { describe, expect, it } from 'vitest';
import { Categorizer } from '../src/categorization/categorizer.js';
// Checks which signal the Categorizer uses to pick a link's category.
describe('categorization', () => {
// A link that carries a `section` is expected to be categorized as that section.
it('prefers newsletter section headers', async () => {
const categorizer = new Categorizer();
await expect(
categorizer.categorize({ title: 'Anything', url: 'https://x.test', section: 'Rust' })
).resolves.toBe('Rust');
});
// With no section, the title and URL alone are expected to yield 'DevOps'.
it('falls back to URL and keyword rules', async () => {
const categorizer = new Categorizer();
await expect(
categorizer.categorize({ title: 'Kubernetes security guide', url: 'https://example.com/k8s' })
).resolves.toBe('DevOps');
});
});
+27
View File
@@ -0,0 +1,27 @@
import { describe, expect, it } from 'vitest';
import { validateDateFilters } from '../src/cli/flags.js';
import { loadConfigFromString } from '../src/config/config.js';
// Covers YAML config loading and the CLI date-filter validation.
describe('config validation', () => {
// Values absent from the YAML (tracking params, enrichment concurrency) must come from defaults.
it('loads a valid YAML config with defaults', () => {
const config = loadConfigFromString(`
gmail:
folder: Newsletters
output:
name: Newsletter Link Catalog
excel:
enabled: true
path: ./output/catalog.xlsx
`);
expect(config.gmail.folder).toBe('Newsletters');
expect(config.links.trackingParams).toContain('utm_*');
expect(config.enrichment.concurrency).toBe(3);
});
// Supplying both a relative window (`last`) and an absolute start (`from`) must throw.
it('rejects conflicting relative and absolute date filters', () => {
expect(() => validateDateFilters({ last: '30d', from: '2026-01-01' })).toThrow(
/cannot be combined/i
);
});
});
+28
View File
@@ -0,0 +1,28 @@
import { describe, expect, it } from 'vitest';
import { enrichLink } from '../src/enrichment/enricher.js';
// Drives enrichLink with stub fetchers and checks the status it reports.
describe('enrichment', () => {
it('marks dead, paywall, and unreachable links', async () => {
// A 404 response is expected to be reported as 'dead'.
await expect(
enrichLink('https://x.test/dead', async () => ({
status: 404,
finalUrl: 'https://x.test/dead',
html: ''
}))
).resolves.toMatchObject({
status: 'dead'
});
// A 200 response that lands on a login page is expected to be reported as 'paywall'.
await expect(
enrichLink('https://x.test/a', async () => ({
status: 200,
finalUrl: 'https://x.test/login',
html: '<title>Login</title>'
}))
).resolves.toMatchObject({ status: 'paywall' });
// A fetcher that rejects is expected to resolve (not reject) with 'unreachable'.
await expect(
enrichLink('https://x.test/a', async () => Promise.reject(new Error('timeout')))
).resolves.toMatchObject({
status: 'unreachable'
});
});
});
+26
View File
@@ -0,0 +1,26 @@
import { describe, expect, it } from 'vitest';
import { isNoiseLink, isSponsorLink } from '../src/links/filtering.js';
// Each sample below must be classified as noise by isNoiseLink.
describe('noise filtering', () => {
it('filters unsubscribe, footer social, share, and mirror links', () => {
expect(isNoiseLink({ url: 'https://x.test/unsubscribe', title: 'Unsubscribe' })).toBe(true);
// A social-network link counts as noise here because its context is the footer.
expect(
isNoiseLink({ url: 'https://twitter.com/me', title: 'Twitter', context: 'footer' })
).toBe(true);
expect(isNoiseLink({ url: 'https://x.test/share', title: 'Share this newsletter' })).toBe(true);
expect(isNoiseLink({ url: 'https://x.test/view', title: 'View in browser' })).toBe(true);
});
});
// isSponsorLink must flag a link whose section and context contain sponsor wording.
describe('sponsor detection', () => {
it('detects sponsor links from section and surrounding text', () => {
expect(
isSponsorLink({
url: 'https://sponsor.example',
title: 'Acme',
section: 'Sponsored',
context: 'Partner message'
})
).toBe(true);
});
});
+13
View File
@@ -0,0 +1,13 @@
import { describe, expect, it } from 'vitest';
import { selectParser } from '../src/parsing/plugins.js';
// selectParser must return the Substack plugin for a Substack List-Id and the generic one otherwise.
describe('parser plugin selection', () => {
it('selects Substack for Substack headers and generic otherwise', () => {
expect(selectParser({ headers: { listId: 'thing.substack.com' }, html: '' }).name).toBe(
'substack'
);
expect(
selectParser({ headers: {}, html: '<a href="https://example.com">Example</a>' }).name
).toBe('generic');
});
});
+44
View File
@@ -0,0 +1,44 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { runCatalog } from '../src/run/runCatalog.js';
// Scratch directory, recreated for every test so state and output files never leak between cases.
let dir = '';

beforeEach(async () => {
  dir = await mkdtemp(join(tmpdir(), 'nlc-run-'));
});

afterEach(async () => {
  await rm(dir, { force: true, recursive: true });
});

describe('run orchestration', () => {
  it('does not write output or state during dry run', async () => {
    const stateFile = join(dir, 'state.json');
    const writes: unknown[] = [];
    const result = await runCatalog({
      dryRun: 1,
      skipEnrich: true,
      config: {
        gmail: { folder: 'Newsletters' },
        output: { name: 'Catalog', excel: { enabled: true, path: join(dir, 'out.xlsx') } },
        stateFile
      },
      messages: [
        {
          id: 'msg-1',
          messageId: '<msg-1>',
          from: 'A <a@example.com>',
          date: '2026-05-16T00:00:00.000Z',
          html: '<h2>Python</h2><p><a href="https://example.com?utm_source=x">Article</a></p>'
        }
      ],
      writers: [{ write: async (payload) => writes.push(payload) }]
    });

    // The link is still extracted and counted…
    expect(result.linksExtracted).toBe(1);
    // …but no writer is invoked…
    expect(writes).toHaveLength(0);
    // …and no state file is created. `rm` without `force` rejects with ENOENT
    // for a missing path, which proves the file is absent.
    await expect(rm(stateFile)).rejects.toMatchObject({ code: 'ENOENT' });
  });
});
+15
View File
@@ -0,0 +1,15 @@
import { describe, expect, it } from 'vitest';
import { escapeCell, sanitizeSheetName } from '../src/output/sheets.js';
// Covers the two pure helpers exported from the sheets module.
describe('sheet output helpers', () => {
// A long input full of forbidden characters must come back clean and at most 100 characters long.
it('sanitizes and truncates sheet names', () => {
const name = sanitizeSheetName('Bad:/\\\\?*[] name '.repeat(12));
expect(name).not.toMatch(/[:/\\?*[\]]/);
expect(name.length).toBeLessThanOrEqual(100);
});
// A value starting with '=' must be prefixed with an apostrophe.
it('escapes formula-like cell values', () => {
expect(escapeCell('=IMPORTXML("http://bad")')).toBe('\'=IMPORTXML("http://bad")');
});
});
+25
View File
@@ -0,0 +1,25 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { StateStore } from '../src/state/state.js';
// Scratch directory, recreated for every test.
let dir = '';
beforeEach(async () => {
dir = await mkdtemp(join(tmpdir(), 'nlc-'));
});
afterEach(async () => {
await rm(dir, { force: true, recursive: true });
});
describe('state persistence', () => {
// An id is unknown before markProcessed and known afterwards; the state file does not exist at the start.
it('tracks processed messages incrementally', async () => {
const store = new StateStore(join(dir, 'state.json'));
expect(await store.isProcessed('msg-1')).toBe(false);
await store.markProcessed('msg-1');
expect(await store.isProcessed('msg-1')).toBe(true);
});
});
+38
View File
@@ -0,0 +1,38 @@
import { describe, expect, it } from 'vitest';
import { cleanupUrl, mergeReadMoreLinks } from '../src/links/url.js';
import { ExtractedLink } from '../src/parsing/types.js';
// End-to-end check of cleanupUrl: unwrap the redirect, then strip tracking parameters.
describe('URL cleanup', () => {
it('strips tracking parameters and unwraps supported redirect URLs', () => {
// The wrapped destination carries utm_source (stripped) and id (kept); the wrapper's own mc_cid is dropped with the wrapper.
const result = cleanupUrl(
'https://newsletter.example/redirect?url=https%3A%2F%2Fexample.com%2Fpost%3Futm_source%3Dx%26id%3D1&mc_cid=abc',
{ trackingParams: ['utm_*', 'mc_cid'], unwrapRedirects: true }
);
expect(result).toBe('https://example.com/post?id=1');
});
});
// Two adjacent links with the same normalized URL, the second titled "Read more", must collapse into one.
describe('read-more merging', () => {
it('merges a read-more link into the preceding link with the same normalized URL', () => {
const links: ExtractedLink[] = [
{
url: 'https://example.com/a',
normalizedUrl: 'https://example.com/a',
title: 'Great article',
description: 'A useful summary',
sourceText: 'Great article',
section: 'Python'
},
{
url: 'https://example.com/a?utm_source=x',
normalizedUrl: 'https://example.com/a',
title: 'Read more',
description: '',
sourceText: 'Read more'
}
];
expect(mergeReadMoreLinks(links, /^(read more)$/i)).toHaveLength(1);
});
});
+15
View File
@@ -0,0 +1,15 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "NodeNext",
"moduleResolution": "NodeNext",
"strict": true,
"esModuleInterop": true,
"forceConsistentCasingInFileNames": true,
"skipLibCheck": true,
"outDir": "dist",
"rootDir": ".",
"types": ["node", "vitest/globals"]
},
"include": ["src/**/*.ts", "tests/**/*.ts", "vitest.config.ts", "tsup.config.ts"]
}
+12
View File
@@ -0,0 +1,12 @@
import { defineConfig } from 'tsup';
// tsup build configuration: bundles src/index.ts as an ES module with type declarations and source maps.
export default defineConfig({
entry: ['src/index.ts'],
format: ['esm'],
dts: true,
// Clear the output directory before each build.
clean: true,
sourcemap: true,
// Shebang placed at the top of the emitted JS so the bundle can be run directly as a Node CLI.
banner: {
js: '#!/usr/bin/env node'
}
});
+9
View File
@@ -0,0 +1,9 @@
import { defineConfig } from 'vitest/config';
// Vitest configuration: run files matching tests/**/*.test.ts in a Node environment.
export default defineConfig({
test: {
environment: 'node',
// Expose describe/it/expect as globals.
globals: true,
include: ['tests/**/*.test.ts']
}
});