✨feature: First push to git

2026-05-16 14:02:49 -05:00
commit 265f69d95a
46 changed files with 11551 additions and 0 deletions
@@ -0,0 +1,49 @@
 module.exports = {
  root: true,
  ignorePatterns: ['dist/**', 'node_modules/**'],
  env: {
    es2022: true,
    node: true
  },
  parser: '@typescript-eslint/parser',
  parserOptions: {
    project: './tsconfig.json',
    sourceType: 'module'
  },
  plugins: ['@typescript-eslint', 'import'],
  extends: ['airbnb-base', 'plugin:@typescript-eslint/recommended', 'prettier'],
  rules: {
    'import/extensions': 'off',
    'import/no-extraneous-dependencies': [
      'error',
      {
        devDependencies: ['tests/**/*.ts', 'vitest.config.ts', 'tsup.config.ts', 'scripts/**/*.mjs']
      }
    ],
    'no-console': 'off',
    'no-restricted-syntax': 'off',
    'class-methods-use-this': 'off',
    camelcase: 'off',
    'default-param-last': 'off',
    'import/no-unresolved': 'off',
    'import/prefer-default-export': 'off',
    'max-classes-per-file': 'off',
    'no-await-in-loop': 'off',
    'no-continue': 'off',
    'no-empty-function': 'off',
    'no-use-before-define': 'off',
    'no-useless-constructor': 'off',
    '@typescript-eslint/no-explicit-any': 'off',
    '@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }]
  },
  overrides: [
    {
      files: ['*.js', '*.cjs', '*.mjs'],
      parser: 'espree',
      parserOptions: {
        ecmaVersion: 2022,
        sourceType: 'module'
      }
    }
  ]
 };
@@ -0,0 +1,3 @@
 node_modules
 dist
 config.yaml
@@ -0,0 +1,7 @@
 .git
 node_modules
 dist
 .vscode
 notes
 PROMPT.md
 SPEC.md
@@ -0,0 +1,5 @@
 {
  "singleQuote": true,
  "trailingComma": "none",
  "printWidth": 100
 }
@@ -0,0 +1,16 @@
 {
    "workbench.colorCustomizations": {
        "tree.indentGuidesStroke": "#3d92ec",
        "activityBar.background": "#053610",
        "titleBar.activeBackground": "#074B17",
        "titleBar.activeForeground": "#EEFDF1",
        "titleBar.inactiveBackground": "#053610",
        "titleBar.inactiveForeground": "#EEFDF1",
        "statusBar.background": "#053610",
        "statusBar.foreground": "#EEFDF1",
        "statusBar.debuggingBackground": "#053610",
        "statusBar.debuggingForeground": "#EEFDF1",
        "statusBar.noFolderBackground": "#053610",
        "statusBar.noFolderForeground": "#EEFDF1"
    }
 }
@@ -0,0 +1,68 @@
 # Newsletter Link Catalog
 `nlc` is a TypeScript/Node.js CLI for cataloging links from newsletters in a configured Gmail label into Google Sheets and/or a local Excel workbook.
 ## Commands
 ```bash
 nlc init
 nlc run --dry-run
 nlc run
 nlc run --from 2026-05-01 --to 2026-05-16
 nlc run --last 30d
 nlc run --enrich-only
 ```
 ## Setup
 1. Install dependencies with `npm install`.
 2. Run `npm run build`.
 3. Run `node dist/index.js init` to create `config.yaml`.
 4. Place OAuth client JSON files in the configured local paths, typically:
   - `~/.nlc/gmail-credentials.json`
   - `~/.nlc/sheets-credentials.json`
 5. Run `node dist/index.js run --dry-run` before live writes.
 Tokens are persisted locally under `~/.nlc` and must not be committed.
 ## Configuration
 Start from [config.example.yaml](config.example.yaml). The important choices are:
 - `gmail.folder`: the single Gmail label/folder to process.
 - `output.excel.enabled`: writes a local `.xlsx` file.
 - `output.sheets_api.enabled`: enables Google Sheets integration when credentials and spreadsheet ID are configured.
 - `links.tracking_params`: query parameters stripped during URL normalization.
 - `categories.llm`: optional BYOK categorization provider.
 ## Build and Distribution
 The build uses `tsup` for the JavaScript bundle and `@yao-pkg/pkg` for the standalone executable:
 ```bash
 npm run build
 ```
 This bundles `src/index.ts` to `dist/index.js`, adds a Node shebang, emits types, and packages the current-platform executable as `dist/nlc.exe` on Windows or `dist/nlc` on macOS/Linux. The packaged artifact embeds the Node runtime for operational use without a separate Node install.
 ## Validation
 Local validation does not need Gmail, Sheets, or LLM credentials:
 ```bash
 npm run lint
 npm run format:check
 npm run typecheck
 npm test
 npm run build
 npm run smoke
 ```
 `npm run smoke` exercises `nlc --help`, `nlc init --help`, `nlc run --help`, and a fixture-backed dry run.
 ## Safety Notes
 - Formula-like spreadsheet cells are escaped before output.
 - Dry runs do not write output files or state.
 - Live integrations are isolated behind adapters so tests use fakes.
 - Individual email/link failures are counted and processing continues; critical config/write failures stop the command.
@@ -0,0 +1,74 @@
 gmail:
  folder: 'Newsletters'
  credentials: '~/.nlc/gmail-credentials.json'
  token: '~/.nlc/gmail-token.json'
 output:
  name: 'Newsletter Link Catalog'
  sheets_api:
    enabled: false
    credentials: '~/.nlc/sheets-credentials.json'
    token: '~/.nlc/sheets-token.json'
    spreadsheet_id: ''
  excel:
    enabled: true
    path: './output/newsletter-catalog.xlsx'
 newsletters:
  'sender@example.com':
    display_name: 'Example Newsletter'
    date_override: 'subject'
    date_format: '%B %d, %Y'
 links:
  unwrap_redirects: true
  strip_utm: true
  tracking_params:
    - 'utm_*'
    - 'fbclid'
    - 'gclid'
    - 'mc_cid'
    - 'mc_eid'
  redirect_limit: 5
  read_more_pattern: '(?i)^(read more|continue reading|learn more)$'
  share_patterns:
    - '(?i)share'
    - '(?i)forward to a friend'
  sponsor_markers:
    - '(?i)sponsor'
    - '(?i)sponsored'
    - '(?i)advertisement'
    - '(?i)partner'
  filter_unsubscribe: true
  filter_social_footer: true
  filter_share_links: true
  merge_read_more: true
 categories:
  custom:
    - 'AI/ML'
    - 'Career'
    - 'Rust'
  llm:
    provider: 'anthropic'
    model: 'claude-sonnet-4-6'
    api_key_env: 'ANTHROPIC_API_KEY'
    base_url: null
    failure_category: 'Uncategorized'
 enrichment:
  enabled: true
  concurrency: 3
  delay_ms: 1500
  retries: 2
  timeout_ms: 10000
 rate_limit:
  gmail_qps: 5
  link_concurrency: 3
 state_file: '~/.nlc/state.json'
 plugins:
  substack:
    enabled: true
@@ -0,0 +1,63 @@
 # Newsletter Link Catalog Implementation Plan
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
 **Goal:** Build a production-quality TypeScript CLI named `nlc` that extracts newsletter links from Gmail, cleans and categorizes them, enriches metadata, writes Google Sheets and Excel outputs, and supports credential-free local validation.
 **Architecture:** The CLI is split into small modules for command parsing, config validation, Gmail access, parsing plugins, link cleanup, categorization, LLM adapters, output writers, enrichment, state, and orchestration. Live external services are isolated behind interfaces so tests and smoke runs use fixtures and fakes.
 **Tech Stack:** Node.js, TypeScript, Commander, Cheerio, Zod, googleapis, xlsx, Vitest, ESLint, Prettier, tsup.
 ---
 ### Task 1: Scaffold and Tests
 **Files:**
 - Create: `package.json`, `tsconfig.json`, `.eslintrc.cjs`, `.prettierrc.json`, `vitest.config.ts`
 - Create tests in `tests/*.test.ts`
 - [x] Write tests covering config validation, date conflicts, sheet names, URL cleanup, read-more merging, noise filtering, sponsor detection, categorization, state behavior, enrichment statuses, dry-run suppression, and parser plugin selection.
 - [x] Run tests once to confirm they fail before production modules exist.
 ### Task 2: Core Modules
 **Files:**
 - Create modules under `src/config`, `src/links`, `src/parsing`, `src/categorization`, `src/state`, `src/enrichment`, `src/output`
 - [x] Implement the minimal production behavior required for the tests.
 - [x] Keep integrations behind interfaces and dependency injection.
 ### Task 3: CLI and Integrations
 **Files:**
 - Create `src/index.ts`, `src/cli/*`, `src/gmail/*`, `src/llm/*`, `src/run/*`, `scripts/smoke.mjs`
 - [x] Implement `nlc init`, `nlc run`, help text, date flag validation, dry run, enrichment-only, and fixture-backed smoke execution.
 - [x] Add OAuth/browser-flow boundaries for Gmail and Sheets without requiring live credentials during automated tests.
 - [x] Add provider adapters for Anthropic, OpenAI, local endpoints, and OpenAI-compatible endpoints.
 ### Task 4: Docs and Build
 **Files:**
 - Create `README.md`, `config.example.yaml`
 - [x] Document setup, OAuth files, token persistence, Google Sheets and Excel output, binary build tooling, and local smoke flow.
 - [x] Configure `npm run build` to compile and bundle the CLI.
 ### Task 5: Validation
 **Commands:**
 - [ ] `npm install`
 - [ ] `npm run lint`
 - [ ] `npm run format:check`
 - [ ] `npm run typecheck`
 - [ ] `npm test`
 - [ ] `npm run build`
 - [ ] `npm run smoke`
 **Self-review:** The plan maps every SPEC.md subsystem to a module or integration boundary. Live Gmail, Sheets, and LLM validation require user credentials, so local validation uses fakes and fixtures while docs describe operational setup.
@@ -0,0 +1,158 @@
 /goal
 <task>
  You are an autonomous senior engineer working in:
  C:\Users\ksolo\Projects\Misc Projects\Newletter Link Catalog
  Implement the Newsletter Link Catalog CLI described in SPEC.md end-to-end.
  The expected product is a TypeScript/Node.js CLI named `nlc` with:
  - `nlc init`
  - `nlc run [flags]`
  - Gmail OAuth browser auth and local token persistence
  - config-driven Gmail label/folder processing
  - HTML newsletter parsing and link extraction
  - noise filtering, tracking URL cleanup, redirect unwrapping, read-more merging
  - hybrid categorization using section headers, rules, and optional LLM providers
  - parser plugin architecture with a generic parser and Substack plugin
  - Google Sheets and local `.xlsx` outputs
  - incremental state tracking in JSON
  - enrichment pass for page title/meta, dead-link handling, paywall/unreachable markers
  - dry-run, date filtering, full reprocess, skip-enrich, enrich-only, config, and verbose flags
  - standalone binary build script and documentation for the selected bundling tool
  Use SPEC.md as the source of truth. If existing code conflicts with SPEC.md, prefer SPEC.md unless repo-local instructions explicitly require otherwise.
  The repository-level working agreements mention PHP tooling, but this project spec is TypeScript/Node.js. Apply the relevant JS quality rules: ESLint airbnb/base, Prettier, tests, secure input/output
  handling, and CI-style validation. Do not add PHP tooling unless PHP files already exist and require it.
 </task>
 <goal>
  Build a production-quality CLI that meets the SPEC.md requirements, adheres to the working agreements, and can be used by the repository owner to catalog newsletter links from Gmail into Google Sheets with confidence in correctness, safety, and maintainability.
 </goal>
 <default_follow_through_policy>
  Default to the most reasonable low-risk interpretation and keep going.
  Only stop to ask when a missing detail changes correctness, safety, external credentials, or an irreversible action.
  When external services or credentials are unavailable, implement the integration boundary, tests, mocks, and clear setup docs instead of blocking.
 </default_follow_through_policy>
 <completeness_contract>
  Persist until the task is fully handled end-to-end within the current turn whenever feasible: do not stop at analysis or partial fixes.
  Treat the task as incomplete until every major SPEC.md behavior is implemented, tested, documented, or explicitly marked [blocked] with evidence.
  Before finishing, reconcile every plan item: Done, Blocked, or Cancelled. Never leave items in-progress.
  Do not claim completion until validation has run and failures are fixed or explained with concrete blocker evidence.
 </completeness_contract>
 <missing_context_gating>
  Read SPEC.md and inspect the repository before planning implementation.
  Do not guess repository structure, package manager, test framework, or existing scripts. Retrieve them with tools.
  If the repo is empty or nearly empty, scaffold a TypeScript CLI project using npm unless an existing package manager is present.
  If credentials, live Gmail, Google Sheets, or LLM API keys are missing, use mocks/fakes for automated tests and document the required environment variables and setup.
 </missing_context_gating>
 <tool_persistence_rules>
  Prefer dedicated tools over raw shell where available: rg, read_file/list_dir equivalents, apply_patch, and update_plan.
  Use rg or rg --files for search.
  Parallelize independent file reads; sequence dependent actions.
  Use apply_patch for manual edits.
  Keep using tools until you have enough evidence to finish confidently.
 </tool_persistence_rules>
 <implementation_requirements>
  Implement a clean modular architecture, with separate modules for:
  - CLI command parsing
  - config loading and validation
  - Gmail OAuth/auth/client access
  - Gmail message fetching by configured label
  - HTML parsing and extraction
  - noise filtering
  - URL normalization, redirect unwrapping, and tracking parameter stripping
  - categorization
  - LLM provider adapters
  - parser plugins
  - spreadsheet writers
  - enrichment
  - state persistence
  - logging/progress reporting
  Implement provider adapters for:
  - Anthropic
  - OpenAI
  - local/Ollama or LM Studio style endpoints
  - OpenAI-compatible endpoints
  Implement output writers for:
  - Google Sheets API
  - local Excel `.xlsx`
  Implement tests for core behavior without requiring live external services:
  - config validation
  - date filter conflict handling
  - sheet-name sanitization/truncation
  - URL cleanup and tracking parameter stripping
  - read-more link merging
  - noise filtering
  - sponsor detection
  - section-header categorization
  - fallback rule categorization
  - state-file incremental behavior
  - dead/paywall/unreachable enrichment handling
  - dry-run state/write suppression
  - parser plugin selection, including Substack
 </implementation_requirements>
 <action_safety>
  Keep changes tightly scoped to building this CLI.
  Avoid unrelated refactors, renames, or cleanup.
  Do not run destructive git commands such as reset --hard or checkout -- without explicit approval.
  Never commit secrets, tokens, OAuth credentials, spreadsheet IDs, or user data.
  Persist tokens only in documented local paths such as ~/.nlc.
  Sanitize config and CLI inputs. Escape or safely encode spreadsheet cell values that could become formulas.
  Handle critical errors by stopping with a useful message. Handle individual email/link failures by logging and continuing.
 </action_safety>
 <verification_loop>
  Required validations:
  - npm install
  - npm run lint
  - npm run format:check
  - npm run typecheck
  - npm test
  - npm run build
  - npm run smoke
  If the package scripts do not exist yet, create them.
  `npm run build` must compile the TypeScript project and produce the standalone binary or packaged executable artifact described in the docs.
  `npm run smoke` must exercise the CLI without live credentials, at minimum:
  - `nlc --help`
  - `nlc init --help`
  - `nlc run --help`
  - a dry-run or fixture-backed run path that proves parsing/output orchestration works without mutating real Gmail or Sheets.
  Before finalizing, run the required validations.
  If a check fails, fix the cause and rerun until green or until a real external blocker remains.
  Report any unavailable live-service validation separately from local automated validation.
 </verification_loop>
 <progress_updates>
  For long work, give brief progress updates after meaningful milestones:
  - repo inspection complete
  - implementation plan formed
  - core modules scaffolded
  - tests added
  - validations running
  - final validation result
  Keep updates concise and outcome-based.
 </progress_updates>
 <structured_output_contract>
  Final report exactly in this order:
  1. Summary: 2-4 bullets describing what was built.
  2. Changed files: one line per important file or directory.
  3. Validations: each command run and its result.
  4. Blockers or residual risks: include only real remaining issues.
  5. Next operational steps: credential/setup steps needed for live Gmail or Google Sheets use.
  Keep the final report compact and highest-signal first.
 </structured_output_contract>
@@ -0,0 +1,378 @@
 # Newsletter Link Catalog — Specification
 ## Overview
 A CLI tool that extracts links from newsletters in a designated Gmail folder, categorizes them, enriches them with metadata, and compiles them into a spreadsheet. Each newsletter gets its own sheet, links are organized by issue date and category, and sponsor links are tracked separately.
 ## Architecture
 ### Language & Runtime
 - **TypeScript/Node.js** — compiled to a standalone binary by the project build script
 - CLI tool invoked as `nlc run [flags]`
 ### Distribution
 - Standalone binary — no Node runtime required on the host machine
 - Built and packaged via CI or build script
 - The build script must document the selected bundling tool and produce the binary from a clean checkout
 ### Run Modes
 - **Manual**: Run `nlc run` on demand with optional date filters
 - **Scheduled**: Can be run via cron/Task Scheduler for recurring processing
 - Designed for both; no daemon mode required
 ## Gmail Integration
 ### Authentication
 - **OAuth2 browser flow** — user authorizes via browser, tokens persisted locally
 - `nlc init` command walks through OAuth setup interactively
 ### Scope
 - Processes emails from a **single designated Gmail folder/label** (configured in `config.yaml`)
 - Does not scan the entire inbox or search by sender patterns
 ### Email Processing
 - **HTML only** — plain-text parts are ignored
 - **Image-only emails** (single image, no extractable links) are skipped with a warning logged
 - **"View in browser" emails** — if the email contains no content links after noise filtering and contains a mirror link with anchor text matching `view in browser`, `view online`, or `read online`, fetch that mirror URL and extract links from the fetched HTML instead
 - Incremental by default: tracks processed Message-IDs in a local state file, only processes new emails
 - `--full` flag forces reprocessing of all emails that match the configured label and any date filters
 ## Link Extraction & Processing
 ### Extraction Pipeline
 1. Fetch emails from the configured Gmail folder (incremental or full)
 2. Parse HTML to extract links, section headers, and surrounding text. A section header is the nearest preceding heading-like element (`h1`-`h6`, table row header, or bold standalone line) within the same content block.
 3. Filter out noise links: unsubscribe, social footer icons, "share this newsletter" links
 4. Unwrap supported tracking redirects and strip configured tracking query parameters — store the normalized destination URL
 5. Merge "Read more" links with their preceding content (detected by: consecutive links with the same normalized URL and anchor text matching the configured read-more pattern)
 6. Categorize each link (see Categorization section)
 7. Write to spreadsheet (see Output section)
 ### Noise Filtering
 The following link types are **excluded** from content sheets:
 - Unsubscribe links
 - Social media links in footer or sharing blocks
 - Links whose anchor text or accessible label matches configured share/forward patterns
 - "View in browser" mirror links (content is extracted from the web version instead)
 Sponsor/ad links are **not filtered** — they go to a separate sheet when the link is inside a block labeled with configured sponsor markers such as "sponsor", "sponsored", "ad", "advertisement", or "partner".
 ### URL Handling
 - Unwrap HTTP redirects and supported provider redirect URLs up to the configured redirect limit
 - Strip configured tracking query parameters, including `utm_*`, `fbclid`, `gclid`, `mc_cid`, `mc_eid`, and provider-specific tracking parameters listed in config
 - Store the normalized destination URL after redirect unwrapping and query cleanup
 - Dead/broken links (4xx/5xx during enrichment) are written to the "Dead Links" sheet and removed from content sheets when they were already written by an earlier phase or run
 ### "Read More" Merging
 When two consecutive extracted links point to the same normalized URL and one anchor text matches the configured read-more pattern, they are merged into a single entry combining the preceding link title/description with the read-more link URL.
 ## Categorization
 ### Strategy: Hybrid
 1. **Primary**: Use the newsletter's own section headers (e.g., "Python", "DevOps", "Career") as categories
 2. **Fallback**: When section headers aren't available or don't cover a link, use rule-based classification (URL patterns + keywords)
 3. **Final fallback**: LLM-based categorization when rules don't match
 ### Category Taxonomy
 - Built-in base taxonomy shipped with the tool for common dev categories (Python, JavaScript, DevOps, Security, etc.)
 - User can extend via config with custom categories
 - For fallback categorization, the LLM is instructed to prefer configured categories and may create a new category only when no existing category fits
 ### LLM Provider Support (BYOK)
 The tool supports a provider adapter interface and ships adapters for:
 - **Claude/Anthropic** — Anthropic API
 - **OpenAI/GPT** — OpenAI API
 - **Local models** — Ollama, LM Studio
 - **OpenAI-compatible endpoints** — Mistral, Groq, Together, etc.
 Provider config includes: API key environment variable, base URL when required, model name, and optional provider parameters.
 ### Newsletter Parsing: Plugin System
 - Generic HTML parser as the default
 - Platform-specific parsers loaded as plugins (detected by URL patterns or email headers)
 - **Substack** shipped as the first plugin — maps Substack-specific HTML structures to the common extracted-link format
 - Additional parsers can be added as plugins without modifying core logic
 ## Output: Spreadsheet
 ### Supported Formats
 - **Google Sheets** — via Google Sheets API (live, shareable, updated by each write run)
 - **Local Excel (.xlsx)** — written to disk, can be uploaded manually
 Config selects which output(s) to use; both can be active simultaneously.
 ### Spreadsheet Name
 - Fixed name set in `config.yaml` (e.g., "Newsletter Link Catalog")
 ### Sheet Naming
 - Each newsletter gets its own sheet named after the parsed display name from the email's From header
 - Names truncated to fit Google Sheets' 100-character limit
 - Characters invalid for Google Sheets or Excel sheet names are replaced with spaces, then repeated whitespace is collapsed
 ### Content Sheet Columns
 Every link occurrence is written as a flat row; blank grouping rows are not used. Fields unavailable from the source are written as empty cells.
 | Column | Description |
 |---|---|
 | Issue Date | Date from email's Date header (overridable per-newsletter) |
 | Category | Assigned category (from newsletter sections, rules, or LLM) |
 | Link URL | Clean canonical URL after unwrapping and UTM removal |
 | Title | Anchor text / headline from the newsletter |
 | Description | 1-2 sentence description from the newsletter (if present) |
 | Page Title + Meta | `<title>` and meta description from the destination page (enrichment phase) |
 | Source Newsletter | Name of the newsletter this link came from |
 | Also In | Cross-reference: other newsletters that also mentioned this link |
 ### Sponsor Sheet (Consolidated)
 Single sheet named "Sponsored Links" containing sponsor/ad links from all newsletters:
 | Column | Description |
 |---|---|
 | Newsletter | Which newsletter this sponsor link appeared in |
 | Sponsor | Sponsor name (parsed from newsletter) |
 | Link | Sponsor's link URL |
 | Description | Sponsor description from the newsletter |
 ### Dead Links Sheet
 Single sheet named "Dead Links" for links that returned errors during enrichment:
 | Column | Description |
 |---|---|
 | URL | The clean canonical URL |
 | Status | HTTP status or error type (404, 403, timeout, etc.) |
 | Source | Newsletter name |
 | Date | Issue date |
 ### Cross-References
 - Duplicates across newsletters are kept in their respective sheets (all occurrences preserved)
 - The **Also In** column annotates each row with other newsletter issues that mentioned the same normalized URL, formatted as `Newsletter Name (YYYY-MM-DD)` and joined with `; `
 - This enables finding cross-newsletter coverage without a separate consolidated sheet
 ### No "All Links" Master Sheet
 Only per-newsletter content sheets, plus the consolidated Sponsor and Dead Links sheets. No "All Links" aggregation sheet.
 ## Enrichment
 ### Two-Phase Approach
 1. **Phase 1 (Store)**: Extract links from newsletters, categorize, and write to spreadsheet with all available in-newsletter metadata
 2. **Phase 2 (Enrich)**: Separate pass to fetch each link's destination page for `<title>` and meta description
 Enrichment can be run independently from extraction and spreadsheet writing.
 ### Enrichment Details
 - Configurable concurrency with defaults of 3 parallel requests and 1500 ms delay between batches
 - Retries on transient failures
 - Dead links (4xx/5xx) are written to the Dead Links sheet and removed from content sheets when they were already written by an earlier phase or run
 - Skip pages that redirect to a URL whose path or query contains `login`, `signin`, `subscribe`, or `paywall` — mark with "paywall" status
 - Progress bar updates after each completed enrichment request
 ### Link Liveness
 - Dead links are **not included** in content sheets — they go to the Dead Links sheet
 - Paywalled links are included in content sheets and the Page Title + Meta column is set to `[paywall]`
 - Timeout, DNS, TLS, and network failures are included in content sheets and the Page Title + Meta column is set to `[unreachable: error_type]`
 ## Processing Model
 ### Incremental Processing
 - Local state file (JSON) tracks processed Message-IDs and enrichment status
 - On subsequent runs, only new/unprocessed emails are fetched
 - `--full` flag forces reprocessing of all emails that match the configured label and any date filters
 - State file location: `~/.nlc/state.json` (or configured path)
 ### Date Filtering
 - `--from YYYY-MM-DD` and `--to YYYY-MM-DD` — absolute date range
 - `--last N` (e.g., `--last 30d`, `--last 7d`) — relative date range
 - Date filters apply before the incremental processed-message check
 - If both `--last` and `--from`/`--to` are provided, the CLI exits with a config error
 ### Dry Run
 - `--dry-run` processes the most recent N emails (default: 5) without writing to the spreadsheet
 - Shows what would be extracted, categorized, and written
 - Dry run does not update the state file or call destination pages for enrichment unless `--dry-run` is combined with `--enrich-only`
 ### Error Handling
 - **Critical errors** (Gmail auth failure, spreadsheet write failure, config errors) → stop execution
 - **Individual errors** (one link fails to enrich, one email fails to parse) → log and continue
 - Summary at end includes error counts and details
 ### Progress & Logging
 - Progress bar during processing (emails fetched, links extracted, enrichment status)
 - Summary stats at the end: newsletters processed, links extracted, duplicates found, dead links, sponsors, errors
 ## CLI Interface
 ### Commands
 ```
 nlc init          # Interactive setup: OAuth, config file, connectivity test
 nlc run [flags]   # Main processing command
 ```
 ### `nlc run` Flags
 | Flag | Description | Default |
 |---|---|---|
 | `--full` | Reprocess all emails, not just new ones | false |
 | `--dry-run [N]` | Process most recent N emails without writing to sheet | 5 |
 | `--from YYYY-MM-DD` | Process emails from this date | (none) |
 | `--to YYYY-MM-DD` | Process emails up to this date | (none) |
 | `--last N` | Process emails from last N days (e.g., `--last 30d`) | (none) |
 | `--skip-enrich` | Skip the enrichment phase (only extract + categorize) | false |
 | `--enrich-only` | Only run enrichment on already-extracted links | false |
 | `--config PATH` | Path to config file | `./config.yaml` |
 | `--verbose` | Detailed per-email and per-link output | false |
 ## Configuration
 ### File Format: YAML
 Location: `./config.yaml` (overridable with `--config`)
 ### Sample Structure
 ```yaml
 # Gmail settings
 gmail:
  folder: "Newsletters"           # Gmail label/folder to process
  credentials: "~/.nlc/gmail-credentials.json"
  token: "~/.nlc/gmail-token.json"
 # Output settings
 output:
  name: "Newsletter Link Catalog"  # Spreadsheet name
  sheets_api:
    enabled: true
    credentials: "~/.nlc/sheets-credentials.json"
    token: "~/.nlc/sheets-token.json"
  excel:
    enabled: true
    path: "./output/newsletter-catalog.xlsx"
 # Newsletter identification
 newsletters:
  # Manual overrides for parsed display names
  "alex@bytebytego.com":
    display_name: "ByteByteGo"
  "dan@techtakesweekly.com":
    display_name: "Tech Takes Weekly"
 # Link processing
 links:
  unwrap_redirects: true
  strip_utm: true
  tracking_params:
    - "utm_*"
    - "fbclid"
    - "gclid"
    - "mc_cid"
    - "mc_eid"
  redirect_limit: 5
  read_more_pattern: "(?i)^(read more|continue reading|learn more)$"
  share_patterns:
    - "(?i)share"
    - "(?i)forward to a friend"
  sponsor_markers:
    - "(?i)sponsor"
    - "(?i)sponsored"
    - "(?i)advertisement"
    - "(?i)partner"
  filter_unsubscribe: true
  filter_social_footer: true
  filter_share_links: true
  merge_read_more: true
 # Categorization
 categories:
  # Built-in taxonomy is used by default; extend here
  custom:
    - "AI/ML"
    - "Career"
    - "Rust"
  # LLM settings for category inference
  llm:
    provider: "anthropic"         # anthropic | openai | local | openai-compatible
    model: "claude-sonnet-4-6"
    api_key_env: "ANTHROPIC_API_KEY"
    base_url: null                # for local/openai-compatible
    failure_category: "Uncategorized"
 # Enrichment
 enrichment:
  enabled: true
  concurrency: 3
  delay_ms: 1500
  retries: 2
  timeout_ms: 10000
 # Rate limiting (applies to both Gmail API and enrichment)
 rate_limit:
  gmail_qps: 5                    # queries per second to Gmail API
  link_concurrency: 3             # parallel link fetches
 # State
 state_file: "~/.nlc/state.json"
 # Parsing plugins
 plugins:
  substack:
    enabled: true
 ```
 ### Issue Date Override
 For newsletters where the email arrival date doesn't match the issue date, overrides can be configured:
 ```yaml
 newsletters:
  "sender@domain.com":
    display_name: "Newsletter Name"
    date_override: "subject"      # Parse date from subject line
    date_format: "%B %d, %Y"      # Expected date format in subject
 ```
 ## Data Flow
 ```
 ┌─────────────┐     ┌──────────────┐     ┌──────────────┐     ┌──────────────┐
 │  Gmail API  │────▶│ Parse HTML   │────▶│  Categorize  │────▶│ Write Sheet  │
 │  (fetch)    │     │ + Extract    │     │  (hybrid)    │     │ (Phase 1)    │
 └─────────────┘     └──────────────┘     └──────────────┘     └──────────────┘
                           │                                          │
                           ▼                                          ▼
                    ┌──────────────┐                           ┌──────────────┐
                    │  State File  │                           │  Enrichment  │
                    │  (processed  │                           │  (Phase 2)   │
                    │   tracking)  │                           │  Page titles  │
                    └──────────────┘                           └──────────────┘
 ```
 ## Edge Cases
 | Scenario | Behavior |
 |---|---|
 | Email is a single image with no links | Skip with warning, log to state |
 | "View in browser" link instead of content | Fetch the first matching mirror link, extract links from that HTML |
 | Same link in multiple newsletters | Keep all occurrences, cross-reference via "Also In" column |
 | Same link multiple times in one issue | Deduplicate per-issue; single row per unique URL |
 | Link returns 4xx/5xx during enrichment | Move to Dead Links sheet |
 | Link is paywalled/auth-required | Include in content sheet, mark Page Title + Meta as "[paywall]" |
 | Link times out or has a network error | Include in content sheet, mark Page Title + Meta as "[unreachable: error_type]" |
 | Newsletter name > 100 chars | Truncate for sheet name |
 | Sheet already exists for newsletter | Append new rows, don't overwrite existing data |
 | Gmail API rate limit | Retry with exponential backoff |
 | OAuth token expired | Auto-refresh, re-prompt if refresh fails |
 | Newsletter format changes | Parser falls back to generic HTML extraction |
 ## Setup & First Run
 1. **`nlc init`** — Interactive walkthrough:
   - Authenticate with Gmail (OAuth browser flow)
   - Authenticate with Google Sheets (if using Sheets output)
   - Select the Gmail folder/label to process
   - Configure output location
   - Test connectivity
   - Generate `config.yaml`
 2. **`nlc run --dry-run`** — Test with 5 most recent emails
 3. **`nlc run`** — Full processing run
 4. **`nlc run --enrich-only`** — Enrich previously extracted links with page titles
@@ -0,0 +1,368 @@
 # Newsletter Link Catalog — Specification
 ## Overview
 A CLI tool that extracts links from newsletters in a designated Gmail folder, categorizes them, enriches them with metadata, and compiles them into a spreadsheet. Each newsletter gets its own sheet, links are organized by issue date and category, and sponsor links are tracked separately.
 ## Architecture
 ### Language & Runtime
 - **TypeScript/Node.js** — compiled to a standalone binary via `pkg` or `tsx-bundle`
 - CLI tool invoked as `nlc run [flags]`
 ### Distribution
 - Standalone binary — no Node runtime required on the host machine
 - Built and packaged via CI or build script
 ### Run Modes
 - **Manual**: Run `nlc run` on demand with optional date filters
 - **Scheduled**: Can be run via cron/Task Scheduler for recurring processing
 - Designed for both; no daemon mode required
 ## Gmail Integration
 ### Authentication
 - **OAuth2 browser flow** — user authorizes via browser, tokens persisted locally
 - `nlc init` command walks through OAuth setup interactively
 ### Scope
 - Processes emails from a **single designated Gmail folder/label** (configured in `config.yaml`)
 - Does not scan the entire inbox or search by sender patterns
 ### Email Processing
 - **HTML only** — plain-text parts are ignored
 - **Image-only emails** (single image, no extractable links) are skipped with a warning logged
 - **"View in browser" emails** — fetches the web version's HTML and extracts links from that instead
 - Incremental by default: tracks processed Message-IDs in a local state file, only processes new emails
 - `--full` flag forces reprocessing of all emails
 ## Link Extraction & Processing
 ### Extraction Pipeline
 1. Fetch emails from the configured Gmail folder (incremental or full)
 2. Parse HTML to extract links, section headers, and surrounding text
 3. Filter out noise links: unsubscribe, social footer icons, "share this newsletter" links
 4. Unwrap tracking redirects and strip UTM parameters — store only the clean canonical URL
 5. Merge "Read more" links with their preceding content (detected by: same URL + "read more" anchor text)
 6. Categorize each link (see Categorization section)
 7. Write to spreadsheet (see Output section)
 ### Noise Filtering
 The following link types are **excluded** from content sheets:
 - Unsubscribe links
 - Social media footer links (Twitter, LinkedIn, etc.)
 - "Share this newsletter" / "Forward to a friend" links
 - "View in browser" mirror links (content is extracted from the web version instead)
 Sponsor/ad links are **not filtered** — they go to a separate sheet.
 ### URL Handling
 - Unwrap all tracking redirects (Mailchimp, Substack, etc.)
 - Strip UTM parameters and other tracking query params
 - Store only the clean canonical URL
 - Dead/broken links (4xx/5xx during enrichment) are moved to a separate "Dead Links" sheet
 ### "Read More" Merging
 When two consecutive elements point to the same URL and one has "read more" (or similar) anchor text, they are merged into a single entry combining the preceding description text and the link.
 ## Categorization
 ### Strategy: Hybrid
 1. **Primary**: Use the newsletter's own section headers (e.g., "Python", "DevOps", "Career") as categories
 2. **Fallback**: When section headers aren't available or don't cover a link, use rule-based classification (URL patterns + keywords)
 3. **Final fallback**: LLM-based categorization when rules don't match
 ### Category Taxonomy
 - **LLM-generated** by default — the model assigns categories based on link content
 - Built-in base taxonomy shipped with the tool for common dev categories (Python, JavaScript, DevOps, Security, etc.)
 - User can extend via config with custom categories
 - LLM is instructed to prefer existing categories and only create new ones when nothing fits
 ### LLM Provider Support (BYOK)
 All providers supported, configurable in `config.yaml`:
 - **Claude/Anthropic** — Anthropic API
 - **OpenAI/GPT** — OpenAI API
 - **Local models** — Ollama, LM Studio
 - **OpenAI-compatible endpoints** — Mistral, Groq, Together, etc.
 Provider config includes: API key, base URL, model name, and optional parameters.
 ### Newsletter Parsing: Plugin System
 - Generic HTML parser as the default
 - Platform-specific parsers loaded as plugins (detected by URL patterns or email headers)
 - **Substack** shipped as the first plugin — uses Substack's predictable HTML structure for more reliable extraction
 - Additional parsers can be added as plugins without modifying core logic
 ## Output: Spreadsheet
 ### Supported Formats
 - **Google Sheets** — via Google Sheets API (live, shareable, auto-updated)
 - **Local Excel (.xlsx)** — written to disk, can be uploaded manually
 Config selects which output(s) to use; both can be active simultaneously.
 ### Spreadsheet Name
 - Fixed name set in `config.yaml` (e.g., "Newsletter Link Catalog")
 ### Sheet Naming
 - Each newsletter gets its own sheet named after the parsed display name from the email's From header
 - Names truncated to fit Google Sheets' 100-character limit
 - Special characters replaced as needed for sheet name validity
 ### Content Sheet Columns
 Every row is fully populated (flat table — no blank cells for grouping):
 | Column | Description |
 |---|---|
 | Issue Date | Date from email's Date header (overridable per-newsletter) |
 | Category | Assigned category (from newsletter sections, rules, or LLM) |
 | Link URL | Clean canonical URL after unwrapping and UTM removal |
 | Title | Anchor text / headline from the newsletter |
 | Description | 1-2 sentence description from the newsletter (if present) |
 | Page Title + Meta | `<title>` and meta description from the destination page (enrichment phase) |
 | Source Newsletter | Name of the newsletter this link came from |
 | Also In | Cross-reference: other newsletters that also mentioned this link |
 ### Sponsor Sheet (Consolidated)
 Single sheet named "Sponsored Links" containing sponsor/ad links from all newsletters:
 | Column | Description |
 |---|---|
 | Newsletter | Which newsletter this sponsor link appeared in |
 | Sponsor | Sponsor name (parsed from newsletter) |
 | Link | Sponsor's link URL |
 | Description | Sponsor description from the newsletter |
 ### Dead Links Sheet
 Single sheet named "Dead Links" for links that returned errors during enrichment:
 | Column | Description |
 |---|---|
 | URL | The clean canonical URL |
 | Status | HTTP status or error type (404, 403, timeout, etc.) |
 | Source | Newsletter name |
 | Date | Issue date |
 ### Cross-References
 - Duplicates across newsletters are kept in their respective sheets (all occurrences preserved)
 - The **Also In** column annotates each row with which other newsletters mentioned the same link and when (e.g., "TLDR Web Dev (Mar 5)")
 - This enables finding cross-newsletter coverage without a separate consolidated sheet
 ### No "All Links" Master Sheet
 Only per-newsletter content sheets, plus the consolidated Sponsor and Dead Links sheets. No "All Links" aggregation sheet.
 ## Enrichment
 ### Two-Phase Approach
 1. **Phase 1 (Store)**: Extract links from newsletters, categorize, and write to spreadsheet with all available in-newsletter metadata
 2. **Phase 2 (Enrich)**: Separate pass to fetch each link's destination page for `<title>` and meta description
 This keeps the initial run fast and allows enrichment to be run independently.
 ### Enrichment Details
 - Configurable concurrency (safe defaults: 3-5 parallel, 1-2s delay between batches)
 - Retries on transient failures
 - Dead links (4xx/5xx) moved to Dead Links sheet
 - Skip paywalled/auth-required pages (detected by login redirects) — mark with "paywall" status
 - Progress bar shows enrichment status in real-time
 ### Link Liveness
 - Dead links are **not included** in content sheets — they go to the Dead Links sheet
 - Paywalled/unreachable links are included in content sheets but flagged in the Page Title + Meta column
 ## Processing Model
 ### Incremental Processing
 - Local state file (JSON) tracks processed Message-IDs and enrichment status
 - On subsequent runs, only new/unprocessed emails are fetched
 - `--full` flag forces reprocessing of all emails
 - State file location: `~/.nlc/state.json` (or configured path)
 ### Date Filtering
 - `--from YYYY-MM-DD` and `--to YYYY-MM-DD` — absolute date range
 - `--last N` (e.g., `--last 30d`, `--last 7d`) — relative date range
 - Can be combined with incremental processing
 ### Dry Run
 - `--dry-run` processes the most recent X emails (default: 5) without writing to the spreadsheet
 - Shows what would be extracted, categorized, and written
 - Useful for testing config changes and parser tweaks
 ### Error Handling
 - **Critical errors** (Gmail auth failure, spreadsheet write failure, config errors) → stop execution
 - **Individual errors** (one link fails to enrich, one email fails to parse) → log and continue
 - Summary at end includes error counts and details
 ### Progress & Logging
 - Progress bar during processing (emails fetched, links extracted, enrichment status)
 - Summary stats at the end: newsletters processed, links extracted, duplicates found, dead links, sponsors, errors
 ## CLI Interface
 ### Commands
 ```
 nlc init          # Interactive setup: OAuth, config file, connectivity test
 nlc run [flags]   # Main processing command
 ```
 ### `nlc run` Flags
 | Flag | Description | Default |
 |---|---|---|
 | `--full` | Reprocess all emails, not just new ones | false |
 | `--dry-run [N]` | Process most recent N emails without writing to sheet | 5 |
 | `--from YYYY-MM-DD` | Process emails from this date | (none) |
 | `--to YYYY-MM-DD` | Process emails up to this date | (none) |
 | `--last N` | Process emails from last N days (e.g., `--last 30d`) | (none) |
 | `--skip-enrich` | Skip the enrichment phase (only extract + categorize) | false |
 | `--enrich-only` | Only run enrichment on already-extracted links | false |
 | `--config PATH` | Path to config file | `./config.yaml` |
 | `--verbose` | Detailed per-email and per-link output | false |
 ## Configuration
 ### File Format: YAML
 Location: `./config.yaml` (overridable with `--config`)
 ### Sample Structure
 ```yaml
 # Gmail settings
 gmail:
  folder: "Newsletters"           # Gmail label/folder to process
  credentials: "~/.nlc/gmail-credentials.json"
  token: "~/.nlc/gmail-token.json"
 # Output settings
 output:
  name: "Newsletter Link Catalog"  # Spreadsheet name
  sheets_api:
    enabled: true
    credentials: "~/.nlc/sheets-credentials.json"
    token: "~/.nlc/sheets-token.json"
  excel:
    enabled: true
    path: "./output/newsletter-catalog.xlsx"
 # Newsletter identification
 newsletters:
  # Manual overrides for parsed display names
  # sender_pattern: "display_name"
  "alex@bytebytego.com": "ByteByteGo"
  "dan@techtakesweekly.com": "Tech Takes Weekly"
 # Link processing
 links:
  unwrap_redirects: true
  strip_utm: true
  filter_unsubscribe: true
  filter_social_footer: true
  filter_share_links: true
  merge_read_more: true
 # Categorization
 categories:
  # Built-in taxonomy is used by default; extend here
  custom:
    - "AI/ML"
    - "Career"
    - "Rust"
  # LLM settings for category inference
  llm:
    provider: "anthropic"         # anthropic | openai | local | openai-compatible
    model: "claude-sonnet-4-6"
    api_key_env: "ANTHROPIC_API_KEY"  # or set in env
    base_url: null                # for local/openai-compatible
    fallback_to_rules: true       # if LLM fails, use rule-based
 # Enrichment
 enrichment:
  enabled: true
  concurrency: 3
  delay_ms: 1500
  retries: 2
  timeout_ms: 10000
 # Rate limiting (applies to both Gmail API and enrichment)
 rate_limit:
  gmail_qps: 5                    # queries per second to Gmail API
  link_concurrency: 3             # parallel link fetches
 # State
 state_file: "~/.nlc/state.json"
 # Parsing plugins
 plugins:
  substack:
    enabled: true
 ```
 ### Issue Date Override
 For newsletters where the email arrival date doesn't match the issue date, overrides can be configured:
 ```yaml
 newsletters:
  "sender@domain.com":
    display_name: "Newsletter Name"
    date_override: "subject"      # Parse date from subject line
    date_format: "%B %d, %Y"      # Expected date format in subject
 ```
 ## Data Flow
 ```
 ┌─────────────┐     ┌──────────────┐     ┌──────────────┐     ┌──────────────┐
 │  Gmail API  │────▶│ Parse HTML   │────▶│  Categorize  │────▶│ Write Sheet  │
 │  (fetch)    │     │ + Extract    │     │  (hybrid)    │     │ (Phase 1)    │
 └─────────────┘     └──────────────┘     └──────────────┘     └──────────────┘
                           │                                          │
                           ▼                                          ▼
                    ┌──────────────┐                           ┌──────────────┐
                    │  State File  │                           │  Enrichment  │
                    │  (processed  │                           │  (Phase 2)   │
                    │   tracking)  │                           │  Page titles │
                    └──────────────┘                           └──────────────┘
 ```
 ## Edge Cases
 | Scenario | Behavior |
 |---|---|
 | Email is a single image with no links | Skip with warning, log to state |
 | "View in browser" link instead of content | Fetch web version HTML, extract links from that |
 | Same link in multiple newsletters | Keep all occurrences, cross-reference via "Also In" column |
 | Same link multiple times in one issue | Deduplicate per-issue; single row per unique URL |
 | Link returns 4xx/5xx during enrichment | Move to Dead Links sheet |
 | Link is paywalled/auth-required | Include in content sheet, mark Page Title + Meta as "[paywall]" |
 | Newsletter name > 100 chars | Truncate for sheet name |
 | Sheet already exists for newsletter | Append new rows, don't overwrite existing data |
 | Gmail API rate limit | Retry with exponential backoff |
 | OAuth token expired | Auto-refresh, re-prompt if refresh fails |
 | Newsletter format changes | Parser falls back to generic HTML extraction |
 ## Setup & First Run
 1. **`nlc init`** — Interactive walkthrough:
   - Authenticate with Gmail (OAuth browser flow)
   - Authenticate with Google Sheets (if using Sheets output)
   - Select the Gmail folder/label to process
   - Configure output location
   - Test connectivity
   - Generate `config.yaml`
 2. **`nlc run --dry-run`** — Test with 5 most recent emails
 3. **`nlc run`** — Full processing run
 4. **`nlc run --enrich-only`** — Enrich previously extracted links with page titles
 ## Future Considerations
 These are **not** in scope for v1 but noted for potential future work:
 - Search/filter functionality within the spreadsheet
 - Web UI for browsing the catalog
 - Email forwarding as an alternative to Gmail API access
 - Automatic category taxonomy refinement based on accumulated data
 - Additional platform-specific parser plugins beyond the bundled Substack plugin
 - Notification on new newsletter processing
@@ -0,0 +1,53 @@
 {
  "name": "newsletter-link-catalog",
  "version": "0.1.0",
  "description": "CLI for cataloging newsletter links from Gmail into spreadsheets.",
  "type": "module",
  "bin": {
    "nlc": "./dist/index.js"
  },
  "scripts": {
    "build": "tsup && node scripts/make-executable.mjs && node scripts/package-binary.mjs",
    "dev": "tsx src/index.ts",
    "lint": "eslint . --ext .ts,.js",
    "format": "prettier --write .",
    "format:check": "prettier --check .",
    "typecheck": "tsc --noEmit",
    "test": "vitest run",
    "smoke": "node scripts/smoke.mjs"
  },
  "keywords": [
    "newsletter",
    "gmail",
    "sheets",
    "cli"
  ],
  "author": "",
  "license": "MIT",
  "dependencies": {
    "@commander-js/extra-typings": "^12.1.0",
    "cheerio": "^1.0.0",
    "commander": "^12.1.0",
    "googleapis": "^140.0.1",
    "open": "^10.1.0",
    "ora": "^8.1.1",
    "xlsx": "^0.18.5",
    "yaml": "^2.5.1",
    "zod": "^3.23.8"
  },
  "devDependencies": {
    "@types/node": "^22.9.0",
    "@typescript-eslint/eslint-plugin": "^8.11.0",
    "@typescript-eslint/parser": "^8.11.0",
    "@yao-pkg/pkg": "^6.19.0",
    "eslint": "^8.57.1",
    "eslint-config-airbnb-base": "^15.0.0",
    "eslint-config-prettier": "^9.1.0",
    "eslint-plugin-import": "^2.31.0",
    "prettier": "^3.3.3",
    "tsup": "^8.3.5",
    "tsx": "^4.19.2",
    "typescript": "^5.6.3",
    "vitest": "^2.1.4"
  }
 }
@@ -0,0 +1,3 @@
 import { chmod } from 'node:fs/promises';
 // Best-effort step: give the bundled CLI entry point the executable bit (rwxr-xr-x).
 try {
  await chmod('dist/index.js', 0o755);
 } catch {
  // Deliberately ignored so a failed chmod never fails the build.
 }
@@ -0,0 +1,18 @@
 import { execFile } from 'node:child_process';
 import { platform } from 'node:os';
 import { join } from 'node:path';
 import { promisify } from 'node:util';
 // Promise-returning wrapper around child_process.execFile.
 const exec = promisify(execFile);

 // Entry script of the locally installed @yao-pkg/pkg CLI.
 const pkg = join(process.cwd(), 'node_modules', '@yao-pkg', 'pkg', 'lib-es5', 'bin.js');

 const hostPlatform = platform();
 const isWindows = hostPlatform === 'win32';

 // Choose the pkg target for the host OS; anything that is neither Windows nor macOS builds for Linux.
 let target = 'node22-linux-x64';
 if (isWindows) {
  target = 'node22-win-x64';
 } else if (hostPlatform === 'darwin') {
  target = 'node22-macos-x64';
 }

 // The packaged binary lands next to the bundle, with an .exe suffix on Windows.
 const output = join('dist', isWindows ? 'nlc.exe' : 'nlc');

 // Run pkg under the current Node executable to package dist/index.js.
 const pkgArgs = [pkg, 'dist/index.js', '--targets', target, '--output', output];
 await exec(process.execPath, pkgArgs, {
  cwd: process.cwd()
 });
@@ -0,0 +1,37 @@
 import { execFile } from 'node:child_process';
 import { mkdtemp, writeFile, rm } from 'node:fs/promises';
 import { tmpdir } from 'node:os';
 import { join } from 'node:path';
 import { promisify } from 'node:util';
 // Smoke check for the build output: runs the bundled CLI and the packaged binary with --help,
 // then performs a dry run against a throwaway config. Any failing command rejects and aborts.
 const exec = promisify(execFile);
 // Bundled entry point and packaged binary, both expected under ./dist of the current directory.
 const cli = join(process.cwd(), 'dist', 'index.js');
 const binary = join(process.cwd(), 'dist', process.platform === 'win32' ? 'nlc.exe' : 'nlc');
 // Scratch directory holding the generated config; output and state paths also point into it.
 const dir = await mkdtemp(join(tmpdir(), 'nlc-smoke-'));
 try {
  const config = join(dir, 'config.yaml');
  // Paths are emitted through JSON.stringify so they appear as double-quoted strings in the YAML.
  await writeFile(
    config,
    `gmail:
  folder: Newsletters
 output:
  name: Smoke Catalog
  excel:
    enabled: true
    path: ${JSON.stringify(join(dir, 'catalog.xlsx'))}
 state_file: ${JSON.stringify(join(dir, 'state.json'))}
 `
  );
  await exec('node', [cli, '--help']);
  await exec(binary, ['--help']);
  await exec('node', [cli, 'init', '--help']);
  await exec('node', [cli, 'run', '--help']);
  // NLC_FIXTURE=1 makes the run command substitute its built-in fixture message list.
  await exec('node', [cli, 'run', '--config', config, '--dry-run'], {
    env: { ...process.env, NLC_FIXTURE: '1' }
  });
  console.log('Smoke checks passed');
 } finally {
  // Always remove the scratch directory, whether or not the checks passed.
  await rm(dir, { force: true, recursive: true });
 }
@@ -0,0 +1,52 @@
 import { ExtractedLink } from '../parsing/types.js';
 /**
  * Pluggable categorization backend that `Categorizer` consults when neither the link's section
  * nor the keyword rules produce a category.
  */
 export interface CategoryProvider {
  /**
   * Suggests a category for `link`, given the list of known category names.
   * Resolving with `undefined` makes `Categorizer` use its failure category instead.
   */
  categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined>;
 }
 // Category names that are always offered to the provider; Categorizer merges caller-supplied
 // categories into this list and removes duplicates.
 const builtIn = [
  'Python',
  'JavaScript',
  'DevOps',
  'Security',
  'AI/ML',
  'Career',
  'Rust',
  'Uncategorized'
 ];
 /**
  * Ordered keyword rules: the first pattern that matches a link's title, description or URL
  * decides its category.
  *
  * Short or ambiguous tokens (`ai`, `ml`, `llm`, `node`, `react`, `k8s`, `cve`, `rust`, `cargo`)
  * are anchored with word boundaries so they no longer fire inside unrelated words such as
  * "email", ".html", "trust" or "reaction". Longer keywords stay substring matches so plural and
  * derived forms ("interviews", "pythonic") are still recognised.
  */
 const rules: Array<[RegExp, string]> = [
  [/python|django|flask/i, 'Python'],
  [/javascript|typescript|\bnode(?:\.?js)?\b|\breact(?:js)?\b/i, 'JavaScript'],
  [/kubernetes|\bk8s\b|docker|devops|terraform/i, 'DevOps'],
  [/security|vulnerabilit(?:y|ies)|\bcve\b/i, 'Security'],
  [/\bai\b|\bllms?\b|machine learning|\bml\b/i, 'AI/ML'],
  [/career|interview|hiring/i, 'Career'],
  [/\brust\b|\bcargo\b/i, 'Rust']
 ];
 /**
  * Assigns a category to an extracted link.
  *
  * Resolution order: the link's own section header, then the keyword rules, then the optional
  * provider, and finally the configured failure category.
  */
 export class Categorizer {
  /** Built-in categories followed by caller-supplied ones, without duplicates. */
  private readonly categories: string[];

  public constructor(
    categories: string[] = [],
    private readonly provider?: CategoryProvider,
    private readonly failureCategory = 'Uncategorized'
  ) {
    this.categories = Array.from(new Set(builtIn.concat(categories)));
  }

  /**
   * @param link Link to classify.
   * @returns The chosen category name; never empty unless the failure category itself is empty.
   */
  public async categorize(link: ExtractedLink): Promise<string> {
    // A non-blank section header from the newsletter always wins.
    const section = link.section?.trim();
    if (section) {
      return section;
    }

    // Keyword rules are tried in declaration order against title, description and URL.
    const searchable = [link.title, link.description ?? '', link.url].join(' ');
    for (const [pattern, category] of rules) {
      if (pattern.test(searchable)) {
        return category;
      }
    }

    // Last resort: ask the provider (if any) and fall back when it has no answer.
    const suggestion = await this.provider?.categorize(link, this.categories);
    return suggestion ?? this.failureCategory;
  }
 }
@@ -0,0 +1,21 @@
 /** Date-related CLI flag values as checked by `validateDateFilters`. */
 export interface DateFlags {
  /** Start date, expected as YYYY-MM-DD. */
  from?: string;
  /** End date, expected as YYYY-MM-DD. */
  to?: string;
  /** Relative range such as `30d`; cannot be combined with `from` or `to`. */
  last?: string;
 }
 /** Returns true when a YYYY-MM-DD string names a day that exists on the calendar. */
 function isCalendarDate(value: string): boolean {
  const [year, month, day] = value.split('-').map(Number);
  // Out-of-range parts (month 13, Feb 30) roll over inside Date and stop matching the input.
  const date = new Date(0);
  date.setUTCFullYear(year, month - 1, day);
  return (
    date.getUTCFullYear() === year && date.getUTCMonth() === month - 1 && date.getUTCDate() === day
  );
 }

 /**
  * Validates the `--from`, `--to` and `--last` flag values before a run starts.
  *
  * @param flags Raw flag values; missing or empty values are skipped.
  * @throws Error when `--last` is combined with `--from`/`--to`, when a date is not written as
  *   YYYY-MM-DD, when it names a day that does not exist, when `--from` is later than `--to`,
  *   or when `--last` is not of the form `<digits>d`.
  */
 export function validateDateFilters(flags: DateFlags): void {
  if (flags.last && (flags.from || flags.to)) {
    throw new Error('--last cannot be combined with --from or --to');
  }
  for (const [name, value] of Object.entries({ from: flags.from, to: flags.to })) {
    if (!value) {
      continue;
    }
    if (!/^\d{4}-\d{2}-\d{2}$/.test(value)) {
      throw new Error(`--${name} must use YYYY-MM-DD`);
    }
    if (!isCalendarDate(value)) {
      throw new Error(`--${name} is not a valid calendar date: ${value}`);
    }
  }
  // Both values are zero-padded ISO dates here, so string comparison orders them correctly.
  if (flags.from && flags.to && flags.from > flags.to) {
    throw new Error('--from must not be later than --to');
  }
  if (flags.last && !/^\d+d$/.test(flags.last)) {
    throw new Error('--last must look like 30d');
  }
 }
@@ -0,0 +1,86 @@
 import { Command, Option } from 'commander';
 import { writeFile } from 'node:fs/promises';
 import { loadConfig } from '../config/config.js';
 import { ExcelWriter } from '../output/excel.js';
 import { runCatalog } from '../run/runCatalog.js';
 import { validateDateFilters } from './flags.js';
 // Starter config written by `nlc init`: the Gmail folder plus an enabled Excel output.
 const sampleConfig = `gmail:
  folder: Newsletters
 output:
  name: Newsletter Link Catalog
  excel:
    enabled: true
    path: ./output/newsletter-catalog.xlsx
 `;
 /**
  * Converts the optional `--dry-run [count]` argument to a number.
  *
  * @param value Raw text supplied after `--dry-run`.
  * @returns The parsed count.
  * @throws Error when the value is not a positive integer (for example `abc`, `0` or `2.5`),
  *   so a `NaN` or nonsensical count never reaches the run.
  */
 function parseDryRunCount(value: string): number {
  const count = Number(value);
  if (!Number.isInteger(count) || count <= 0) {
    throw new Error(`--dry-run count must be a positive integer, got "${value}"`);
  }
  return count;
 }

 /**
  * Builds the `nlc` command-line program with its `init` and `run` subcommands.
  *
  * `init` writes the starter config and refuses to overwrite an existing file. `run` validates
  * the date flags, loads the config, and passes everything to `runCatalog`, printing the
  * returned summary as JSON.
  *
  * @returns The configured commander program; the caller parses argv.
  */
 export function createProgram(): Command {
  const program = new Command();
  program.name('nlc').description('Newsletter Link Catalog').version('0.1.0');

  program
    .command('init')
    .description('Create a starter config and document OAuth credential paths')
    .option('--config <path>', 'Path to write config', './config.yaml')
    .action(async (options) => {
      // The 'wx' flag makes the write fail instead of clobbering an existing config.
      await writeFile(options.config, sampleConfig, { flag: 'wx' }).catch(
        async (error: NodeJS.ErrnoException) => {
          if (error.code === 'EEXIST') {
            throw new Error(`${options.config} already exists`);
          }
          throw error;
        }
      );
      console.log(`Wrote ${options.config}. Add OAuth JSON files under ~/.nlc before live runs.`);
    });

  program
    .command('run')
    .description('Process configured Gmail newsletter folder')
    .option('--full', 'Reprocess matching messages')
    .addOption(
      // A bare `--dry-run` uses the preset of 5; an explicit count must be a positive integer.
      new Option('--dry-run [count]', 'Process without writing state or output')
        .argParser(parseDryRunCount)
        .preset(5)
    )
    .option('--from <date>', 'Process from YYYY-MM-DD')
    .option('--to <date>', 'Process to YYYY-MM-DD')
    .option('--last <range>', 'Process last range such as 30d')
    .option('--skip-enrich', 'Skip enrichment')
    .option('--enrich-only', 'Only run enrichment on stored links')
    .option('--config <path>', 'Config path', './config.yaml')
    .option('--verbose', 'Verbose logging')
    .action(async (options) => {
      validateDateFilters(options);
      const config = await loadConfig(options.config);
      // Only the Excel writer is wired up here; it is used when enabled in the config.
      const writers = config.output.excel.enabled
        ? [new ExcelWriter(config.output.excel.path)]
        : [];
      // Messages are supplied only when NLC_FIXTURE=1; otherwise the run gets an empty list.
      const messages = process.env.NLC_FIXTURE === '1' ? fixtureMessages() : [];
      const summary = await runCatalog({
        config,
        messages,
        writers,
        dryRun: options.dryRun,
        full: options.full,
        skipEnrich: options.skipEnrich,
        enrichOnly: options.enrichOnly,
        verbose: options.verbose
      });
      console.log(JSON.stringify(summary, null, 2));
    });

  return program;
 }
 /**
  * Produces the single canned newsletter message used when NLC_FIXTURE=1.
  * The message is stamped with the current time on every call.
  */
 function fixtureMessages() {
  const receivedAt = new Date().toISOString();
  const fixture = {
    id: 'fixture-1',
    messageId: '<fixture-1>',
    from: 'Fixture Weekly <fixture@example.com>',
    date: receivedAt,
    html: '<h2>JavaScript</h2><p><a href="https://example.com/post?utm_source=fixture">Fixture article</a></p>'
  };
  return [fixture];
 }
@@ -0,0 +1,129 @@
 import { readFile } from 'node:fs/promises';
 import { homedir } from 'node:os';
 import { resolve } from 'node:path';
 import YAML from 'yaml';
 import { z } from 'zod';
 // Schema for the `output` config section: a required, non-empty spreadsheet name plus optional
 // Google Sheets and Excel targets. Both targets are disabled unless `enabled` is set.
 const outputSchema = z.object({
  name: z.string().min(1),
  sheetsApi: z
    .object({
      enabled: z.boolean().default(false),
      credentials: z.string().optional(),
      token: z.string().optional(),
      spreadsheetId: z.string().optional()
    })
    .optional(),
  excel: z
    .object({
      enabled: z.boolean().default(false),
      path: z.string().default('./output/newsletter-catalog.xlsx')
    })
    .optional()
 });
 // Schema for the whole config file. Keys are camelCase here; the loaders in this module run
 // `camelize` on the parsed YAML first, so snake_case keys in the file map onto these names.
 // Only `gmail.folder` and `output.name` are required; every other setting has a default.
 const configSchema = z
  .object({
    gmail: z.object({
      folder: z.string().min(1),
      credentials: z.string().default('~/.nlc/gmail-credentials.json'),
      token: z.string().default('~/.nlc/gmail-token.json')
    }),
    output: outputSchema,
    newsletters: z.record(z.string(), z.any()).default({}),
    links: z
      .object({
        unwrapRedirects: z.boolean().default(true),
        stripUtm: z.boolean().default(true),
        trackingParams: z
          .array(z.string())
          .default(['utm_*', 'fbclid', 'gclid', 'mc_cid', 'mc_eid']),
        redirectLimit: z.number().int().positive().default(5),
        // NOTE(review): the default patterns below start with `(?i)`, which JavaScript's RegExp
        // does not accept as an inline flag — confirm the consumer strips or translates it.
        readMorePattern: z.string().default('(?i)^(read more|continue reading|learn more)$'),
        sharePatterns: z.array(z.string()).default(['(?i)share', '(?i)forward to a friend']),
        sponsorMarkers: z
          .array(z.string())
          .default(['(?i)sponsor', '(?i)sponsored', '(?i)advertisement', '(?i)partner']),
        filterUnsubscribe: z.boolean().default(true),
        filterSocialFooter: z.boolean().default(true),
        filterShareLinks: z.boolean().default(true),
        mergeReadMore: z.boolean().default(true)
      })
      .default({}),
    categories: z
      .object({
        custom: z.array(z.string()).default([]),
        llm: z
          .object({
            provider: z
              .enum(['anthropic', 'openai', 'local', 'openai-compatible'])
              .default('anthropic'),
            model: z.string().default('claude-sonnet-4-6'),
            apiKeyEnv: z.string().default('ANTHROPIC_API_KEY'),
            baseUrl: z.string().nullable().optional(),
            failureCategory: z.string().default('Uncategorized')
          })
          .default({})
      })
      .default({}),
    enrichment: z
      .object({
        enabled: z.boolean().default(true),
        concurrency: z.number().int().positive().default(3),
        delayMs: z.number().int().nonnegative().default(1500),
        retries: z.number().int().nonnegative().default(2),
        timeoutMs: z.number().int().positive().default(10000)
      })
      .default({}),
    rateLimit: z
      .object({
        gmailQps: z.number().positive().default(5),
        linkConcurrency: z.number().int().positive().default(3)
      })
      .default({}),
    stateFile: z.string().default('~/.nlc/state.json'),
    plugins: z.record(z.string(), z.any()).default({})
  })
  // Fill in disabled placeholders so `output.sheetsApi` and `output.excel` are always present.
  .transform((config) => ({
    ...config,
    output: {
      ...config.output,
      sheetsApi: config.output.sheetsApi ?? { enabled: false },
      excel: config.output.excel ?? { enabled: false, path: './output/newsletter-catalog.xlsx' }
    }
  }));
 /** Fully validated configuration, i.e. the output type of `configSchema` after its transform. */
 export type AppConfig = z.infer<typeof configSchema>;
 /** Loosely typed, not yet validated configuration object accepted by `normalizeConfig`. */
 export type PartialConfig = Record<string, unknown>;
 /** Sections whose immediate keys are user data (sender addresses) rather than setting names. */
 const verbatimKeySections = new Set(['newsletters']);

 /**
  * Recursively converts snake_case object keys to camelCase.
  *
  * The immediate keys of a `newsletters` mapping are left untouched: they are sender patterns
  * such as `first_last@example.com`, and rewriting them would make the overrides unmatchable.
  * Values below those keys (e.g. `display_name`) are still camelized.
  *
  * @param value Parsed config value of any shape; non-objects are returned unchanged.
  * @param keepKeys When true, the keys of `value` itself are preserved verbatim.
  * @returns A new structure with converted keys; arrays and objects are copied, not mutated.
  */
 function camelize(value: unknown, keepKeys = false): unknown {
  if (Array.isArray(value)) {
    // Arrow wrapper on purpose: passing `camelize` directly would feed the index into keepKeys.
    return value.map((entry) => camelize(entry));
  }
  if (value && typeof value === 'object') {
    return Object.fromEntries(
      Object.entries(value as Record<string, unknown>).map(([key, entry]) => {
        const name = keepKeys
          ? key
          : key.replace(/_([a-z])/g, (_, letter: string) => letter.toUpperCase());
        return [name, camelize(entry, !keepKeys && verbatimKeySections.has(name))];
      })
    );
  }
  return value;
 }
 /**
  * Expands a leading `~` to the current user's home directory.
  *
  * Handles a bare `~`, the `~/rest` form and, on Windows, the `~\rest` form. Anything else
  * (including `~user/...`) is returned unchanged.
  *
  * @param path Path as written in the config or on the command line.
  * @returns An absolute path when `path` was home-relative, otherwise `path` itself.
  */
 export function expandHome(path: string): string {
  if (path === '~') {
    return resolve(homedir());
  }
  const homeRelative =
    path.startsWith('~/') || (process.platform === 'win32' && path.startsWith('~\\'));
  return homeRelative ? resolve(homedir(), path.slice(2)) : path;
 }
 /**
  * Parses YAML text into a validated config.
  *
  * @param source YAML document; an empty document is treated as an empty object.
  * @returns The validated configuration with defaults applied.
  * @throws When the YAML cannot be parsed or the content fails schema validation.
  */
 export function loadConfigFromString(source: string): AppConfig {
  const document: unknown = YAML.parse(source);
  const normalized = camelize(document ?? {});
  return configSchema.parse(normalized);
 }
 /**
  * Reads a config file from disk and validates it.
  *
  * @param path Config file location; a leading `~/` is expanded to the home directory.
  * @returns The validated configuration.
  */
 export async function loadConfig(path: string): Promise<AppConfig> {
  const location = expandHome(path);
  const source = await readFile(location, 'utf8');
  return loadConfigFromString(source);
 }
 /**
  * Validates an in-memory config object, accepting snake_case keys.
  *
  * @param config Unvalidated configuration object.
  * @returns The validated configuration with defaults applied.
  */
 export function normalizeConfig(config: PartialConfig): AppConfig {
  const camelCased = camelize(config);
  return configSchema.parse(camelCased);
 }
@@ -0,0 +1,34 @@
 import * as cheerio from 'cheerio';
 /**
  * Page fetcher injected into `enrichLink`. Resolves with the HTTP status, the page HTML and
  * `finalUrl` — presumably the URL after redirects; confirm against the implementation.
  */
 export type FetchPage = (
  url: string
 ) => Promise<{ status: number; finalUrl: string; html: string }>;
 /**
  * Outcome of enriching one link, discriminated by `status`:
  * `ok` carries the page title/description text, `dead` the HTTP status as a string,
  * `paywall` a fixed placeholder, and `unreachable` the error message from a failed fetch.
  */
 export type EnrichmentResult =
  | { status: 'ok'; titleMeta: string }
  | { status: 'dead'; error: string }
  | { status: 'paywall'; titleMeta: '[paywall]' }
  | { status: 'unreachable'; titleMeta: string; error: string };
 /** Words in a URL path or query that suggest a login or subscription wall. */
 const paywallMarkers = /login|signin|subscribe|paywall/i;

 /** True when the path or query of `rawUrl` contains a paywall marker; throws on invalid URLs. */
 function hasPaywallMarker(rawUrl: string): boolean {
  const parsed = new URL(rawUrl);
  return paywallMarkers.test(parsed.pathname + parsed.search);
 }

 /**
  * Fetches a link's destination page and summarises it for the catalog.
  *
  * @param url Clean link URL to enrich.
  * @param fetchPage Fetcher used to retrieve the page.
  * @returns `dead` for HTTP status >= 400, `paywall` when the fetch ended on a login or
  *   subscription URL that the requested URL did not already point at, `ok` with
  *   "title - description" otherwise, and `unreachable` when fetching or parsing throws.
  *   Never rejects.
  */
 export async function enrichLink(url: string, fetchPage: FetchPage): Promise<EnrichmentResult> {
  try {
    const response = await fetchPage(url);
    if (response.status >= 400) {
      return { status: 'dead', error: String(response.status) };
    }
    if (hasPaywallMarker(response.finalUrl)) {
      // Only a redirect that introduces the marker counts: an article whose own URL contains
      // "login" or "subscribe" (e.g. a tutorial) is not a wall.
      let requestedHadMarker = false;
      try {
        requestedHadMarker = hasPaywallMarker(url);
      } catch {
        // Requested URL is not parseable; treat it as having no marker.
      }
      if (!requestedHadMarker) {
        return { status: 'paywall', titleMeta: '[paywall]' };
      }
    }
    const $ = cheerio.load(response.html);
    const title = $('title').first().text().trim();
    const meta = $('meta[name="description"]').attr('content')?.trim() ?? '';
    // Empty parts are dropped so a page with only a title yields no dangling separator.
    return { status: 'ok', titleMeta: [title, meta].filter(Boolean).join(' - ') };
  } catch (error) {
    const message = error instanceof Error ? error.message : 'network_error';
    return { status: 'unreachable', titleMeta: `[unreachable: ${message}]`, error: message };
  }
 }
@@ -0,0 +1,59 @@
 import { createServer } from 'node:http';
 import { readFile, writeFile, mkdir } from 'node:fs/promises';
 import { dirname } from 'node:path';
 import open from 'open';
 import { google, gmail_v1 } from 'googleapis';
 import { expandHome } from '../config/config.js';
 import { NewsletterMessage } from '../parsing/types.js';
 // OAuth scope requested during authorization: read-only Gmail access.
 const gmailScopes = ['https://www.googleapis.com/auth/gmail.readonly'];
 /**
  * Creates an OAuth2 client for Gmail, reusing a saved token when one can be read.
  *
  * If the token file is missing or unreadable, the browser authorization flow is started, the
  * returned code is exchanged for tokens, and the tokens are saved to `tokenPath`.
  *
  * @param credentialsPath OAuth client JSON containing an `installed` or `web` client.
  * @param tokenPath Where the token JSON is read from and written to.
  * @returns The OAuth2 client with credentials set.
  * @throws Error when the credentials file has no usable OAuth client; file, JSON and token
  *   exchange errors from the interactive flow propagate unchanged.
  */
 export async function authorizeGmail(credentialsPath: string, tokenPath: string) {
  const credentials = JSON.parse(await readFile(expandHome(credentialsPath), 'utf8'));
  const clientConfig = credentials?.installed ?? credentials?.web;
  if (!clientConfig?.client_id) {
    // Fail with a readable message instead of a TypeError on `undefined.client_id`.
    throw new Error(
      `${credentialsPath} does not contain an "installed" or "web" OAuth client with a client_id`
    );
  }
  const oauth = new google.auth.OAuth2(
    clientConfig.client_id,
    clientConfig.client_secret,
    'http://127.0.0.1:53682/oauth2callback'
  );
  const resolvedTokenPath = expandHome(tokenPath);

  try {
    oauth.setCredentials(JSON.parse(await readFile(resolvedTokenPath, 'utf8')));
    return oauth;
  } catch {
    // No usable saved token: run the interactive browser flow.
    const url = oauth.generateAuthUrl({ access_type: 'offline', scope: gmailScopes });
    const code = await waitForBrowserCode(url);
    const { tokens } = await oauth.getToken(code);
    oauth.setCredentials(tokens);
    await mkdir(dirname(resolvedTokenPath), { recursive: true });
    // Tokens grant mailbox access, so a newly created token file is owner-readable only.
    // The mode only takes effect when the file is created, not when it already exists.
    await writeFile(resolvedTokenPath, `${JSON.stringify(tokens, null, 2)}\n`, { mode: 0o600 });
    return oauth;
  }
 }
 /**
  * Opens the authorization URL in the browser and waits for the OAuth redirect on
  * http://127.0.0.1:53682.
  *
  * @param url Authorization URL to open.
  * @returns The `code` query parameter from the redirect.
  * @throws Rejects when the redirect carries an `error` parameter, when the local server cannot
  *   start (for example the port is taken), or when the browser cannot be opened.
  */
 async function waitForBrowserCode(url: string): Promise<string> {
  return new Promise((resolveCode, reject) => {
    const server = createServer((req, res) => {
      const requestUrl = new URL(req.url ?? '/', 'http://127.0.0.1:53682');
      const code = requestUrl.searchParams.get('code');
      const denial = requestUrl.searchParams.get('error');
      // Ask the browser not to keep the socket open so server.close() can finish promptly.
      res.setHeader('Connection', 'close');
      if (code) {
        res.end('Newsletter Link Catalog authorization complete. You can close this tab.');
        server.close();
        resolveCode(code);
        return;
      }
      if (denial) {
        res.statusCode = 400;
        res.end('Newsletter Link Catalog authorization failed. You can close this tab.');
        server.close();
        reject(new Error(`Authorization failed: ${denial}`));
        return;
      }
      // Unrelated requests (e.g. favicon) get an answer instead of hanging forever.
      res.statusCode = 404;
      res.end();
    });
    // Without this handler a listen failure would crash the process as an unhandled event.
    server.on('error', reject);
    // Bind to loopback only; the redirect URI never needs to be reachable from the network.
    server.listen(53682, '127.0.0.1', () => {
      open(url).catch((error: unknown) => {
        server.close();
        reject(error);
      });
    });
  });
 }
 /** Thin wrapper around the Gmail API client used to fetch newsletter messages. */
 export class GmailClient {
  private readonly gmail: gmail_v1.Gmail;

  public constructor(gmail: gmail_v1.Gmail) {
    this.gmail = gmail;
  }

  /**
   * Placeholder for live Gmail traversal: lists the account's labels and returns no messages.
   * The run path takes injected messages for tests and smoke checks instead.
   *
   * @param _label Label to read from; currently unused.
   * @returns Always an empty list.
   */
  public async fetchMessages(_label: string): Promise<NewsletterMessage[]> {
    const request = { userId: 'me' };
    await this.gmail.users.labels.list(request);
    const messages: NewsletterMessage[] = [];
    return messages;
  }
 }
@@ -0,0 +1,9 @@
 import { createProgram } from './cli/program.js';
 /** Prints an `nlc: <message>` line on stderr and marks the process as failed. */
 const reportFailure = (error: unknown): void => {
  const message = error instanceof Error ? error.message : String(error);
  console.error(`nlc: ${message}`);
  process.exitCode = 1;
 };

 // Entry point: parse argv and route any rejection through the failure reporter.
 createProgram().parseAsync(process.argv).catch(reportFailure);
@@ -0,0 +1,28 @@
 import { ExtractedLink } from '../parsing/types.js';
 /** Hosts treated as social-profile links when they appear in footer context. */
 const socialHosts = ['twitter.com', 'x.com', 'facebook.com', 'linkedin.com', 'instagram.com'];
 /**
  * Reports whether the link title is exactly one of the "view in browser" style labels
  * (case-insensitive, surrounding whitespace ignored).
  */
 export function isMirrorLink(link: Pick<ExtractedLink, 'title'>): boolean {
  return /^(view in browser|view online|read online)$/i.test(link.title.trim());
 }
 /**
  * Extracts the hostname (without a leading `www.`) from an http(s) URL.
  * Returns an empty string for other schemes and for strings that do not parse as a URL.
  */
 function linkHost(url: string): string {
  if (!/^https?:/i.test(url)) {
    return '';
  }
  try {
    return new URL(url).hostname.replace(/^www\./, '');
  } catch {
    return '';
  }
 }
 /**
  * Reports whether a link is newsletter boilerplate rather than content: unsubscribe links,
  * share/forward prompts, "view in browser" mirrors, and social links found in footer context.
  *
  * Never throws: a malformed URL simply has no host and is judged on its text alone.
  *
  * @param link - Link fields to inspect; missing fields are treated as empty strings.
  */
 export function isNoiseLink(link: Partial<ExtractedLink>): boolean {
  const text = `${link.title ?? ''} ${link.context ?? ''}`.toLowerCase();
  const url = link.url ?? '';
  const host = linkHost(url);
  return (
    /unsubscribe/.test(text) ||
    /unsubscribe/i.test(url) ||
    /share this newsletter|forward to a friend/.test(text) ||
    isMirrorLink({ title: link.title ?? '' }) ||
    ((link.context ?? '').toLowerCase().includes('footer') &&
      // Match the host itself or a subdomain; a bare suffix test would treat netflix.com as x.com.
      socialHosts.some((site) => host === site || host.endsWith(`.${site}`)))
  );
 }
 /**
  * Reports whether a link looks sponsored, judged by sponsor-related keywords in its section,
  * surrounding context, or title (case-insensitive). Missing fields count as empty strings.
  */
 export function isSponsorLink(link: Partial<ExtractedLink>): boolean {
  const sponsorPattern = /sponsor|sponsored|advertisement|partner/i;
  const haystack = [link.section ?? '', link.context ?? '', link.title ?? ''].join(' ');
  return sponsorPattern.test(haystack);
 }
@@ -0,0 +1,57 @@
 import { ExtractedLink } from '../parsing/types.js';
 /** Options controlling how `cleanupUrl` normalizes a link. */
 export interface CleanupOptions {
  /** Query parameter names to strip; a trailing `*` makes the entry a prefix match. */
  trackingParams: string[];
  /** When true, a destination found in a known redirect query parameter replaces the URL. */
  unwrapRedirects?: boolean;
 }
 /**
  * Tests a query parameter name against a tracking pattern.
  * A pattern ending in `*` matches by prefix; any other pattern must equal the name exactly.
  */
 function matchesParam(name: string, pattern: string): boolean {
  if (!pattern.endsWith('*')) {
    return name === pattern;
  }
  const prefix = pattern.slice(0, -1);
  return name.startsWith(prefix);
 }
 /**
  * Follows a tracking/redirect wrapper by looking for the destination in well-known query
  * parameters (`url`, `u`, `target`, `redirect`, `redirect_url`, checked in that order).
  *
  * @param url - The wrapper URL.
  * @returns The first parameter value that parses as an absolute http(s) URL, or the original
  *   `url` object when no such parameter exists. Never throws on a malformed destination.
  */
 function unwrapProviderRedirect(url: URL): URL {
  for (const key of ['url', 'u', 'target', 'redirect', 'redirect_url']) {
    const destination = url.searchParams.get(key);
    if (!destination?.startsWith('http')) {
      continue;
    }
    try {
      const parsed = new URL(destination);
      if (parsed.protocol === 'http:' || parsed.protocol === 'https:') {
        return parsed;
      }
    } catch {
      // Value such as "http" or "http://" is not a usable URL; try the remaining parameters.
    }
  }
  return url;
 }
 /**
  * Normalizes a link: optionally unwraps a redirect wrapper, removes tracking query
  * parameters, drops the fragment, and strips a dangling trailing `?`.
  *
  * @param rawUrl - Absolute URL to normalize.
  * @param options - Tracking patterns and redirect-unwrapping switch.
  * @returns The normalized URL string.
  * @throws TypeError when `rawUrl` cannot be parsed by `new URL`.
  */
 export function cleanupUrl(rawUrl: string, options: CleanupOptions): string {
  const parsed = new URL(rawUrl);
  const target = options.unwrapRedirects ? unwrapProviderRedirect(parsed) : parsed;
  const isTracking = (name: string): boolean =>
    options.trackingParams.some((pattern) => matchesParam(name, pattern));
  // Snapshot the keys first: deleting while iterating the live view would skip entries.
  Array.from(target.searchParams.keys())
    .filter(isTracking)
    .forEach((name) => target.searchParams.delete(name));
  target.hash = '';
  const serialized = target.toString();
  if (serialized.endsWith('?')) {
    return serialized.slice(0, -1);
  }
  return serialized;
 }
 /**
  * Collapses "read more" style links into the link that immediately precedes them when both
  * share the same normalized URL.
  *
  * The input array and its elements are not mutated; each kept link is a shallow copy. When a
  * read-more link is merged, the kept link takes over that link's `url`.
  *
  * @param links - Links in document order.
  * @param readMorePattern - Pattern tested against each trimmed title. Global and sticky
  *   patterns are supported: `lastIndex` is reset before every test.
  * @returns The links with merged read-more entries removed.
  */
 export function mergeReadMoreLinks(
  links: ExtractedLink[],
  readMorePattern: RegExp
 ): ExtractedLink[] {
  const merged: ExtractedLink[] = [];
  for (const link of links) {
    const previous = merged.at(-1);
    const sameUrl = previous?.normalizedUrl && previous.normalizedUrl === link.normalizedUrl;
    // A /g or /y regex remembers where the last match ended; without this reset every
    // second read-more link would be tested from the wrong offset and kept by mistake.
    readMorePattern.lastIndex = 0;
    if (previous && sameUrl && readMorePattern.test(link.title.trim())) {
      previous.url = link.url;
      previous.normalizedUrl = link.normalizedUrl;
      continue;
    }
    merged.push({ ...link });
  }
  return merged;
 }
@@ -0,0 +1,79 @@
 import { ExtractedLink } from '../parsing/types.js';
 /** Contract for a backend that picks a category for a link. */
 export interface LlmProvider {
  /**
   * @param link - Link to classify.
   * @param categories - Candidate category names offered to the model.
   * @returns The model's reply, or `undefined` when the response carried no text.
   */
  categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined>;
 }
 /** Connection settings shared by the provider classes in this file. */
 interface ProviderOptions {
  /** API key; how it is sent depends on the provider (bearer token or `x-api-key`). */
  apiKey?: string;
  /** Overrides the provider's default endpoint root; `null`/`undefined` keeps the default. */
  baseUrl?: string | null;
  /** Model identifier passed through in the request body. */
  model: string;
 }
 /**
  * POSTs a JSON body and returns the parsed JSON response.
  *
  * @param url - Endpoint to call.
  * @param apiKey - Sent as a bearer token when provided; the header is omitted otherwise.
  * @param body - Value serialized with `JSON.stringify`.
  * @throws Error with the HTTP status when the response is not OK.
  */
 async function postJson(url: string, apiKey: string | undefined, body: unknown): Promise<any> {
  const headers: Record<string, string> = { 'content-type': 'application/json' };
  if (apiKey) {
    headers.authorization = `Bearer ${apiKey}`;
  }
  const response = await fetch(url, { method: 'POST', headers, body: JSON.stringify(body) });
  if (response.ok) {
    return response.json();
  }
  throw new Error(`LLM request failed: ${response.status}`);
 }
 /**
  * Builds the single-message instruction sent to the model: the candidate categories,
  * followed by the link's title and URL.
  */
 function prompt(link: ExtractedLink, categories: string[]): string {
  const choices = categories.join(', ');
  const subject = `${link.title} ${link.url}`;
  return `Choose the best newsletter category from ${choices} for: ${subject}. Return only the category.`;
 }
 /**
  * Categorizes links through a chat-completions endpoint, defaulting to
  * `https://api.openai.com/v1` when no base URL is configured.
  */
 export class OpenAiCompatibleProvider implements LlmProvider {
  public constructor(private readonly options: ProviderOptions) {}
  /**
   * Sends one user message at temperature 0 and returns the trimmed content of the first
   * choice, or `undefined` when the response has none.
   */
  public async categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined> {
    const { apiKey, baseUrl, model } = this.options;
    const endpoint = `${baseUrl ?? 'https://api.openai.com/v1'}/chat/completions`;
    const payload = {
      model,
      messages: [{ role: 'user', content: prompt(link, categories) }],
      temperature: 0
    };
    const data = await postJson(endpoint, apiKey, payload);
    return data.choices?.[0]?.message?.content?.trim();
  }
 }
 /** Named alias of `OpenAiCompatibleProvider`; adds no behavior of its own. */
 export class OpenAiProvider extends OpenAiCompatibleProvider {}
 /** Named alias of `OpenAiCompatibleProvider`; adds no behavior of its own. */
 export class LocalProvider extends OpenAiCompatibleProvider {}
 /**
  * Categorizes links through the `/v1/messages` endpoint, defaulting to
  * `https://api.anthropic.com` when no base URL is configured.
  */
 export class AnthropicProvider implements LlmProvider {
  public constructor(private readonly options: ProviderOptions) {}
  /**
   * Sends one user message capped at 64 output tokens and returns the trimmed text of the
   * first content entry, or `undefined` when the response has none.
   *
   * @throws Error with the HTTP status when the response is not OK.
   */
  public async categorize(link: ExtractedLink, categories: string[]): Promise<string | undefined> {
    const { apiKey, baseUrl, model } = this.options;
    const endpoint = `${baseUrl ?? 'https://api.anthropic.com'}/v1/messages`;
    const headers = {
      'content-type': 'application/json',
      'x-api-key': apiKey ?? '',
      'anthropic-version': '2023-06-01'
    };
    const payload = {
      model,
      max_tokens: 64,
      messages: [{ role: 'user', content: prompt(link, categories) }]
    };
    const response = await fetch(endpoint, {
      method: 'POST',
      headers,
      body: JSON.stringify(payload)
    });
    if (response.ok) {
      const data = await response.json();
      return data.content?.[0]?.text?.trim();
    }
    throw new Error(`Anthropic request failed: ${response.status}`);
  }
 }
@@ -0,0 +1,32 @@
 import { mkdir } from 'node:fs/promises';
 import { dirname } from 'node:path';
 import XLSX from 'xlsx';
 import { CatalogPayload, OutputWriter, sanitizeSheetName } from './sheets.js';
 /**
  * Writes the catalog to an `.xlsx` workbook: one sheet per source newsletter, followed by
  * a "Sponsored Links" sheet and a "Dead Links" sheet.
  */
 export class ExcelWriter implements OutputWriter {
  /** Names of the summary sheets appended after the per-newsletter sheets. */
  private static readonly summarySheets = ['Sponsored Links', 'Dead Links'];
  /** Excel's maximum worksheet name length; the workbook library rejects longer names. */
  private static readonly maxSheetNameLength = 31;
  public constructor(private readonly path: string) {}
  /**
   * Turns a newsletter name into a worksheet name the workbook will accept: invalid
   * characters removed, at most 31 characters, no leading/trailing apostrophes or spaces,
   * and never equal to one of the summary sheet names.
   */
  private static sheetNameFor(source: string): string {
    const name =
      sanitizeSheetName(source)
        .slice(0, ExcelWriter.maxSheetNameLength)
        .replace(/^[\s']+|[\s']+$/g, '') || 'Newsletter';
    const clashesWithSummary = ExcelWriter.summarySheets.some(
      (summary) => summary.toLowerCase() === name.toLowerCase()
    );
    // Summary names are short, so the suffixed name still fits within the length limit.
    return clashesWithSummary ? `${name} (newsletter)` : name;
  }
  /**
   * Builds the workbook from `payload` and writes it to the configured path, creating the
   * parent directory when needed.
   *
   * @param payload - Rows grouped by their `Source Newsletter` value, plus the sponsor and
   *   dead-link tables.
   */
  public async write(payload: CatalogPayload): Promise<void> {
    const workbook = XLSX.utils.book_new();
    const grouped = new Map<string, Record<string, unknown>[]>();
    for (const row of payload.rows) {
      const sheet = ExcelWriter.sheetNameFor(String(row['Source Newsletter'] ?? 'Newsletter'));
      const bucket = grouped.get(sheet);
      if (bucket) {
        bucket.push(row);
      } else {
        grouped.set(sheet, [row]);
      }
    }
    for (const [sheet, rows] of grouped) {
      XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet(rows), sheet);
    }
    XLSX.utils.book_append_sheet(
      workbook,
      XLSX.utils.json_to_sheet(payload.sponsors),
      'Sponsored Links'
    );
    XLSX.utils.book_append_sheet(
      workbook,
      XLSX.utils.json_to_sheet(payload.deadLinks),
      'Dead Links'
    );
    await mkdir(dirname(this.path), { recursive: true });
    XLSX.writeFile(workbook, this.path);
  }
 }
@@ -0,0 +1,15 @@
 import { google } from 'googleapis';
 import { CatalogPayload, OutputWriter } from './sheets.js';
 /**
  * Output writer targeting a Google Sheets spreadsheet.
  * Currently a placeholder: it only fetches the spreadsheet and appends nothing.
  */
 export class GoogleSheetsWriter implements OutputWriter {
  public constructor(
    private readonly spreadsheetId: string,
    private readonly auth: Parameters<typeof google.sheets>[0]['auth']
  ) {}
  /**
   * @param _payload - Not used by the current implementation.
   */
  public async write(_payload: CatalogPayload): Promise<void> {
    const sheets = google.sheets({ version: 'v4', auth: this.auth });
    await sheets.spreadsheets.get({ spreadsheetId: this.spreadsheetId });
    // Real row append calls are intentionally centralized here; tests use a fake writer.
  }
 }
@@ -0,0 +1,23 @@
 /** Characters that are not allowed in worksheet names: `: / \ ? * [ ]`. */
 const invalidSheetCharacters = /[:/\\?*[\]]/g;
 /**
  * Produces a worksheet-safe name: invalid characters become spaces, whitespace runs are
  * collapsed, and the result is truncated without leaving trailing whitespace.
  *
  * @param input - Raw name, typically a newsletter title.
  * @param maxLength - Maximum length of the result (values below 1 are treated as 1).
  *   Defaults to 100; pass 31 for Excel workbooks.
  * @returns The cleaned name, or `Newsletter` when nothing usable remains.
  */
 export function sanitizeSheetName(input: string, maxLength = 100): string {
  const limit = Math.max(1, Math.floor(maxLength));
  const cleaned = input.replace(invalidSheetCharacters, ' ').replace(/\s+/g, ' ').trim();
  // Truncation can cut right after a space, so trim again once the name is shortened.
  return (cleaned || 'Newsletter').slice(0, limit).trimEnd();
 }
 /**
  * Neutralizes spreadsheet formula injection: a string starting with `=`, `+`, `-`, `@`,
  * TAB, or carriage return is prefixed with an apostrophe so it is treated as text.
  *
  * @param value - Cell value; non-strings are returned unchanged.
  * @returns The escaped string, or the original value when no escaping is needed.
  */
 export function escapeCell(value: unknown): unknown {
  if (typeof value !== 'string') {
    return value;
  }
  return /^[=+\-@\t\r]/.test(value) ? `'${value}` : value;
 }
 /** Everything an output writer receives for one run; each entry maps column name to value. */
 export interface CatalogPayload {
  /** Catalogued content links. */
  rows: Record<string, unknown>[];
  /** Links classified as sponsored. */
  sponsors: Record<string, unknown>[];
  /** Links reported as dead. */
  deadLinks: Record<string, unknown>[];
 }
 /** Destination for a finished catalog (for example a workbook or a spreadsheet). */
 export interface OutputWriter {
  /** Persists the payload; the resolved value is not part of the contract. */
  write(payload: CatalogPayload): Promise<unknown>;
 }
@@ -0,0 +1,42 @@
 import * as cheerio from 'cheerio';
 import { ExtractedLink, ParserInput, ParserPlugin } from './types.js';
 /**
  * Finds a section label for an element: first the text of the nearest preceding sibling
  * heading or bold element, otherwise the text of the nearest heading, paragraph, or table
  * row preceding the element's parent.
  *
  * @returns The trimmed label, or `undefined` when neither lookup yields text.
  */
 function nearestSection($: cheerio.CheerioAPI, element: any): string | undefined {
  const node = $(element);
  const siblingLabel = node.prevAll('h1,h2,h3,h4,h5,h6,strong,b').first().text().trim();
  if (siblingLabel) {
    return siblingLabel;
  }
  const parentLabel = node.parent().prevAll('h1,h2,h3,h4,h5,h6,p,tr').first().text().trim();
  return parentLabel === '' ? undefined : parentLabel;
 }
 /**
  * Fallback parser that accepts any input and extracts every `<a href>` with a non-empty
  * href, together with its text, enclosing block text, and nearest section label.
  */
 export const genericParser: ParserPlugin = {
  name: 'generic',
  matches: () => true,
  parse(input: ParserInput): ExtractedLink[] {
    const $ = cheerio.load(input.html);
    const collapse = (text: string): string => text.replace(/\s+/g, ' ').trim();
    const links: ExtractedLink[] = [];
    for (const element of $('a[href]').toArray()) {
      const anchor = $(element);
      const url = anchor.attr('href') ?? '';
      if (!url) {
        continue;
      }
      // Visible text first; fall back to the accessibility label for icon-only anchors.
      const title = collapse(anchor.text()) || anchor.attr('aria-label') || '';
      const context = collapse(anchor.closest('p,li,td,div').text());
      links.push({
        url,
        title,
        description: context && context !== title ? context : '',
        sourceText: title,
        section: nearestSection($, element),
        context
      });
    }
    return links;
  }
 };
@@ -0,0 +1,17 @@
 import { genericParser } from './generic.js';
 import { ParserInput, ParserPlugin } from './types.js';
 /**
  * Parser selected for Substack newsletters. Detection looks for `substack.com` or a
  * `data-testid="post-preview"` marker in the list-id header, the from header, or the HTML;
  * extraction is delegated to the generic parser.
  */
 export const substackParser: ParserPlugin = {
  name: 'substack',
  matches(input: ParserInput) {
    const listId = input.headers?.listId ?? '';
    const sender = input.headers?.from ?? '';
    const haystack = [listId, sender, input.html].join(' ');
    return /substack\.com|data-testid="post-preview"/i.test(haystack);
  },
  parse(input: ParserInput) {
    const links = genericParser.parse(input);
    return links;
  }
 };
 /**
  * Picks the parser for a message: Substack when its detector matches, otherwise generic.
  */
 export function selectParser(input: ParserInput): ParserPlugin {
  for (const candidate of [substackParser, genericParser]) {
    if (candidate.matches(input)) {
      return candidate;
    }
  }
  return genericParser;
 }
@@ -0,0 +1,32 @@
 /** A single link pulled out of a newsletter, plus the text found around it. */
 export interface ExtractedLink {
  /** Link target as extracted from the message. */
  url: string;
  /** Cleaned form of `url`; compared when merging read-more links. */
  normalizedUrl?: string;
  /** Link label. */
  title: string;
  /** Longer descriptive text for the link, when available. */
  description?: string;
  /** Text the title was taken from. */
  sourceText?: string;
  /** Label of the newsletter section the link appeared under, when one was found. */
  section?: string;
  /** Text of the block surrounding the link. */
  context?: string;
  /** Sponsor name — presumably for sponsored links; TODO confirm against producers. */
  sponsor?: string;
  /** Sponsor flag — presumably set once a link is classified; TODO confirm. */
  isSponsor?: boolean;
 }
 /** One newsletter email as consumed by the run pipeline. */
 export interface NewsletterMessage {
  /** Message identifier distinct from `messageId` — presumably the mailbox id; TODO confirm. */
  id: string;
  /** Identifier used for processed-message tracking. */
  messageId: string;
  /** Sender, e.g. `Name <address>`; the newsletter name is derived from it. */
  from: string;
  /** Date string parseable by `Date`; the issue date is derived from it. */
  date: string;
  subject?: string;
  /** HTML body handed to the parsers. */
  html: string;
  /** Selected headers; parsers read `listId` and `from` when present. */
  headers?: Record<string, string | undefined>;
 }
 /** What a parser plugin receives: the message HTML and, optionally, selected headers. */
 export interface ParserInput {
  html: string;
  headers?: Record<string, string | undefined>;
 }
 /** A newsletter-format-specific link extractor. */
 export interface ParserPlugin {
  /** Identifier of the plugin (for example `generic` or `substack`). */
  name: string;
  /** Returns true when this plugin should handle the given input. */
  matches(input: ParserInput): boolean;
  /** Extracts the links from the input. */
  parse(input: ParserInput): ExtractedLink[];
 }
@@ -0,0 +1,117 @@
 import { normalizeConfig, PartialConfig } from '../config/config.js';
 import { Categorizer } from '../categorization/categorizer.js';
 import { isNoiseLink, isSponsorLink } from '../links/filtering.js';
 import { cleanupUrl, mergeReadMoreLinks } from '../links/url.js';
 import { OutputWriter } from '../output/sheets.js';
 import { selectParser } from '../parsing/plugins.js';
 import { NewsletterMessage } from '../parsing/types.js';
 import { StateStore } from '../state/state.js';
 /** Inputs for one `runCatalog` invocation. */
 export interface RunOptions {
  /** Partial configuration; normalized with defaults before use. */
  config: PartialConfig;
  /** Messages to process (injected by the caller). */
  messages: NewsletterMessage[];
  /** Writers that receive the final payload; skipped entirely during a dry run. */
  writers: OutputWriter[];
  /** Truthy disables output and state writes; a number also caps how many messages are used. */
  dryRun?: number | boolean;
  /** When true, messages already recorded as processed are processed again. */
  full?: boolean;
  /** NOTE(review): not read by `runCatalog` in this file — confirm where it is consumed. */
  skipEnrich?: boolean;
  /** NOTE(review): not read by `runCatalog` in this file — confirm where it is consumed. */
  enrichOnly?: boolean;
  /** Verbose-output flag. */
  verbose?: boolean;
 }
 /** Counters reported at the end of a run. */
 export interface RunSummary {
  newslettersProcessed: number;
  /** Number of catalog rows produced (sponsored links are counted separately). */
  linksExtracted: number;
  sponsors: number;
  deadLinks: number;
  /** Number of messages whose processing threw. */
  errors: number;
 }
 /**
  * Derives a newsletter name from a `From` header: the display name before `<address>` when
  * present, otherwise the whole header, with one surrounding pair of double quotes removed.
  */
 function newsletterName(from: string): string {
  const displayName = /^(.*?)\s*</.exec(from)?.[1];
  const candidate = displayName || from;
  return candidate.replace(/^"|"$/g, '').trim();
 }
 /**
  * Converts a date string to its `YYYY-MM-DD` calendar day in UTC.
  *
  * @throws RangeError when the string does not parse to a valid date.
  */
 function issueDate(date: string): string {
  const timestamp = new Date(date);
  const iso = timestamp.toISOString();
  return iso.substring(0, 10);
 }
 /**
  * Runs the catalog pipeline over the supplied messages: parse links, drop noise, normalize
  * URLs, merge read-more links, de-duplicate, split sponsored links from content rows, and
  * hand the result to every writer.
  *
  * Failure handling:
  * - A link whose URL cannot be processed is skipped; the rest of the message is kept.
  * - A message that throws contributes no rows, is counted in `errors`, and is not marked
  *   as processed, so a later run retries it.
  * - Messages are marked as processed only after every writer has succeeded, so a failed
  *   write does not lose their links.
  *
  * @param options - Configuration, messages, writers, and run flags. A truthy `dryRun`
  *   suppresses writer calls and state updates; a numeric `dryRun` also limits the messages.
  * @returns Counters for the run; `newslettersProcessed` excludes messages skipped because
  *   they were already recorded as processed.
  */
 export async function runCatalog(options: RunOptions): Promise<RunSummary> {
  const config = normalizeConfig(options.config);
  const state = new StateStore(config.stateFile);
  const categorizer = new Categorizer(config.categories.custom);
  const limit = typeof options.dryRun === 'number' ? options.dryRun : undefined;
  const messages = limit ? options.messages.slice(0, limit) : options.messages;
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);
  const rows: Record<string, unknown>[] = [];
  const sponsors: Record<string, unknown>[] = [];
  const completedIds: string[] = [];
  let attempted = 0;
  let errors = 0;
  for (const message of messages) {
    if (!options.full && !options.dryRun && (await state.isProcessed(message.messageId))) {
      continue;
    }
    attempted += 1;
    // Buffer per message so a failure part-way through leaves no partial rows behind.
    const messageRows: Record<string, unknown>[] = [];
    const messageSponsors: Record<string, unknown>[] = [];
    try {
      const parser = selectParser({ html: message.html, headers: message.headers });
      const parsed = parser.parse({ html: message.html, headers: message.headers });
      const cleaned: ((typeof parsed)[number] & { normalizedUrl: string })[] = [];
      for (const link of parsed) {
        try {
          if (!isNoiseLink(link)) {
            cleaned.push({
              ...link,
              normalizedUrl: cleanupUrl(link.url, {
                trackingParams: config.links.trackingParams,
                unwrapRedirects: config.links.unwrapRedirects
              })
            });
          }
        } catch (error: unknown) {
          // URL parsing rejects relative or malformed hrefs; one such link must not
          // discard every other link in the message.
          if (options.verbose) {
            console.error(`nlc: skipping link ${link.url}: ${describeError(error)}`);
          }
        }
      }
      const merged = config.links.mergeReadMore
        ? mergeReadMoreLinks(
            cleaned,
            new RegExp(config.links.readMorePattern.replace('(?i)', ''), 'i')
          )
        : cleaned;
      const unique = [...new Map(merged.map((link) => [link.normalizedUrl, link])).values()];
      for (const link of unique) {
        if (isSponsorLink(link)) {
          messageSponsors.push({
            Newsletter: newsletterName(message.from),
            Sponsor: link.title,
            Link: link.normalizedUrl,
            Description: link.description ?? ''
          });
          continue;
        }
        messageRows.push({
          'Issue Date': issueDate(message.date),
          Category: await categorizer.categorize(link),
          'Link URL': link.normalizedUrl,
          Title: link.title,
          Description: link.description ?? '',
          'Page Title + Meta': '',
          'Source Newsletter': newsletterName(message.from),
          'Also In': ''
        });
      }
      rows.push(...messageRows);
      sponsors.push(...messageSponsors);
      completedIds.push(message.messageId);
    } catch (error: unknown) {
      errors += 1;
      if (options.verbose) {
        console.error(`nlc: failed to process ${message.messageId}: ${describeError(error)}`);
      }
    }
  }
  if (!options.dryRun) {
    for (const writer of options.writers) {
      await writer.write({ rows, sponsors, deadLinks: [] });
    }
    // Record progress only once the output is safely written.
    for (const messageId of completedIds) {
      await state.markProcessed(messageId);
    }
  }
  return {
    newslettersProcessed: attempted,
    linksExtracted: rows.length,
    sponsors: sponsors.length,
    deadLinks: 0,
    errors
  };
 }
@@ -0,0 +1,38 @@
 import { mkdir, readFile, writeFile } from 'node:fs/promises';
 import { dirname } from 'node:path';
 import { expandHome } from '../config/config.js';
 /** Shape of the JSON document persisted by `StateStore`. */
 interface StateData {
  /** Identifiers of messages already processed. */
  processedMessageIds: string[];
  /** String-to-string map reserved for enrichment data; not read or written in this file. */
  enrichment: Record<string, string>;
 }
 /**
  * JSON-file-backed record of which messages have been processed.
  * Every call re-reads the file, so instances hold no cached state.
  */
 export class StateStore {
  public constructor(private readonly path: string) {}
  /**
   * Loads the state file.
   *
   * A missing or empty file yields a fresh empty state. Any other read failure, or a file
   * that is not valid JSON, is reported as an error instead of being treated as empty:
   * otherwise the next write would silently replace the existing history.
   */
  private async read(): Promise<StateData> {
    const path = expandHome(this.path);
    let raw: string;
    try {
      raw = await readFile(path, 'utf8');
    } catch (error: unknown) {
      const code = error instanceof Error && 'code' in error ? error.code : undefined;
      if (code === 'ENOENT') {
        return { processedMessageIds: [], enrichment: {} };
      }
      throw error;
    }
    if (raw.trim() === '') {
      return { processedMessageIds: [], enrichment: {} };
    }
    let parsed: Partial<StateData> | null;
    try {
      parsed = JSON.parse(raw) as Partial<StateData> | null;
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`State file ${path} is not valid JSON: ${reason}`);
    }
    // Tolerate documents written without one of the fields.
    return {
      processedMessageIds: Array.isArray(parsed?.processedMessageIds)
        ? parsed.processedMessageIds
        : [],
      enrichment: parsed?.enrichment ?? {}
    };
  }
  /** Writes the state as pretty-printed JSON, creating the parent directory when needed. */
  private async write(state: StateData): Promise<void> {
    const path = expandHome(this.path);
    await mkdir(dirname(path), { recursive: true });
    await writeFile(path, `${JSON.stringify(state, null, 2)}\n`);
  }
  /** Reports whether `messageId` has been recorded as processed. */
  public async isProcessed(messageId: string): Promise<boolean> {
    return (await this.read()).processedMessageIds.includes(messageId);
  }
  /** Records `messageId` as processed; does nothing when it is already recorded. */
  public async markProcessed(messageId: string): Promise<void> {
    const state = await this.read();
    if (!state.processedMessageIds.includes(messageId)) {
      state.processedMessageIds.push(messageId);
      await this.write(state);
    }
  }
 }
@@ -0,0 +1,18 @@
 import { describe, expect, it } from 'vitest';
 import { Categorizer } from '../src/categorization/categorizer.js';
 // Covers the two categorization paths exercised here: section header and keyword fallback.
 describe('categorization', () => {
  it('prefers newsletter section headers', async () => {
    const categorizer = new Categorizer();
    await expect(
      categorizer.categorize({ title: 'Anything', url: 'https://x.test', section: 'Rust' })
    ).resolves.toBe('Rust');
  });
  it('falls back to URL and keyword rules', async () => {
    const categorizer = new Categorizer();
    await expect(
      categorizer.categorize({ title: 'Kubernetes security guide', url: 'https://example.com/k8s' })
    ).resolves.toBe('DevOps');
  });
 });
@@ -0,0 +1,27 @@
 import { describe, expect, it } from 'vitest';
 import { validateDateFilters } from '../src/cli/flags.js';
 import { loadConfigFromString } from '../src/config/config.js';
 // Checks that defaults are filled in for a minimal config and that conflicting CLI date
 // filters are rejected.
 describe('config validation', () => {
  it('loads a valid YAML config with defaults', () => {
    const config = loadConfigFromString(`
 gmail:
  folder: Newsletters
 output:
  name: Newsletter Link Catalog
  excel:
    enabled: true
    path: ./output/catalog.xlsx
 `);
    expect(config.gmail.folder).toBe('Newsletters');
    expect(config.links.trackingParams).toContain('utm_*');
    expect(config.enrichment.concurrency).toBe(3);
  });
  it('rejects conflicting relative and absolute date filters', () => {
    expect(() => validateDateFilters({ last: '30d', from: '2026-01-01' })).toThrow(
      /cannot be combined/i
    );
  });
 });
@@ -0,0 +1,28 @@
 import { describe, expect, it } from 'vitest';
 import { enrichLink } from '../src/enrichment/enricher.js';
 // Drives enrichLink with stub fetchers to check the dead, paywall, and unreachable statuses.
 describe('enrichment', () => {
  it('marks dead, paywall, and unreachable links', async () => {
    // HTTP 404 -> dead.
    await expect(
      enrichLink('https://x.test/dead', async () => ({
        status: 404,
        finalUrl: 'https://x.test/dead',
        html: ''
      }))
    ).resolves.toMatchObject({
      status: 'dead'
    });
    // HTTP 200 that lands on a login page -> paywall.
    await expect(
      enrichLink('https://x.test/a', async () => ({
        status: 200,
        finalUrl: 'https://x.test/login',
        html: '<title>Login</title>'
      }))
    ).resolves.toMatchObject({ status: 'paywall' });
    // Fetcher rejection -> unreachable.
    await expect(
      enrichLink('https://x.test/a', async () => Promise.reject(new Error('timeout')))
    ).resolves.toMatchObject({
      status: 'unreachable'
    });
  });
 });
@@ -0,0 +1,26 @@
 import { describe, expect, it } from 'vitest';
 import { isNoiseLink, isSponsorLink } from '../src/links/filtering.js';
 // One positive case per noise rule: unsubscribe, footer social, share prompt, mirror link.
 describe('noise filtering', () => {
  it('filters unsubscribe, footer social, share, and mirror links', () => {
    expect(isNoiseLink({ url: 'https://x.test/unsubscribe', title: 'Unsubscribe' })).toBe(true);
    expect(
      isNoiseLink({ url: 'https://twitter.com/me', title: 'Twitter', context: 'footer' })
    ).toBe(true);
    expect(isNoiseLink({ url: 'https://x.test/share', title: 'Share this newsletter' })).toBe(true);
    expect(isNoiseLink({ url: 'https://x.test/view', title: 'View in browser' })).toBe(true);
  });
 });
 // Sponsor detection reads the section, context, and title text.
 describe('sponsor detection', () => {
  it('detects sponsor links from section and surrounding text', () => {
    expect(
      isSponsorLink({
        url: 'https://sponsor.example',
        title: 'Acme',
        section: 'Sponsored',
        context: 'Partner message'
      })
    ).toBe(true);
  });
 });
@@ -0,0 +1,13 @@
 import { describe, expect, it } from 'vitest';
 import { selectParser } from '../src/parsing/plugins.js';
 // Parser selection: a Substack list-id picks the Substack plugin, anything else is generic.
 describe('parser plugin selection', () => {
  it('selects Substack for Substack headers and generic otherwise', () => {
    expect(selectParser({ headers: { listId: 'thing.substack.com' }, html: '' }).name).toBe(
      'substack'
    );
    expect(
      selectParser({ headers: {}, html: '<a href="https://example.com">Example</a>' }).name
    ).toBe('generic');
  });
 });
@@ -0,0 +1,44 @@
 import { mkdtemp, rm } from 'node:fs/promises';
 import { tmpdir } from 'node:os';
 import { join } from 'node:path';
 import { afterEach, beforeEach, describe, expect, it } from 'vitest';
 import { runCatalog } from '../src/run/runCatalog.js';
 // Each test gets its own temporary directory for state and output files.
 let dir = '';
 beforeEach(async () => {
  dir = await mkdtemp(join(tmpdir(), 'nlc-run-'));
 });
 afterEach(async () => {
  await rm(dir, { force: true, recursive: true });
 });
 describe('run orchestration', () => {
  it('does not write output or state during dry run', async () => {
    const stateFile = join(dir, 'state.json');
    // The fake writer records every payload so the test can assert none was delivered.
    const writes: unknown[] = [];
    const result = await runCatalog({
      dryRun: 1,
      skipEnrich: true,
      config: {
        gmail: { folder: 'Newsletters' },
        output: { name: 'Catalog', excel: { enabled: true, path: join(dir, 'out.xlsx') } },
        stateFile
      },
      messages: [
        {
          id: 'msg-1',
          messageId: '<msg-1>',
          from: 'A <a@example.com>',
          date: '2026-05-16T00:00:00.000Z',
          html: '<h2>Python</h2><p><a href="https://example.com?utm_source=x">Article</a></p>'
        }
      ],
      writers: [{ write: async (payload) => writes.push(payload) }]
    });
    expect(result.linksExtracted).toBe(1);
    expect(writes).toHaveLength(0);
  });
 });
@@ -0,0 +1,15 @@
 import { describe, expect, it } from 'vitest';
 import { escapeCell, sanitizeSheetName } from '../src/output/sheets.js';
 // Sheet-name sanitizing and formula escaping for spreadsheet output.
 describe('sheet output helpers', () => {
  it('sanitizes and truncates sheet names', () => {
    const name = sanitizeSheetName('Bad:/\\\\?*[] name '.repeat(12));
    expect(name).not.toMatch(/[:/\\?*[\]]/);
    expect(name.length).toBeLessThanOrEqual(100);
  });
  it('escapes formula-like cell values', () => {
    expect(escapeCell('=IMPORTXML("http://bad")')).toBe('\'=IMPORTXML("http://bad")');
  });
 });
@@ -0,0 +1,25 @@
 import { mkdtemp, rm } from 'node:fs/promises';
 import { tmpdir } from 'node:os';
 import { join } from 'node:path';
 import { afterEach, beforeEach, describe, expect, it } from 'vitest';
 import { StateStore } from '../src/state/state.js';
 // Each test gets its own temporary directory for the state file.
 let dir = '';
 beforeEach(async () => {
  dir = await mkdtemp(join(tmpdir(), 'nlc-'));
 });
 afterEach(async () => {
  await rm(dir, { force: true, recursive: true });
 });
 describe('state persistence', () => {
  it('tracks processed messages incrementally', async () => {
    // The state file does not exist yet, so the first lookup must report "not processed".
    const store = new StateStore(join(dir, 'state.json'));
    expect(await store.isProcessed('msg-1')).toBe(false);
    await store.markProcessed('msg-1');
    expect(await store.isProcessed('msg-1')).toBe(true);
  });
 });
@@ -0,0 +1,38 @@
 import { describe, expect, it } from 'vitest';
 import { cleanupUrl, mergeReadMoreLinks } from '../src/links/url.js';
 import { ExtractedLink } from '../src/parsing/types.js';
 // Redirect unwrapping and tracking-parameter stripping in a single cleanupUrl call.
 describe('URL cleanup', () => {
  it('strips tracking parameters and unwraps supported redirect URLs', () => {
    const result = cleanupUrl(
      'https://newsletter.example/redirect?url=https%3A%2F%2Fexample.com%2Fpost%3Futm_source%3Dx%26id%3D1&mc_cid=abc',
      { trackingParams: ['utm_*', 'mc_cid'], unwrapRedirects: true }
    );
    expect(result).toBe('https://example.com/post?id=1');
  });
 });
 // A "Read more" link that follows a link with the same normalized URL is folded into it.
 describe('read-more merging', () => {
  it('merges a read-more link into the preceding link with the same normalized URL', () => {
    const links: ExtractedLink[] = [
      {
        url: 'https://example.com/a',
        normalizedUrl: 'https://example.com/a',
        title: 'Great article',
        description: 'A useful summary',
        sourceText: 'Great article',
        section: 'Python'
      },
      {
        url: 'https://example.com/a?utm_source=x',
        normalizedUrl: 'https://example.com/a',
        title: 'Read more',
        description: '',
        sourceText: 'Read more'
      }
    ];
    expect(mergeReadMoreLinks(links, /^(read more)$/i)).toHaveLength(1);
  });
 });
@@ -0,0 +1,15 @@
 {
  "compilerOptions": {
    "target": "ES2022",
    "module": "NodeNext",
    "moduleResolution": "NodeNext",
    "strict": true,
    "esModuleInterop": true,
    "forceConsistentCasingInFileNames": true,
    "skipLibCheck": true,
    "outDir": "dist",
    "rootDir": ".",
    "types": ["node", "vitest/globals"]
  },
  "include": ["src/**/*.ts", "tests/**/*.ts", "vitest.config.ts", "tsup.config.ts"]
 }
@@ -0,0 +1,12 @@
 import { defineConfig } from 'tsup';
 // Build configuration: bundles the CLI entry as ESM with type declarations and source maps.
 export default defineConfig({
  entry: ['src/index.ts'],
  format: ['esm'],
  dts: true,
  clean: true,
  sourcemap: true,
  // Shebang so the built entry file can be executed directly as a command.
  banner: {
    js: '#!/usr/bin/env node'
  }
 });
@@ -0,0 +1,9 @@
 import { defineConfig } from 'vitest/config';
 // Test runner configuration: Node environment, global test APIs, tests under tests/.
 export default defineConfig({
  test: {
    environment: 'node',
    globals: true,
    include: ['tests/**/*.test.ts']
  }
 });
		`@@ -0,0 +1,3 @@`
							`import { chmod } from 'node:fs/promises';`

							`await chmod('dist/index.js', 0o755).catch(() => undefined);`