diff --git a/CLAUDE.md b/CLAUDE.md index c38d0d9..3235e1b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -55,6 +55,7 @@ packages/ npx @nitpicker/cli crawl [...] [options] # Web サイトをクロールして .nitpicker ファイルを生成(複数 URL で multi-root) npx @nitpicker/cli crawl --append [--append ...] # 既存アーカイブに新しい起点 URL を追加クロール npx @nitpicker/cli crawl --retry-failed [--no-recursive] # 既存アーカイブの失敗ページ(status -1/NULL・content-type NULL・5xx)を再取得 +npx @nitpicker/cli crawl --inventory # URL リストファイルと既存アーカイブを突合、新規 URL のみ取り込み(孤立ページ・未使用ファイル発見用) npx @nitpicker/cli analyze [options] # .nitpicker ファイルに対して analyze プラグインを実行 npx @nitpicker/cli report [options] # .nitpicker ファイルから Google Sheets レポートを生成 npx @nitpicker/cli pipeline [options] # crawl → analyze → report を直列実行 @@ -67,6 +68,8 @@ npx @nitpicker/cli -v | --version # `@nitpicker/cli` の > **`--append `**: 位置引数で指定された既存 `.nitpicker` を開き、`--append` の URL を新しい起点として追加クロールする(`--append` は繰り返し指定で複数 URL 可)。新スコープに該当する旧 external ページは internal として再スクレイプされる。失敗時は `.bak` から自動復元、成功時は `.bak` 削除。`--resume` / `--diff` / `--output` / `--list` / `--list-file` / `--single` との同時指定は不可。 +> **`--inventory `**: 位置引数で指定された既存 `.nitpicker` を開き、URL リストファイル中の **アーカイブにまだ無い URL だけ** を取り込む。HTML は puppeteer でレンダリング + 再帰クロール、非 HTML は HEAD のみで `resources` に直接登録。新規 page/resource には `source` 列に `'inventory-seed'`(リスト直接由来)または `'inventory-discovered'`(seed からのリンク follow / puppeteer サブリソース)がラベリングされる。既存行は touch しない(2 回目以降の inventory pass は非破壊、`inventory-seed` 行が demote されることはない)。スコープ外 URL は警告 skip。失敗時は `.bak` から自動復元、成功時 `.bak` 削除。`query isolated-pages` / `query unused-resources` の入力データを増やすのが主用途。`--append` / `--retry-failed` / `--resume` / `--diff` / `--output` / `--list` / `--list-file` / `--single` との同時指定は不可。 +> > **`--retry-failed`**: 位置引数で指定された既存 `.nitpicker` を開き、前回クロールで失敗したページだけを pending に戻して再取得する。失敗の定義は `status = -1`(ハード失敗 sentinel)/ `status IS NULL` / `contentType IS NULL` / `status` が 5xx(4xx は確定応答なので対象外)。internal/external 両方が対象で、external は scope 判定により metadata-only として再取得される。実装は append 
と同じ「再オープン+`.bak`+`Crawler.resume()`+`crawling()`」フローだが、新起点を足す代わりに `Archive.resetFailedPages()`(→ `Database.resetFailedPages`)で失敗ページを `scraped=0` に戻し、archived roots を seed にして失敗ページを resumedPending 経由で処理する(external 失敗ページを scope へ誤登録しないため)。**recursive はフラグ値(デフォルト true)が優先され、アーカイブ作成時の recursive 設定は継承しない**(`crawling(list, { recursive })` で明示注入)。それ以外の設定(scope/excludes/userAgent 等)は archived 設定を流用し、明示指定したフラグのみ上書き。`--resume` / `--append` / `--diff` / `--output` / `--list` / `--list-file` / `--single` との同時指定は不可。 > > **Note**: 全ページが失敗していた(reset 後に `scraped=1` が 0 件になる)アーカイブでも取りこぼさないよう、`Crawler.start()` の resume 判定は `#resumedScraped` だけでなく `#resumedPending` も見る。`#resumedScraped` のみを見ると「全ページ失敗 retry」や「1ページもスクレイプせず中断した resume」を fresh crawl と誤判定し、pending を全部捨ててしまう。 diff --git a/README.md b/README.md index f40182b..c2c6a14 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,25 @@ npx @nitpicker/cli crawl existing.nitpicker --retry-failed - list-mode archive(`--list` / `--list-file` で作成)への append は不可 - `--resume` / `--diff` / `--output` / `--list` / `--list-file` / `--single` と同時指定不可 +### `--inventory`: サーバーファイルリストとの突合 + +サーバー側で取得した URL リストファイル(1 行 1 URL、空行 / `#` コメント可)と既存 +`.nitpicker` を突き合わせ、**まだアーカイブに無い URL だけを取り込む** モード。 +クロールでは到達できなかった「孤立 LP」「使われていない置きっぱなしファイル」を +浮かび上がらせる用途。 + +```sh +npx @nitpicker/cli crawl --inventory +``` + +- HTML 応答は puppeteer で描画して再帰クロールに乗せ、新規 page を `'inventory-seed'`、 + そこから follow したリンク先を `'inventory-discovered'` のラベルで保存する +- 非 HTML 応答(PDF / 画像 / CSS / JS …)は HEAD のみで `resources` に直接登録、ラベルは `'inventory-seed'` +- 既存 `pages` / `resources` にすでにある URL は skip(2 回目以降の `--inventory` は新規分だけ処理) +- スコープ外 URL は警告して skip +- 結果は `query isolated-pages` / `query unused-resources` で見るのが想定動線(後述) +- `--append` / `--retry-failed` / `--resume` / `--diff` / `--output` / `--list` / `--list-file` / `--single` と同時指定不可 + ### `--retry-failed`: 失敗ページの再取得 前回クロールで失敗したページだけを再取得する。「サーバ側の一時障害やタイムアウトで取りこぼしたページを、フルクロールし直さずに回収する」ためのモード。 diff --git 
a/packages/@nitpicker/cli/src/commands/crawl.spec.ts b/packages/@nitpicker/cli/src/commands/crawl.spec.ts index 27d4442..7c5e70e 100644 --- a/packages/@nitpicker/cli/src/commands/crawl.spec.ts +++ b/packages/@nitpicker/cli/src/commands/crawl.spec.ts @@ -13,6 +13,7 @@ const mockCrawling = vi.fn(); const mockResume = vi.fn(); const mockAppend = vi.fn(); const mockRetryFailed = vi.fn(); +const mockInventory = vi.fn(); vi.mock('@nitpicker/crawler', () => ({ CrawlerOrchestrator: { @@ -20,6 +21,7 @@ vi.mock('@nitpicker/crawler', () => ({ resume: mockResume, append: mockAppend, retryFailed: mockRetryFailed, + inventory: mockInventory, }, })); @@ -61,6 +63,7 @@ function createFlags(overrides: Partial = {}): CrawlFlags { resume: undefined, append: undefined, retryFailed: undefined, + inventory: undefined, interval: undefined, image: true, fetchExternal: true, @@ -115,6 +118,11 @@ function setupFakeOrchestrator() { return Promise.resolve(fakeOrchestrator); }); + mockInventory.mockImplementation((_path, _urls, _opts, cb) => { + cb?.(fakeOrchestrator, { baseUrl: 'https://example.com' }); + return Promise.resolve(fakeOrchestrator); + }); + return fakeOrchestrator; } @@ -752,6 +760,153 @@ describe('crawl', () => { expect(mockLog).toHaveBeenCalledWith('Options: %O', flags); }); + + it('--inventory フラグでアーカイブと URL リストを CrawlerOrchestrator.inventory に渡す', async () => { + mockReadList.mockResolvedValueOnce(['https://example.com/hidden']); + const { crawl } = await import('./crawl.js'); + + await crawl(['/tmp/test.nitpicker'], createFlags({ inventory: '/tmp/urls.txt' })); + + expect(mockInventory).toHaveBeenCalledWith( + '/tmp/test.nitpicker', + ['https://example.com/hidden'], + expect.any(Object), + expect.any(Function), + ); + }); + + it('--inventory で空ファイルの場合、エラーを投げる', async () => { + mockReadList.mockResolvedValueOnce([]); + const { crawl } = await import('./crawl.js'); + + await expect( + crawl(['/tmp/test.nitpicker'], createFlags({ inventory: '/tmp/empty.txt' })), + 
).rejects.toThrow('No URLs found in inventory file: /tmp/empty.txt'); + }); + + it('--inventory に無効な URL が含まれる場合、エラーを投げる', async () => { + mockReadList.mockResolvedValueOnce(['https://example.com', 'not-a-url']); + const { crawl } = await import('./crawl.js'); + + await expect( + crawl(['/tmp/test.nitpicker'], createFlags({ inventory: '/tmp/urls.txt' })), + ).rejects.toThrow('Invalid URL: "not-a-url"'); + }); + + it('--inventory と位置引数なしの場合、エラーを投げる', async () => { + const { crawl } = await import('./crawl.js'); + + await expect(crawl([], createFlags({ inventory: '/tmp/urls.txt' }))).rejects.toThrow( + '--inventory requires the archive path as the positional argument', + ); + }); + + it('--inventory と複数位置引数の場合、エラーを投げる', async () => { + const { crawl } = await import('./crawl.js'); + + await expect( + crawl( + ['/tmp/a.nitpicker', '/tmp/b.nitpicker'], + createFlags({ inventory: '/tmp/urls.txt' }), + ), + ).rejects.toThrow( + '--inventory takes exactly one positional argument (the archive path).', + ); + }); + + it('--inventory と --append の同時指定はエラー', async () => { + const { crawl } = await import('./crawl.js'); + + await expect( + crawl( + ['/tmp/test.nitpicker'], + createFlags({ + inventory: '/tmp/urls.txt', + append: ['https://example.com/new'], + }), + ), + ).rejects.toThrow('--inventory and --append cannot be used together'); + }); + + it('--inventory と --retry-failed の同時指定はエラー', async () => { + const { crawl } = await import('./crawl.js'); + + await expect( + crawl( + ['/tmp/test.nitpicker'], + createFlags({ inventory: '/tmp/urls.txt', retryFailed: true }), + ), + ).rejects.toThrow('--inventory and --retry-failed cannot be used together'); + }); + + it('--inventory と --resume の同時指定はエラー', async () => { + const { crawl } = await import('./crawl.js'); + + await expect( + crawl( + [], + createFlags({ inventory: '/tmp/urls.txt', resume: '/tmp/_nitpicker-stub' }), + ), + ).rejects.toThrow('--resume and --inventory cannot be used together'); + }); + + it('--inventory と 
--diff の同時指定はエラー', async () => { + const { crawl } = await import('./crawl.js'); + + await expect( + crawl( + ['/tmp/a.nitpicker', '/tmp/b.nitpicker'], + createFlags({ inventory: '/tmp/urls.txt', diff: true }), + ), + ).rejects.toThrow('--diff cannot be combined with --inventory'); + }); + + it('--inventory と --output の同時指定はエラー', async () => { + const { crawl } = await import('./crawl.js'); + + await expect( + crawl( + ['/tmp/test.nitpicker'], + createFlags({ inventory: '/tmp/urls.txt', output: '/tmp/out.nitpicker' }), + ), + ).rejects.toThrow('--output flag is not supported with --inventory'); + }); + + it('--inventory と --list-file の同時指定はエラー', async () => { + const { crawl } = await import('./crawl.js'); + + await expect( + crawl( + ['/tmp/test.nitpicker'], + createFlags({ inventory: '/tmp/urls.txt', listFile: '/tmp/list.txt' }), + ), + ).rejects.toThrow('--inventory cannot be combined with --list-file'); + }); + + it('--inventory と --list の同時指定はエラー', async () => { + const { crawl } = await import('./crawl.js'); + + await expect( + crawl( + ['/tmp/test.nitpicker'], + createFlags({ + inventory: '/tmp/urls.txt', + list: ['https://example.com'], + }), + ), + ).rejects.toThrow('--inventory cannot be combined with --list'); + }); + + it('--inventory と --single の同時指定はエラー', async () => { + const { crawl } = await import('./crawl.js'); + + await expect( + crawl( + ['/tmp/test.nitpicker'], + createFlags({ inventory: '/tmp/urls.txt', single: true }), + ), + ).rejects.toThrow('--inventory cannot be combined with --single'); + }); }); /** Sentinel error thrown by the process.exit mock to halt execution. 
*/ diff --git a/packages/@nitpicker/cli/src/commands/crawl.ts b/packages/@nitpicker/cli/src/commands/crawl.ts index 51588dd..0095f63 100644 --- a/packages/@nitpicker/cli/src/commands/crawl.ts +++ b/packages/@nitpicker/cli/src/commands/crawl.ts @@ -37,6 +37,10 @@ export const commandDef = { type: 'boolean', desc: 'Retry crawl: re-fetch failed pages (missing status/content-type or a 5xx status) in the positional archive; use --no-recursive to skip re-crawling newly found URLs', }, + inventory: { + type: 'string', + desc: 'Inventory crawl: take a server-side URL list file and import only URLs that the positional archive does not yet track. HTML responses are rendered + recursively crawled; non-HTML URLs are HEAD-probed and stored directly. Use with `query isolated-pages` / `unused-resources` to surface orphan pages / unused files.', + }, interval: { type: 'number', shortFlag: 'I', @@ -336,6 +340,57 @@ async function appendCrawl(archivePath: string, newUrls: string[], flags: CrawlF } } +/** + * Inventory-mode dispatch: read the URL list file, hand it to + * {@link CrawlerOrchestrator.inventory}, and surface the result through + * the same `run` progress reporter as the other crawl modes. + * + * The URL list file is parsed by `@d-zero/readtext/list`, which strips + * blank lines and `#` comments — same conventions as `--list-file`. + * @param archivePath - Path to the existing `.nitpicker` archive (positional). + * @param listFile - Path to the URL list file passed via `--inventory`. + * @param flags - Parsed CLI flags from the `crawl` command. 
/**
 * Run the `--inventory` crawl mode against an existing archive.
 *
 * The URL list file is read with `readList` (per the project's own notes it
 * follows the same conventions as `--list-file`: blank lines and `#`
 * comments are dropped). An empty list is rejected, every entry is passed
 * through `validateUrls`, and the remaining work is delegated to
 * `CrawlerOrchestrator.inventory`. Progress is surfaced through the shared
 * `run` reporter; reporter failures are collected and rethrown as one
 * `CrawlAggregateError` once the archive has been written and closed.
 * @param archivePath - Path to the existing `.nitpicker` archive (positional).
 * @param listFile - Path to the URL list file passed via `--inventory`.
 * @param flags - Parsed CLI flags from the `crawl` command.
 * @throws {Error} When the list file yields no URLs.
 * @throws {CrawlAggregateError} When the progress reporter recorded errors.
 */
async function inventoryCrawl(archivePath: string, listFile: string, flags: CrawlFlags) {
	const listPath = path.resolve(process.cwd(), listFile);
	const urls = await readList(listPath);
	if (urls.length === 0) {
		throw new Error(`No URLs found in inventory file: ${listFile}`);
	}
	validateUrls(urls);

	// Reporter failures are gathered here and raised only after the archive is safely closed.
	const collectedErrors: (CrawlerError | Error)[] = [];

	const inventoryConfig = {
		...mapFlagsToCrawlConfig(flags),
		list: false,
	};

	const orchestrator = await CrawlerOrchestrator.inventory(
		archivePath,
		urls,
		inventoryConfig,
		(startedOrchestrator, config) => {
			let logLevel: 'verbose' | 'silent' | 'normal';
			if (flags.verbose) {
				logLevel = 'verbose';
			} else if (flags.silent) {
				logLevel = 'silent';
			} else {
				logLevel = 'normal';
			}
			const title = `${archivePath} (inventory: ${listFile})`;
			run(title, startedOrchestrator, config, logLevel).catch((error) =>
				collectedErrors.push(error),
			);
		},
	);

	try {
		await orchestrator.write();
	} finally {
		// Always release the archive, even when writing fails.
		await orchestrator.archive.close();
		orchestrator.garbageCollect();
	}

	if (collectedErrors.length === 0) {
		return;
	}

	const aggregate = new CrawlAggregateError(collectedErrors);
	// eslint-disable-next-line no-console
	console.error(`\n${aggregate.message}`);
	throw aggregate;
}
Pick the existing-archive mode that fits your task.', ); } + if (hasInventoryFlag) { + throw new Error( + '--resume and --inventory cannot be used together. Pick the existing-archive mode that fits your task.', + ); + } await resumeCrawl(flags.resume, flags); return; } + if (hasInventoryFlag) { + if (hasAppendFlag) { + throw new Error( + '--inventory and --append cannot be used together. Pick the existing-archive mode that fits your task.', + ); + } + if (flags.retryFailed) { + throw new Error( + '--inventory and --retry-failed cannot be used together. Pick the existing-archive mode that fits your task.', + ); + } + if (flags.output) { + throw new Error( + '--output flag is not supported with --inventory. The archive path is the positional argument being inventoried.', + ); + } + if (flags.listFile) { + throw new Error('--inventory cannot be combined with --list-file.'); + } + if (hasListFlag) { + throw new Error('--inventory cannot be combined with --list.'); + } + if (flags.single) { + throw new Error('--inventory cannot be combined with --single.'); + } + if (args.length === 0) { + throw new Error( + '--inventory requires the archive path as the positional argument (usage: crawl --inventory ).', + ); + } + if (args.length > 1) { + throw new Error( + '--inventory takes exactly one positional argument (the archive path).', + ); + } + await inventoryCrawl(args[0]!, flags.inventory!, flags); + return; + } + if (hasAppendFlag) { if (flags.retryFailed) { throw new Error( diff --git a/packages/@nitpicker/cli/src/commands/pipeline.ts b/packages/@nitpicker/cli/src/commands/pipeline.ts index 4dbf050..bbf163b 100644 --- a/packages/@nitpicker/cli/src/commands/pipeline.ts +++ b/packages/@nitpicker/cli/src/commands/pipeline.ts @@ -239,6 +239,7 @@ export async function pipeline(args: string[], flags: PipelineFlags) { resume: undefined, append: [], retryFailed: false, + inventory: undefined, diff: undefined, }); } catch (error) { diff --git 
a/packages/@nitpicker/cli/src/query/dispatch-query.spec.ts b/packages/@nitpicker/cli/src/query/dispatch-query.spec.ts index be04d00..64e535f 100644 --- a/packages/@nitpicker/cli/src/query/dispatch-query.spec.ts +++ b/packages/@nitpicker/cli/src/query/dispatch-query.spec.ts @@ -23,6 +23,8 @@ vi.mock('@nitpicker/query', () => ({ pageUrls: [], total: 0, }), + listIsolatedPages: vi.fn().mockResolvedValue({ items: [], total: 0 }), + listUnusedResources: vi.fn().mockResolvedValue({ items: [], total: 0 }), ArchiveManager: vi.fn(), })); @@ -229,4 +231,30 @@ describe('dispatchQuery', () => { } as never), ).rejects.toThrow('Resource not found: https://missing.example.com/style.css'); }); + + it('dispatches isolated-pages sub-command with limit and offset', async () => { + const { listIsolatedPages } = await import('@nitpicker/query'); + const result = await dispatchQuery(mockAccessor, 'isolated-pages', { + limit: 50, + offset: 25, + } as never); + expect(result).toEqual({ items: [], total: 0 }); + expect(listIsolatedPages).toHaveBeenCalledWith(mockAccessor, { + limit: 50, + offset: 25, + }); + }); + + it('dispatches unused-resources sub-command with limit and offset', async () => { + const { listUnusedResources } = await import('@nitpicker/query'); + const result = await dispatchQuery(mockAccessor, 'unused-resources', { + limit: 10, + offset: 0, + } as never); + expect(result).toEqual({ items: [], total: 0 }); + expect(listUnusedResources).toHaveBeenCalledWith(mockAccessor, { + limit: 10, + offset: 0, + }); + }); }); diff --git a/packages/@nitpicker/cli/src/query/dispatch-query.ts b/packages/@nitpicker/cli/src/query/dispatch-query.ts index bacb920..469d4db 100644 --- a/packages/@nitpicker/cli/src/query/dispatch-query.ts +++ b/packages/@nitpicker/cli/src/query/dispatch-query.ts @@ -27,11 +27,13 @@ import { getTagInventory, getViolations, listImages, + listIsolatedPages, listLinks, listPages, listPagesByJsonLdType, listPagesByTag, listResources, + listUnusedResources, } from 
'@nitpicker/query'; import { mapFlagsToQueryOptions } from './map-flags-to-query-options.js'; @@ -164,6 +166,14 @@ export async function dispatchQuery( const { url } = options as { url: string }; return getPageJsonLdOverview(accessor, url); } + case 'isolated-pages': { + const { limit, offset } = options as { limit?: number; offset?: number }; + return listIsolatedPages(accessor, { limit, offset }); + } + case 'unused-resources': { + const { limit, offset } = options as { limit?: number; offset?: number }; + return listUnusedResources(accessor, { limit, offset }); + } case 'page-tags': { const { url } = options as { url: string }; return getPageTags(accessor, url); diff --git a/packages/@nitpicker/cli/src/query/map-flags-to-query-options.spec.ts b/packages/@nitpicker/cli/src/query/map-flags-to-query-options.spec.ts index 1496613..2d056f1 100644 --- a/packages/@nitpicker/cli/src/query/map-flags-to-query-options.spec.ts +++ b/packages/@nitpicker/cli/src/query/map-flags-to-query-options.spec.ts @@ -210,4 +210,25 @@ describe('mapFlagsToQueryOptions', () => { url: 'https://example.com/style.css', }); }); + + it('returns limit/offset only for isolated-pages (no required filters)', () => { + expect(mapFlagsToQueryOptions('isolated-pages', { limit: 50, offset: 10 })).toEqual({ + limit: 50, + offset: 10, + }); + }); + + it('returns limit/offset only for unused-resources (no required filters)', () => { + expect(mapFlagsToQueryOptions('unused-resources', { limit: 25, offset: 0 })).toEqual({ + limit: 25, + offset: 0, + }); + }); + + it('passes through undefined limit/offset for isolated-pages so query helper defaults apply', () => { + expect(mapFlagsToQueryOptions('isolated-pages', {})).toEqual({ + limit: undefined, + offset: undefined, + }); + }); }); diff --git a/packages/@nitpicker/cli/src/query/map-flags-to-query-options.ts b/packages/@nitpicker/cli/src/query/map-flags-to-query-options.ts index a4b5acf..6de3bc5 100644 --- 
a/packages/@nitpicker/cli/src/query/map-flags-to-query-options.ts +++ b/packages/@nitpicker/cli/src/query/map-flags-to-query-options.ts @@ -237,6 +237,14 @@ export function mapFlagsToQueryOptions( } return { url: flags.url }; } + case 'isolated-pages': + case 'unused-resources': { + // Pagination-only — no required filters. + return { + limit: flags.limit, + offset: flags.offset, + }; + } default: { const _exhaustive: never = subCommand; throw new Error(`Unknown sub-command: ${String(_exhaustive)}`); diff --git a/packages/@nitpicker/cli/src/query/types.ts b/packages/@nitpicker/cli/src/query/types.ts index 80beede..e820dbe 100644 --- a/packages/@nitpicker/cli/src/query/types.ts +++ b/packages/@nitpicker/cli/src/query/types.ts @@ -22,7 +22,9 @@ export type QuerySubCommand = | 'page-tags' | 'count-pages-by-tag' | 'count-pages-by-jsonld-type' - | 'page-jsonld-overview'; + | 'page-jsonld-overview' + | 'isolated-pages' + | 'unused-resources'; /** * List of all valid query sub-command names. @@ -49,4 +51,6 @@ export const VALID_SUB_COMMANDS = [ 'count-pages-by-tag', 'count-pages-by-jsonld-type', 'page-jsonld-overview', + 'isolated-pages', + 'unused-resources', ] as const satisfies readonly QuerySubCommand[]; diff --git a/packages/@nitpicker/crawler/src/archive/archive.ts b/packages/@nitpicker/crawler/src/archive/archive.ts index 0f3b6e6..584c14c 100644 --- a/packages/@nitpicker/crawler/src/archive/archive.ts +++ b/packages/@nitpicker/crawler/src/archive/archive.ts @@ -1,4 +1,4 @@ -import type { Config } from './types.js'; +import type { Config, PageSource } from './types.js'; import type { PageData, CrawlerError, Resource } from '../utils/types/types.js'; import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url'; @@ -137,6 +137,29 @@ export default class Archive extends ArchiveAccessor { async getCrawlingState() { return this.#db.getCrawlingState(); } + /** + * Return the subset of `urls` that already exist as `pages.url`. 
/**
 * Report which of the candidate URLs are already stored as `pages.url`.
 *
 * Pure delegation to `Database.getExistingPageUrls`; this method adds no
 * filtering or normalisation of its own.
 * @param urls - Candidate URLs to look up.
 * @returns The candidates that are already present in `pages`.
 */
async getExistingPageUrls(urls: readonly string[]): Promise<string[]> {
	const knownPageUrls = await this.#db.getExistingPageUrls(urls);
	return knownPageUrls;
}

/**
 * Report which of the candidate URLs are already stored as `resources.url`.
 *
 * Resource-side counterpart of `getExistingPageUrls`; pure delegation to
 * `Database.getExistingResourceUrls`.
 * @param urls - Candidate URLs to look up.
 * @returns The candidates that are already present in `resources`.
 */
async getExistingResourceUrls(urls: readonly string[]): Promise<string[]> {
	const knownResourceUrls = await this.#db.getExistingResourceUrls(urls);
	return knownResourceUrls;
}
/**
 * Store metadata for an external page.
 *
 * Forwards to `Database.updatePage` with both boolean arguments set to
 * `false`.
 * NOTE(review): the two booleans presumably mean "store HTML snapshot" and
 * "is crawl target" — confirm against `Database.updatePage`.
 * @param pageInfo - The page data to store.
 * @param source - Provenance label forwarded to the database; when omitted the
 *   database layer decides the value.
 */
async setExternalPage(pageInfo: PageData, source?: PageSource) {
	const { href } = pageInfo.url;
	dbLog('Set external page: %s', href);
	await this.#db.updatePage(pageInfo, false, false, source);
}

/**
 * Store a crawled page.
 *
 * Forwards to `Database.updatePage` with the first boolean set to `true` and
 * the second taken from `pageInfo.isTarget`.
 * @param pageInfo - The page data to store.
 * @param source - Provenance label forwarded to the database; when omitted the
 *   database layer decides the value.
 * @returns The value returned by `Database.updatePage` (the stored page's
 *   database ID).
 */
async setPage(pageInfo: PageData, source?: PageSource): Promise<number> {
	const { href } = pageInfo.url;
	dbLog('Set page: %s', href);
	const pageId = await this.#db.updatePage(pageInfo, true, pageInfo.isTarget, source);
	return pageId;
}

/**
 * Store a sub-resource (CSS, JS, image, etc.).
 * @param resource - The resource data to store.
 * @param source - Provenance label forwarded to `Database.insertResource`;
 *   when omitted the database layer decides the value.
 */
async setResources(resource: Resource, source?: PageSource) {
	const { href } = resource.url;
	dbLog('Set resource: %s', href);
	await this.#db.insertResource(resource, source);
}
/**
 * Delete a temporary fixture path, tolerating the case where it is already gone.
 *
 * Specs call this before the SQLite file has been created (to start from a
 * clean slate) as well as during teardown, so a missing path must resolve
 * quietly instead of rejecting.
 * @param filePath - Absolute path to the fixture file or directory.
 */
async function removeIfExists(filePath: string): Promise<void> {
	// `force` ignores a non-existent path; `recursive` lets the same call remove a directory.
	const rmOptions = { force: true, recursive: true };
	await fs.rm(filePath, rmOptions);
}
url)).toBeDefined(); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); @@ -377,7 +389,7 @@ describe('snapshot 付与: 非HTML / 空html にスナップショットを作 expect(await getRefByUrl(db, url)).toBeUndefined(); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); @@ -399,7 +411,7 @@ describe('snapshot 付与: 非HTML / 空html にスナップショットを作 expect(Buffer.from(after!.hash).equals(Buffer.from(before!.hash))).toBe(true); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); }); @@ -443,7 +455,7 @@ describe('content-type の正規化(#72)', () => { expect(pages.some((p) => p.url === url)).toBe(true); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); }); @@ -582,7 +594,7 @@ describe('re-scrape: 同一ページの再 updatePage', () => { expect(hrefs).toEqual(['http://localhost/target-a', 'http://localhost/target-c']); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); @@ -653,7 +665,7 @@ describe('re-scrape: 同一ページの再 updatePage', () => { expect(Number(imageRow.c)).toBe(1); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); @@ -744,7 +756,7 @@ describe('re-scrape: 同一ページの再 updatePage', () => { expect(refsAfter.map((r) => r.url)).not.toContain(oldUrl); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); @@ -864,7 +876,7 @@ describe('re-scrape: 同一ページの再 updatePage', () => { expect(srcRefs).toHaveLength(0); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); @@ -925,7 +937,7 @@ describe('re-scrape: 同一ページの再 updatePage', () => { expect(Number(row.c)).toBe(3); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); @@ -977,7 +989,7 @@ describe('re-scrape: 同一ページの再 updatePage', () => { expect(Number(row.c)).toBe(2); } finally { await db.destroy(); - await 
remove(dbPath); + await removeIfExists(dbPath); } }); }); @@ -1066,7 +1078,7 @@ describe('recordRedirect: 宛先を再保存せず辺だけ記録する(#73) expect(sourcePage.redirectDestId).toBe(destPage.id); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); @@ -1085,7 +1097,7 @@ describe('recordRedirect: 宛先を再保存せず辺だけ記録する(#73) expect(page.redirectDestId).toBeNull(); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); @@ -1117,7 +1129,7 @@ describe('recordRedirect: 宛先を再保存せず辺だけ記録する(#73) expect(middle.redirectDestId).toBe(destPage.id); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); @@ -1150,7 +1162,7 @@ describe('recordRedirect: 宛先を再保存せず辺だけ記録する(#73) expect(Number(after.c)).toBe(0); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); @@ -1171,7 +1183,7 @@ describe('recordRedirect: 宛先を再保存せず辺だけ記録する(#73) expect(Number(row.c)).toBe(0); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); @@ -1194,7 +1206,7 @@ describe('recordRedirect: 宛先を再保存せず辺だけ記録する(#73) expect(Number(row.c)).toBe(0); } finally { await db.destroy(); - await remove(dbPath); + await removeIfExists(dbPath); } }); }); @@ -2543,3 +2555,127 @@ describe('getResourceByUrl', () => { } }); }); + +describe('getExistingPageUrls / getExistingResourceUrls', () => { + it('returns an empty array when called with an empty input', async () => { + const dbPath = path.resolve(workingDir, 'existing-empty.sqlite'); + await removeIfExists(dbPath); + const db = await Database.connect({ filename: dbPath }); + try { + expect(await db.getExistingPageUrls([])).toEqual([]); + expect(await db.getExistingResourceUrls([])).toEqual([]); + } finally { + await db.destroy(); + await removeIfExists(dbPath); + } + }); + + it('returns only the candidate URLs that already exist in pages', async () => { + const dbPath = path.resolve(workingDir, 
'existing-pages.sqlite'); + await removeIfExists(dbPath); + const db = await Database.connect({ filename: dbPath }); + try { + await db.updatePage( + { + url: parseUrl('http://localhost/known')!, + redirectPaths: [], + isExternal: false, + status: 200, + statusText: 'OK', + contentLength: 1, + contentType: 'text/html', + responseHeaders: {}, + meta: { title: 'known' }, + anchorList: [], + imageList: [], + html: '', + isSkipped: false, + }, + true, + true, + ); + + const result = await db.getExistingPageUrls([ + 'http://localhost/known', + 'http://localhost/unknown', + ]); + expect(result.toSorted()).toEqual(['http://localhost/known']); + } finally { + await db.destroy(); + await removeIfExists(dbPath); + } + }); + + it('returns only the candidate URLs that already exist in resources', async () => { + const dbPath = path.resolve(workingDir, 'existing-resources.sqlite'); + await removeIfExists(dbPath); + const db = await Database.connect({ filename: dbPath }); + try { + await db.insertResource({ + url: parseUrl('http://localhost/known.css')!, + isExternal: false, + isError: false, + status: 200, + statusText: 'OK', + contentType: 'text/css', + contentLength: 100, + compress: false, + cdn: false, + headers: {}, + }); + + const result = await db.getExistingResourceUrls([ + 'http://localhost/known.css', + 'http://localhost/unknown.css', + ]); + expect(result.toSorted()).toEqual(['http://localhost/known.css']); + } finally { + await db.destroy(); + await removeIfExists(dbPath); + } + }); + + it('handles input arrays larger than the chunk size (500)', async () => { + const dbPath = path.resolve(workingDir, 'existing-chunked.sqlite'); + await removeIfExists(dbPath); + const db = await Database.connect({ filename: dbPath }); + try { + // Insert 750 pages (1.5× chunk size) so getExistingPageUrls has to + // iterate the eachSplitted batches at least twice and merge + // results across them. 
+ for (let i = 0; i < 750; i++) { + await db.updatePage( + { + url: parseUrl(`http://localhost/page-${i}`)!, + redirectPaths: [], + isExternal: false, + status: 200, + statusText: 'OK', + contentLength: 1, + contentType: 'text/html', + responseHeaders: {}, + meta: { title: `page-${i}` }, + anchorList: [], + imageList: [], + html: '', + isSkipped: false, + }, + true, + true, + ); + } + + // Probe with 1000 candidates — 750 known, 250 unknown — split + // across multiple chunk batches inside the helper. + const candidates: string[] = []; + for (let i = 0; i < 1000; i++) { + candidates.push(`http://localhost/page-${i}`); + } + const result = await db.getExistingPageUrls(candidates); + expect(result.length).toBe(750); + } finally { + await db.destroy(); + await removeIfExists(dbPath); + } + }); +}); diff --git a/packages/@nitpicker/crawler/src/archive/database.ts b/packages/@nitpicker/crawler/src/archive/database.ts index 9d123b8..18662f6 100644 --- a/packages/@nitpicker/crawler/src/archive/database.ts +++ b/packages/@nitpicker/crawler/src/archive/database.ts @@ -9,6 +9,7 @@ import type { DB_Resource, DatabaseEvent, PageFilter, + PageSource, } from './types.js'; import type { PageData, Resource } from '../utils/types/types.js'; import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url'; @@ -48,6 +49,7 @@ import { migrateCrawlErrors } from './migrate-crawl-errors.js'; import { migrateHtmlBlobTables } from './migrate-html-blob-tables.js'; import { migrateInfoRoots } from './migrate-info-roots.js'; import { migratePageErrors } from './migrate-page-errors.js'; +import { migratePagesResourcesSource } from './migrate-pages-resources-source.js'; import { redirectTable } from './redirect-table.js'; import { resolveRedirectChain } from './resolve-redirect-chain.js'; @@ -376,6 +378,58 @@ export class Database extends EventEmitter { pending, }; } + /** + * Return the subset of `urls` that already exist in the `pages` table. 
+ * Chunked into batches so SQLite's `IN (?, ?, …)` parameter limit + * (`SQLITE_MAX_VARIABLE_NUMBER`; 999 before SQLite 3.32.0) cannot be hit even when the + * inventory list contains tens of thousands of URLs. + * + * Read-only — no transaction, no lock contention with the crawler write + * pipeline (callers run this BEFORE the `.bak` backup is taken and the + * crawl is started). + * @param urls - URL strings to probe (already in `withoutHashAndAuth` form). + * @returns URLs found in `pages`. Order is not preserved. + */ + @ErrorEmitter() + async getExistingPageUrls(urls: readonly string[]): Promise<string[]> { + if (urls.length === 0) { + return []; + } + const found: string[] = []; + await eachSplitted([...urls], 500, async (chunk) => { + const rows = await this.#instance + .select('url') + .from('pages') + .whereIn('url', chunk); + for (const row of rows) { + found.push(row.url); + } + }); + return found; + } + /** + * Return the subset of `urls` that already exist in the `resources` table. + * See {@link Database.getExistingPageUrls} — same chunking strategy. + * @param urls - URL strings to probe. + * @returns URLs found in `resources`. + */ + @ErrorEmitter() + async getExistingResourceUrls(urls: readonly string[]): Promise<string[]> { + if (urls.length === 0) { + return []; + } + const found: string[] = []; + await eachSplitted([...urls], 500, async (chunk) => { + const rows = await this.#instance + .select('url') + .from('resources') + .whereIn('url', chunk); + for (const row of rows) { + found.push(row.url); + } + }); + return found; + } /** * Reads the HTML snapshot stored as a zstd-compressed BLOB for the given page. * @@ -405,6 +459,7 @@ export class Database extends EventEmitter { } return decodeStoredBlob(row.body, row.codec); } + /** * Retrieves all `page_jsonld` rows for the given page id, parsed back into * {@link JsonLdRow} shape (with `parsed` deserialised from its JSON column). 
@@ -896,11 +951,17 @@ export class Database extends EventEmitter { /** * Inserts a sub-resource into the `resources` table. * Ignores duplicate URLs (uses `ON CONFLICT IGNORE`). + * + * The `source` provenance label is written ONLY on insert; an + * `ON CONFLICT IGNORE` collision leaves an existing row's source untouched + * (this is what makes a second `crawl --inventory` non-destructive — see + * the inventory plan). * @param resource - The resource data to insert. + * @param source - Provenance label for new rows. `undefined` leaves the DB DEFAULT (`'crawled'`). */ @ErrorEmitter() @retry(retrySetting) - async insertResource(resource: Resource) { + async insertResource(resource: Resource, source?: PageSource) { await this.#instance .from('resources') .insert({ @@ -915,6 +976,7 @@ export class Database extends EventEmitter { compress: resource.compress || 0, cdn: resource.cdn || 0, responseHeaders: JSON.stringify(resource.headers), + ...(source === undefined ? {} : { source }), }) .onConflict('url') .ignore(); @@ -1319,6 +1381,10 @@ export class Database extends EventEmitter { * metadata-only scrapes never carry HTML and must not perturb an already * stored body. * @param isTarget - Whether this page is a crawl target. + * @param source - Provenance label written when the row is freshly + * inserted, or when an existing row still carries the default `'crawled'` + * label (see the `CASE` in `#insertPage`). Rows already labelled with an + * inventory value keep it, so a second `crawl --inventory` cannot demote them. * @returns The database `pageId` of the inserted/updated row. 
*/ @ErrorEmitter() @@ -1327,6 +1393,7 @@ export class Database extends EventEmitter { page: PageData, writeHtml: boolean, isTarget: boolean, + source?: PageSource, ): Promise { const { destUrl, sources } = resolveRedirectChain( page.url.withoutHashAndAuth, @@ -1347,6 +1414,7 @@ export class Database extends EventEmitter { }, isTarget, trx, + source, ); // Wappalyzer tag detection is HTML-body independent (relies on @@ -1460,11 +1528,23 @@ export class Database extends EventEmitter { /** * Returns the database ID for a URL, creating a new page row if needed. * Uses `ON CONFLICT IGNORE` to handle race conditions in concurrent inserts. + * + * `source` is written ONLY on the INSERT path — when the row already + * exists, we never reach the INSERT and the existing row's `source` + * stays untouched. This is what keeps a second `crawl --inventory` from + * "demoting" a page that was first labelled `'inventory-seed'` back to + * `'inventory-discovered'` on later passes. * @param url * @param isExternal * @param trx + * @param source - Provenance label to put on the newly-inserted row. `undefined` lets the DB DEFAULT (`'crawled'`) apply. */ - async #getIdByUrl(url: string, isExternal?: 0 | 1, trx?: Knex.Transaction) { + async #getIdByUrl( + url: string, + isExternal?: 0 | 1, + trx?: Knex.Transaction, + source?: PageSource, + ) { const qb = trx ?? this.#instance; const [record] = await qb.select('id').from('pages').where('url', url); // Must use `?` because it may be `undefined` @@ -1478,6 +1558,7 @@ export class Database extends EventEmitter { scraped: 0, isTarget: 0, ...(isExternal != null && { isExternal }), + ...(source === undefined ? 
{} : { source }), }) .onConflict('url') .ignore(); @@ -1523,6 +1604,7 @@ export class Database extends EventEmitter { await migratePageErrors(this.#instance); await migrateCrawlErrors(this.#instance); await migrateHtmlBlobTables(this.#instance); + await migratePagesResourcesSource(this.#instance); } /** * Replaces the page's JSON-LD / SpeculationRules rows with the freshly @@ -1583,17 +1665,48 @@ export class Database extends EventEmitter { } /** * Upserts page data into the `pages` table (inserts if new, updates if existing). + * + * `source` is first written at INSERT time inside `#getIdByUrl`. The UPDATE + * below only promotes a row that still carries the default `'crawled'` label; + * rows already labelled with an inventory value are left as they are. * @param page * @param isTarget * @param trx + * @param source - Inventory provenance. Written on INSERT; on UPDATE it only replaces the default `'crawled'` label. */ - async #insertPage(page: PageData, isTarget: boolean, trx?: Knex.Transaction) { + async #insertPage( + page: PageData, + isTarget: boolean, + trx?: Knex.Transaction, + source?: PageSource, + ) { const qb = trx ?? this.#instance; - const pageId = await this.#getIdByUrl(page.url.withoutHashAndAuth, undefined, trx); + const pageId = await this.#getIdByUrl( + page.url.withoutHashAndAuth, + undefined, + trx, + source, + ); const flat = deriveFlatFromMeta(page.meta, page.url.href); const denorm = computePageDenormalized(page.meta); const extras = deriveMetaExtras(page.meta); const now = Date.now(); + // Source promotion on UPDATE: when an inventory-mode scrape lands on + // a row that was created earlier as a placeholder (e.g. an anchor + // from a seed page pointed at this URL and `#getIdByUrl` inserted a + // row with the DB DEFAULT `'crawled'`), bump the label to the + // inventory variant. But never demote an already-inventoried row — + // `CASE WHEN source = 'crawled' THEN ? ELSE source END` keeps a + // previously labelled `'inventory-seed'` or `'inventory-discovered'` + // row intact on a second pass. 
+ const sourceUpdate = + source === undefined + ? {} + : { + source: qb.raw("CASE WHEN source = 'crawled' THEN ? ELSE source END", [ + source, + ]), + }; await qb('pages') .where('id', pageId) .update({ @@ -1633,6 +1746,7 @@ export class Database extends EventEmitter { firstCrawledAt: qb.raw('COALESCE(firstCrawledAt, ?)', [now]), lastCrawledAt: now, isSkipped: page.isSkipped, + ...sourceUpdate, }); return pageId; } diff --git a/packages/@nitpicker/crawler/src/archive/init-schema.ts b/packages/@nitpicker/crawler/src/archive/init-schema.ts index 73b2cd1..bf7a7ba 100644 --- a/packages/@nitpicker/crawler/src/archive/init-schema.ts +++ b/packages/@nitpicker/crawler/src/archive/init-schema.ts @@ -193,6 +193,14 @@ export async function initSchema(instance: Knex) { t.string('skipReason'); t.integer('order').unsigned().nullable(); + // Provenance: which channel inserted this row. Values: + // 'crawled' — discovered via the recursive crawl from one of `info.roots` + // 'inventory-seed' — supplied directly by `crawl --inventory` URL list + // 'inventory-discovered' — found by following links from an `inventory-seed` page + // Used by `listIsolatedPages` only for badge display; isolation + // itself is judged by `anchors.hrefId IS NULL`, not by source. + t.string('source').notNullable().defaultTo('crawled'); + t.index('isExternal'); t.index('contentType'); t.index('scraped'); @@ -205,6 +213,7 @@ export async function initSchema(instance: Knex) { // skipped. t.index('robots_noindex'); t.index('og_type'); + t.index('source'); }) .createTable('anchors', (t) => { t.increments('id'); @@ -243,6 +252,13 @@ export async function initSchema(instance: Knex) { t.string('compress').nullable(); t.string('cdn').nullable(); t.json('responseHeaders').nullable(); + // See `pages.source` for the provenance taxonomy. 
`inventory-seed` + // rows here come from non-HTML URLs handed in by + // `crawl --inventory`; `inventory-discovered` rows are sub-resources + // pulled in while puppeteer rendered an inventory-seed page. + t.string('source').notNullable().defaultTo('crawled'); + + t.index('source'); }) .createTable('resources-referrers', (t) => { t.increments('id'); diff --git a/packages/@nitpicker/crawler/src/archive/migrate-pages-resources-source.spec.ts b/packages/@nitpicker/crawler/src/archive/migrate-pages-resources-source.spec.ts new file mode 100644 index 0000000..3a13217 --- /dev/null +++ b/packages/@nitpicker/crawler/src/archive/migrate-pages-resources-source.spec.ts @@ -0,0 +1,189 @@ +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +import knex from 'knex'; +import { afterEach, describe, expect, it } from 'vitest'; + +import { LibsqlDialect } from './libsql-dialect.js'; +import { migratePagesResourcesSource } from './migrate-pages-resources-source.js'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const workingDir = path.resolve(__dirname, '__mock__'); + +const FIXTURES = [ + 'migrate-source-both.sqlite', + 'migrate-source-idempotent.sqlite', + 'migrate-source-partial.sqlite', + 'migrate-source-empty.sqlite', + 'migrate-source-resources-only.sqlite', + 'migrate-source-pages-only.sqlite', + 'migrate-source-default-backfill.sqlite', +]; + +/** + * Build a knex instance against a temp SQLite file that simulates a v0.10 + * archive predating the inventory feature: pages / resources tables exist + * with their original column set but no `source` column. + * @param fileName - Name of the SQLite file relative to workingDir. + * @returns The connected knex instance and full file path. 
+ */ +async function buildLegacySchema(fileName: string) { + const filename = path.resolve(workingDir, fileName); + await fs.rm(filename, { force: true }); + const instance = knex({ + client: LibsqlDialect as never, + connection: { filename }, + useNullAsDefault: true, + }); + await instance.schema.createTable('pages', (t) => { + t.increments('id'); + t.string('url').notNullable().unique(); + t.boolean('scraped').notNullable().defaultTo(0); + t.boolean('isTarget').notNullable().defaultTo(0); + }); + await instance.schema.createTable('resources', (t) => { + t.increments('id'); + t.string('url').notNullable().unique(); + t.integer('status'); + }); + return { instance, filename }; +} + +afterEach(async () => { + for (const name of FIXTURES) { + await fs.rm(path.resolve(workingDir, name), { force: true }); + } +}); + +describe('migratePagesResourcesSource', () => { + it('adds source columns to both pages and resources and backfills existing rows with the default', async () => { + const { instance } = await buildLegacySchema('migrate-source-both.sqlite'); + await instance('pages').insert({ + url: 'https://example.com/', + scraped: 1, + isTarget: 1, + }); + await instance('resources').insert({ url: 'https://example.com/a.css', status: 200 }); + + await migratePagesResourcesSource(instance); + + expect(await instance.schema.hasColumn('pages', 'source')).toBe(true); + expect(await instance.schema.hasColumn('resources', 'source')).toBe(true); + + const [pageRow] = await instance.select('source').from('pages'); + const [resourceRow] = await instance.select('source').from('resources'); + expect(pageRow.source).toBe('crawled'); + expect(resourceRow.source).toBe('crawled'); + + await instance.destroy(); + }); + + it('is idempotent — calling twice on a migrated schema is a no-op', async () => { + const { instance } = await buildLegacySchema('migrate-source-idempotent.sqlite'); + + await migratePagesResourcesSource(instance); + await migratePagesResourcesSource(instance); + + 
expect(await instance.schema.hasColumn('pages', 'source')).toBe(true); + expect(await instance.schema.hasColumn('resources', 'source')).toBe(true); + + await instance.destroy(); + }); + + it('handles partial migration recovery (pages.source already added, resources.source missing)', async () => { + const { instance } = await buildLegacySchema('migrate-source-partial.sqlite'); + // Simulate a previous run that added pages.source but crashed before + // touching resources. + await instance.schema.table('pages', (t) => { + t.string('source').notNullable().defaultTo('crawled'); + }); + + await migratePagesResourcesSource(instance); + + expect(await instance.schema.hasColumn('pages', 'source')).toBe(true); + expect(await instance.schema.hasColumn('resources', 'source')).toBe(true); + + await instance.destroy(); + }); + + it('returns silently when neither pages nor resources table exists (fresh DB)', async () => { + const filename = path.resolve(workingDir, 'migrate-source-empty.sqlite'); + await fs.rm(filename, { force: true }); + const instance = knex({ + client: LibsqlDialect as never, + connection: { filename }, + useNullAsDefault: true, + }); + + await expect(migratePagesResourcesSource(instance)).resolves.toBeUndefined(); + + await instance.destroy(); + }); + + it('runs migration on resources only when pages table is absent', async () => { + const filename = path.resolve(workingDir, 'migrate-source-resources-only.sqlite'); + await fs.rm(filename, { force: true }); + const instance = knex({ + client: LibsqlDialect as never, + connection: { filename }, + useNullAsDefault: true, + }); + await instance.schema.createTable('resources', (t) => { + t.increments('id'); + t.string('url').notNullable().unique(); + }); + + await migratePagesResourcesSource(instance); + + expect(await instance.schema.hasColumn('resources', 'source')).toBe(true); + expect(await instance.schema.hasTable('pages')).toBe(false); + + await instance.destroy(); + }); + + it('runs migration on pages 
only when resources table is absent (symmetric case)', async () => { + const filename = path.resolve(workingDir, 'migrate-source-pages-only.sqlite'); + await fs.rm(filename, { force: true }); + const instance = knex({ + client: LibsqlDialect as never, + connection: { filename }, + useNullAsDefault: true, + }); + await instance.schema.createTable('pages', (t) => { + t.increments('id'); + t.string('url').notNullable().unique(); + t.boolean('scraped').notNullable().defaultTo(0); + t.boolean('isTarget').notNullable().defaultTo(0); + }); + + await migratePagesResourcesSource(instance); + + expect(await instance.schema.hasColumn('pages', 'source')).toBe(true); + expect(await instance.schema.hasTable('resources')).toBe(false); + + await instance.destroy(); + }); + + it('lets DEFAULT crawled fill in for INSERTs that omit the source column', async () => { + const { instance } = await buildLegacySchema( + 'migrate-source-default-backfill.sqlite', + ); + + await migratePagesResourcesSource(instance); + + // INSERT after migration without specifying source — DEFAULT must apply. + await instance('pages').insert({ + url: 'https://example.com/post-migrate', + scraped: 1, + isTarget: 0, + }); + const [pageRow] = await instance + .select('source') + .from('pages') + .where('url', 'https://example.com/post-migrate'); + expect(pageRow.source).toBe('crawled'); + + await instance.destroy(); + }); +}); diff --git a/packages/@nitpicker/crawler/src/archive/migrate-pages-resources-source.ts b/packages/@nitpicker/crawler/src/archive/migrate-pages-resources-source.ts new file mode 100644 index 0000000..061feed --- /dev/null +++ b/packages/@nitpicker/crawler/src/archive/migrate-pages-resources-source.ts @@ -0,0 +1,50 @@ +import type { Knex } from 'knex'; + +/** + * Add `pages.source` and `resources.source` columns (provenance taxonomy: + * `crawled` / `inventory-seed` / `inventory-discovered`) and their indexes + * to archives that pre-date the `crawl --inventory` feature. 
+ * + * Idempotent: tables that are absent or already have the column are skipped. SQLite's + * `ALTER TABLE ADD COLUMN` with a NOT NULL DEFAULT applies the default to + * every existing row at column-add time, so no explicit `UPDATE` is needed + * to backfill — pre-existing rows become `'crawled'` automatically. + * + * Runs only on writer-side {@link Database.connect}; read-only viewer + * attaches skip this so the user's tmpDir is never rewritten. + * @param instance - The Knex query builder instance connected to the database. + */ +export async function migratePagesResourcesSource(instance: Knex): Promise<void> { + const hasPages = await instance.schema.hasTable('pages'); + const hasResources = await instance.schema.hasTable('resources'); + if (!hasPages && !hasResources) { + return; + } + const hasPagesSource = hasPages && (await instance.schema.hasColumn('pages', 'source')); + const hasResourcesSource = + hasResources && (await instance.schema.hasColumn('resources', 'source')); + if (hasPagesSource && hasResourcesSource) { + return; + } + + const changes: string[] = []; + if (hasPages && !hasPagesSource) { + await instance.schema.table('pages', (t) => { + t.string('source').notNullable().defaultTo('crawled'); + t.index('source'); + }); + changes.push('pages.source added'); + } + if (hasResources && !hasResourcesSource) { + await instance.schema.table('resources', (t) => { + t.string('source').notNullable().defaultTo('crawled'); + t.index('source'); + }); + changes.push('resources.source added'); + } + if (changes.length === 0) { + return; + } + // eslint-disable-next-line no-console + console.error(`[migrate] ${changes.join(', ')}`); +} diff --git a/packages/@nitpicker/crawler/src/archive/types.ts b/packages/@nitpicker/crawler/src/archive/types.ts index 66d7a3c..bf0aa0c 100644 --- a/packages/@nitpicker/crawler/src/archive/types.ts +++ b/packages/@nitpicker/crawler/src/archive/types.ts @@ -51,6 +51,29 @@ export interface Config extends Required ignoreRobots: boolean; } +/** + * 
Provenance of a page or resource row — which crawler channel originally + * inserted it. Stored as `pages.source` / `resources.source` in the + * SQLite schema (NOT NULL DEFAULT `'crawled'`). + * + * - `'crawled'` — discovered via the recursive crawl rooted at `info.roots`. + * Default for pre-`--inventory` archives after the + * `migratePagesResourcesSource` runtime migration. + * - `'inventory-seed'` — supplied directly by a `crawl --inventory` URL + * list. For pages this is the HTML URL that was rendered; for resources + * this is a non-HTML URL handed in by the list (HEAD-fetched without + * rendering). + * - `'inventory-discovered'` — found by following links from an + * `inventory-seed` page, OR (for resources) loaded by puppeteer while + * rendering one of those pages. + * + * Used by the viewer as a badge and to indicate why a row was added. + * Isolation queries (`listIsolatedPages` / `listUnusedResources`) judge + * orphans by `referrer = 0`, NOT by this value — `source` only labels + * the row. + */ +export type PageSource = 'crawled' | 'inventory-seed' | 'inventory-discovered'; + /** * Filter type for querying pages from the database. * @@ -233,6 +256,8 @@ export interface DB_Page { skipReason: string | null; /** The natural URL sort order index, or null if not yet assigned. */ order: number | null; + /** Provenance of the row — see {@link PageSource}. */ + source: PageSource; } /** @@ -398,6 +423,8 @@ export interface DB_Resource { cdn: string | 0; /** JSON-serialized HTTP response headers, or null if not available. */ responseHeaders: string | null; + /** Provenance of the row — see {@link PageSource}. 
*/ + source: PageSource; } /** diff --git a/packages/@nitpicker/crawler/src/crawler-orchestrator.ts b/packages/@nitpicker/crawler/src/crawler-orchestrator.ts index 8be9146..c44a362 100644 --- a/packages/@nitpicker/crawler/src/crawler-orchestrator.ts +++ b/packages/@nitpicker/crawler/src/crawler-orchestrator.ts @@ -1,5 +1,7 @@ import type { Config } from './archive/types.js'; +import type { InventoryMode } from './crawler/types.js'; import type { CrawlEvent } from './types.js'; +import type { PageData } from './utils/types/types.js'; import type { ExURL } from '@d-zero/shared/parse-url'; import { copyFile, unlink as unlinkFile } from 'node:fs/promises'; @@ -14,6 +16,9 @@ import pkg from '../package.json' with { type: 'json' }; import Archive from './archive/archive.js'; import { clearDestinationCache } from './crawler/clear-destination-cache.js'; import Crawler from './crawler/crawler.js'; +import { fetchDestination } from './crawler/fetch-destination.js'; +import { findScopeEntry } from './crawler/find-scope-entry.js'; +import { isHtmlContentType } from './crawler/is-html-content-type.js'; import { crawlerLog, log } from './debug.js'; import { normalizeToArray } from './normalize-to-array.js'; import { resolveOutputPath } from './resolve-output-path.js'; @@ -80,6 +85,13 @@ interface CrawlConfig extends Config { /** Whether to ignore robots.txt restrictions. */ ignoreRobots: boolean; + + /** + * Inventory-mode runtime configuration (see {@link InventoryMode}). Set + * by {@link CrawlerOrchestrator.inventory}; the default crawl path leaves + * this `null` so new rows are labelled `'crawled'` by the DB DEFAULT. + */ + inventoryMode: InventoryMode | null; } /** @@ -180,6 +192,11 @@ export class CrawlerOrchestrator extends EventEmitter { ); return row ? 
resourceRowToLookupResult(row) : null; }, + // Inventory mode is opted into by `CrawlerOrchestrator.inventory` + // (see T3); the default crawl path stays in normal mode so new + // rows continue to land in pages/resources with the DB DEFAULT + // `'crawled'` provenance label. + inventoryMode: options?.inventoryMode ?? null, }); } @@ -226,15 +243,15 @@ export class CrawlerOrchestrator extends EventEmitter { void this.emit('error', error); }); - this.#crawler.on('page', ({ result }) => { + this.#crawler.on('page', ({ result, source }) => { writeQueue - .enqueue(() => this.#archive.setPage(result)) + .enqueue(() => this.#archive.setPage(result, source)) .catch((error) => reject(error)); }); - this.#crawler.on('externalPage', ({ result }) => { + this.#crawler.on('externalPage', ({ result, source }) => { writeQueue - .enqueue(() => this.#archive.setExternalPage(result)) + .enqueue(() => this.#archive.setExternalPage(result, source)) .catch((error) => reject(error)); }); @@ -257,9 +274,9 @@ export class CrawlerOrchestrator extends EventEmitter { void this.emit('redirect', { result }); }); - this.#crawler.on('response', ({ resource }) => { + this.#crawler.on('response', ({ resource, source }) => { writeQueue - .enqueue(() => this.#archive.setResources(resource)) + .enqueue(() => this.#archive.setResources(resource, source)) .catch((error) => reject(error)); }); @@ -530,6 +547,290 @@ export class CrawlerOrchestrator extends EventEmitter { } } + /** + * Inventory mode: cross-reference a user-supplied URL list against an + * existing `.nitpicker` archive and import ONLY the URLs that are not yet + * tracked there. Designed to surface "orphan" landing pages that link + * graph traversal could not reach, and "unused" server-side files that + * no crawled page references — both of which the + * `listIsolatedPages` / `listUnusedResources` queries can then list. + * + * Flow: + * + * 1. Open the archive (writer mode, takes the archive lock). + * 2. 
Reject list-mode archives — they hold metadata-only rows that + * inventory has no business touching. + * 3. Reject archives with unfinished `pending` URLs — those would inherit + * the inventory `source` label by mistake. Operator must resume / + * retry-failed first. + * 4. Parse the URL list. Anything outside the archived scope is warned + * and skipped (inventory is per-server by design). + * 5. Subtract URLs that already exist in `pages` or `resources` so the + * second (and N-th) inventory pass is a no-op for known rows — keeps + * `'inventory-seed'` rows from being silently demoted. + * 6. Make `.bak`. Anything thrown beyond this point restores + * from the backup. + * 7. HEAD-probe each novel URL. Responses classified as HTML are queued + * as Crawler seeds (`'inventory-seed'`); everything else is recorded + * in `resources` directly as `'inventory-seed'` (no browser launch). + * 8. If any HTML seeds exist, start a Crawler with + * `inventoryMode = { seedUrls }` so the rendered page and every newly + * discovered downstream link is labelled correctly. `resume` is fed + * the existing `scraped` / `resources` sets so links into already- + * crawled pages stop at the seen-gate without re-rendering. + * 9. Drop the backup on success; restore it on any throw. + * + * Mutually exclusive with `--append` / `--retry-failed` / `--resume` / + * `--diff` / `--list` / `--list-file` / `--single` / `--output` — the + * CLI dispatch enforces this; this method assumes the caller honoured + * the contract. + * @param archivePath - Absolute or cwd-relative path to the `.nitpicker` archive. + * @param inventoryUrls - Pre-read URL list (one URL per element). + * @param options - Optional config overrides — most callers leave this blank and let the archived config flow through. + * @param initializedCallback - Hook invoked once the orchestrator is constructed but before `crawling` runs (the CLI uses it to attach progress reporting). 
+ * @returns The orchestrator instance after a successful inventory pass. + * @throws {Error} When `inventoryUrls` is empty, the archive is in list mode, or pending URLs from a previous crawl remain unresolved. + */ + static async inventory( + archivePath: string, + inventoryUrls: string[], + options?: Partial<CrawlConfig>, + initializedCallback?: CrawlInitializedCallback, + ) { + if (inventoryUrls.length === 0) { + throw new Error('inventory: URL list is empty'); + } + const cwd = options?.cwd ?? process.cwd(); + const absFilePath = path.isAbsolute(archivePath) + ? archivePath + : path.resolve(cwd, archivePath); + + const archive = await Archive.open({ filePath: absFilePath, cwd }); + try { + const archived = await archive.getConfig(); + if (archived.fromList) { + throw new Error( + 'Cannot run inventory on a list-mode archive: this archive was created with --list/--list-file and contains metadata-only pages. Create a fresh archive instead.', + ); + } + + const { scraped, pending } = await archive.getCrawlingState(); + if (pending.length > 0) { + throw new Error( + `inventory: archive has ${pending.length} pending URLs from a previous crawl. Resume or retry-failed first so inventory does not mislabel them as 'inventory-discovered'.`, + ); + } + + // Parse + scope-classify the candidate URLs. sortUrl drops + // unparseable strings; findScopeEntry separates in-scope from + // out-of-scope. + const parsedAll = sortUrl(inventoryUrls, archived); + const scopeMap = new Map<string, ExURL[]>(); + for (const raw of archived.roots) { + const parsed = parseUrl(raw, archived); + if (!parsed) continue; + const existing = scopeMap.get(parsed.hostname) ?? 
[]; + scopeMap.set(parsed.hostname, [...existing, parsed]); + } + const inScope: ExURL[] = []; + let outOfScope = 0; + for (const url of parsedAll) { + if (findScopeEntry(url, scopeMap, archived) === null) { + outOfScope++; + } else { + inScope.push(url); + } + } + if (outOfScope > 0) { + log( + '[inventory] %d URL(s) skipped (outside archived scope: %O)', + outOfScope, + archived.roots, + ); + } + + // Drop URLs that are already represented in the archive (either + // as pages or resources). Comparison key is `withoutHashAndAuth` + // to mirror what `#getIdByUrl` / `insertResource` actually store. + // Two independent reads — Promise.all halves the wait on large + // archives where each `WHERE url IN (?)` chunk costs real I/O. + const candidateUrls = inScope.map((u) => u.withoutHashAndAuth); + const [existingPageUrlList, existingResourceUrlList] = await Promise.all([ + archive.getExistingPageUrls(candidateUrls), + archive.getExistingResourceUrls(candidateUrls), + ]); + const existingPageUrls = new Set(existingPageUrlList); + const existingResourceUrls = new Set(existingResourceUrlList); + const novelUrls = inScope.filter((u) => { + const key = u.withoutHashAndAuth; + return !existingPageUrls.has(key) && !existingResourceUrls.has(key); + }); + const knownCount = inScope.length - novelUrls.length; + log( + '[inventory] %d in-scope, %d already in archive, %d new', + inScope.length, + knownCount, + novelUrls.length, + ); + + if (novelUrls.length === 0) { + // Nothing to do — release the archive cleanly without taking a + // backup. The orchestrator returned here is empty; the caller + // should only invoke `close` on it. 
+ const noopConfig: Config = { + ...archived, + ...cleanObject(options), + }; + const orchestrator = new CrawlerOrchestrator(archive, noopConfig); + if (initializedCallback) { + await initializedCallback(orchestrator, noopConfig); + } + return orchestrator; + } + + const backupPath = absFilePath + '.bak'; + await copyFile(absFilePath, backupPath); + + try { + // HEAD each novel URL once. HTML responses become seeds for + // the recursive crawl; everything else is recorded straight + // into `resources` so a PDF or stray asset registered on the + // server still shows up as "exists but not referenced" in + // `listUnusedResources`. + // + // Probes run concurrently — fetchDestination is a single + // HTTP HEAD with no shared mutable state, so N URLs no longer + // cost N × HEAD-latency wall-clock. The trade-off is that we + // blast the target server with novelUrls.length parallel + // requests; for the inventory use case (one-off audit on a + // site we control) this is acceptable, and an internal cap + // can be added later if it becomes an issue. + type HeadResult = + | { url: ExURL; head: PageData; error: null } + | { url: ExURL; head: null; error: Error }; + const headResults: HeadResult[] = await Promise.all( + novelUrls.map(async (url): Promise<HeadResult> => { + try { + const head = await fetchDestination({ + url, + isExternal: false, + userAgent: archived.userAgent, + }); + return { url, head, error: null }; + } catch (headError) { + const error = + headError instanceof Error ? headError : new Error(String(headError)); + return { url, head: null, error }; + } + }), + ); + + const htmlSeeds: ExURL[] = []; + for (const result of headResults) { + const { url, head, error } = result; + if (error !== null) { + // HEAD failure is recorded as a crawl_errors row so + // the URL is visible in `query error-kinds`, but does + // NOT abort the whole inventory pass — other novel + // URLs may still succeed. 
+ await archive.addError({ + pid: process.pid, + isMainProcess: true, + url: url.href, + isExternal: false, + error, + }); + continue; + } + if (head.contentType == null || isHtmlContentType(head.contentType)) { + htmlSeeds.push(url); + } else { + await archive.setResources( + { + url, + isExternal: false, + isError: false, + status: head.status, + statusText: head.statusText, + contentType: head.contentType, + contentLength: head.contentLength, + compress: false, + cdn: false, + headers: head.responseHeaders ?? null, + }, + 'inventory-seed', + ); + } + } + + // Config sent to the user-facing `initializedCallback` + // (matches the rest of the orchestrator's public surface — + // no inventory bookkeeping leaks out). + const baseConfig: Config = { + ...archived, + ...cleanObject(options), + recursive: true, + fromList: false, + }; + const seedSet = new Set(htmlSeeds.map((u) => u.withoutHashAndAuth)); + // CrawlConfig overlay handed to the orchestrator constructor — + // carries the runtime-only `inventoryMode` that drives source + // labelling. Not persisted to the archive. + const orchestratorOptions: Partial<CrawlConfig> = { + ...baseConfig, + inventoryMode: { seedUrls: seedSet }, + }; + if (htmlSeeds.length > 0) { + const orchestrator = new CrawlerOrchestrator(archive, orchestratorOptions); + const resources = await archive.getResourceUrlList(); + // Empty pending (we rejected non-empty above) but feed + // every already-scraped URL into `seen` so the Crawler's + // link enqueueing path drops links that hit a known + // page without re-rendering it. 
+ orchestrator.#crawler.resume(pending, scraped, resources, 0); + if (initializedCallback) { + await initializedCallback(orchestrator, baseConfig); + } + log('Start inventory'); + log('Archive %s', absFilePath); + log( + 'HTML seeds %O', + htmlSeeds.map((u) => u.href), + ); + await orchestrator.crawling(htmlSeeds, { recursive: true }); + clearDestinationCache(); + await archive.setUrlOrder(); + await ignoreEnoent(unlinkFile(backupPath)); + return orchestrator; + } + + // Only non-HTML URLs were imported — nothing left to render, + // but still update sort order and finalize. + const orchestrator = new CrawlerOrchestrator(archive, orchestratorOptions); + if (initializedCallback) { + await initializedCallback(orchestrator, baseConfig); + } + await archive.setUrlOrder(); + await ignoreEnoent(unlinkFile(backupPath)); + return orchestrator; + } catch (error) { + try { + await copyFile(backupPath, absFilePath); + await ignoreEnoent(unlinkFile(backupPath)); + } catch (restoreError) { + throw new AggregateError( + [error, restoreError], + `inventory failed AND restore from backup failed. Original archive backup is left at: ${backupPath}`, + ); + } + throw error; + } + } catch (error) { + await archive.close().catch(() => {}); + throw error; + } + } + /** * Re-fetch previously-failed pages in an existing `.nitpicker` archive. 
* diff --git a/packages/@nitpicker/crawler/src/crawler/crawler.ts b/packages/@nitpicker/crawler/src/crawler/crawler.ts index 22d5460..d863fcf 100644 --- a/packages/@nitpicker/crawler/src/crawler/crawler.ts +++ b/packages/@nitpicker/crawler/src/crawler/crawler.ts @@ -26,6 +26,8 @@ import pkg from '../../package.json' with { type: 'json' }; import { crawlerLog } from '../debug.js'; import { createChangePhaseHandler } from './create-change-phase-handler.js'; +import { derivePageSource } from './derive-page-source.js'; +import { deriveResourceSource } from './derive-resource-source.js'; import { detectPaginationPattern } from './detect-pagination-pattern.js'; import { drainPhaseErrors } from './drain-phase-errors.js'; import { fetchDestination } from './fetch-destination.js'; @@ -137,6 +139,7 @@ export default class Crawler extends EventEmitter { userAgent: options?.userAgent || `Nitpicker/${pkg.version}`, ignoreRobots: options?.ignoreRobots ?? false, lookupResource: options?.lookupResource ?? null, + inventoryMode: options?.inventoryMode ?? null, }; this.#robotsChecker = new RobotsChecker( @@ -326,6 +329,11 @@ export default class Crawler extends EventEmitter { * @param resources - Sub-resource entries captured during the page load */ #handleResources(resources: ResourceEntry[]) { + // `deriveResourceSource` encodes the "sub-resources are never seeds" + // rule and stays in lockstep with `derivePageSource` if PageSource + // gains new variants. Computed once outside the loop because the + // inventoryMode reference does not change mid-batch. 
+ const subResourceSource = deriveResourceSource(this.#options.inventoryMode); for (const { resource, pageUrl } of resources) { const { isNew } = handleResourceResponse( resource as CrawlerEventTypes['response']['resource'], @@ -334,6 +342,7 @@ export default class Crawler extends EventEmitter { if (isNew) { void this.emit('response', { resource: resource as CrawlerEventTypes['response']['resource'], + source: subResourceSource, }); } void this.emit('responseReferrers', { @@ -427,10 +436,22 @@ export default class Crawler extends EventEmitter { paginationState.lastPushedWasPredicted = false; }, ); - if (result.pageData.isExternal) { - void this.emit('externalPage', { result: result.pageData }); - } else { - void this.emit('page', { result: result.pageData }); + { + const pageSource = derivePageSource( + this.#options.inventoryMode, + result.pageData.url.withoutHashAndAuth, + ); + if (result.pageData.isExternal) { + void this.emit('externalPage', { + result: result.pageData, + source: pageSource, + }); + } else { + void this.emit('page', { + result: result.pageData, + source: pageSource, + }); + } } break; } @@ -468,10 +489,17 @@ export default class Crawler extends EventEmitter { ); const isExternal = findScopeEntry(url, this.#scope, this.#options) === null; if (pageResult) { + const pageSource = derivePageSource( + this.#options.inventoryMode, + pageResult.url.withoutHashAndAuth, + ); if (pageResult.isExternal) { - void this.emit('externalPage', { result: pageResult }); + void this.emit('externalPage', { + result: pageResult, + source: pageSource, + }); } else { - void this.emit('page', { result: pageResult }); + void this.emit('page', { result: pageResult, source: pageSource }); } } void this.emit('error', { @@ -703,7 +731,13 @@ export default class Crawler extends EventEmitter { isLowerLayer: false, }); this.#linkList.done(url, this.#scope, { page: pageData }, this.#options); - void this.emit('externalPage', { result: pageData }); + void 
this.emit('externalPage', { + result: pageData, + source: derivePageSource( + this.#options.inventoryMode, + url.withoutHashAndAuth, + ), + }); log(c.dim('External (skip fetch)')); return; } diff --git a/packages/@nitpicker/crawler/src/crawler/derive-page-source.spec.ts b/packages/@nitpicker/crawler/src/crawler/derive-page-source.spec.ts new file mode 100644 index 0000000..321cf5f --- /dev/null +++ b/packages/@nitpicker/crawler/src/crawler/derive-page-source.spec.ts @@ -0,0 +1,39 @@ +import { describe, expect, it } from 'vitest'; + +import { derivePageSource } from './derive-page-source.js'; + +describe('derivePageSource', () => { + it('returns undefined outside inventory mode (DB DEFAULT crawled applies)', () => { + expect(derivePageSource(null, 'https://example.com/foo')).toBeUndefined(); + }); + + it('labels URLs present in seedUrls as inventory-seed', () => { + const seedUrls = new Set([ + 'https://example.com/seed-a', + 'https://example.com/seed-b', + ]); + expect(derivePageSource({ seedUrls }, 'https://example.com/seed-a')).toBe( + 'inventory-seed', + ); + }); + + it('labels URLs not in seedUrls as inventory-discovered', () => { + const seedUrls = new Set(['https://example.com/seed-a']); + expect(derivePageSource({ seedUrls }, 'https://example.com/derived')).toBe( + 'inventory-discovered', + ); + }); + + it('membership is exact-string — query/auth differences are NOT collapsed here', () => { + // The caller passes the `withoutHashAndAuth` form, so query strings + // remain part of the key. Ambiguity belongs in the caller (Crawler), + // not in this pure helper. 
+ const seedUrls = new Set(['https://example.com/page?lang=ja']); + expect(derivePageSource({ seedUrls }, 'https://example.com/page?lang=en')).toBe( + 'inventory-discovered', + ); + expect(derivePageSource({ seedUrls }, 'https://example.com/page?lang=ja')).toBe( + 'inventory-seed', + ); + }); +}); diff --git a/packages/@nitpicker/crawler/src/crawler/derive-page-source.ts b/packages/@nitpicker/crawler/src/crawler/derive-page-source.ts new file mode 100644 index 0000000..f083659 --- /dev/null +++ b/packages/@nitpicker/crawler/src/crawler/derive-page-source.ts @@ -0,0 +1,34 @@ +import type { InventoryMode } from './types.js'; +import type { PageSource } from '../archive/types.js'; + +/** + * Decide which {@link PageSource} label a newly-scraped page row should carry. + * + * When the crawler is NOT in inventory mode (`inventoryMode === null`), + * returns `undefined` — the caller emits no `source` and the DB DEFAULT + * `'crawled'` ends up on the row. This keeps the normal crawl path + * untouched. + * + * When inventory mode is active, the URL is matched against + * `inventoryMode.seedUrls`. A hit means the URL came straight from the + * user-supplied list (`'inventory-seed'`); a miss means the URL was found + * by following links from a seed page (`'inventory-discovered'`). + * + * Sub-resources captured by puppeteer during inventory-mode rendering are + * NEVER seeds — the caller for those events always passes + * `'inventory-discovered'` directly without consulting this helper. + * @param inventoryMode - Inventory-mode config from `CrawlerOptions.inventoryMode`, or `null` outside `--inventory`. + * @param pageUrlWithoutHashAndAuth - The page URL keyed by `withoutHashAndAuth` (auth credentials stripped, hash dropped). + * @returns The label to write to `pages.source`, or `undefined` for the DB default. 
+ */ +export function derivePageSource( + inventoryMode: InventoryMode | null, + pageUrlWithoutHashAndAuth: string, +): PageSource | undefined { + if (inventoryMode === null) { + return undefined; + } + return inventoryMode.seedUrls.has(pageUrlWithoutHashAndAuth) + ? 'inventory-seed' + : 'inventory-discovered'; +} diff --git a/packages/@nitpicker/crawler/src/crawler/derive-resource-source.spec.ts b/packages/@nitpicker/crawler/src/crawler/derive-resource-source.spec.ts new file mode 100644 index 0000000..a904e0e --- /dev/null +++ b/packages/@nitpicker/crawler/src/crawler/derive-resource-source.spec.ts @@ -0,0 +1,23 @@ +import { describe, expect, it } from 'vitest'; + +import { deriveResourceSource } from './derive-resource-source.js'; + +describe('deriveResourceSource', () => { + it('returns undefined outside inventory mode (DB DEFAULT crawled applies)', () => { + expect(deriveResourceSource(null)).toBeUndefined(); + }); + + it('always returns inventory-discovered when inventory mode is active', () => { + const seedUrls = new Set(['https://example.com/seed']); + // Sub-resources are never themselves seeds — the rule is independent + // of which seed URL triggered the rendering, so the helper does not + // even look at the resource URL. + expect(deriveResourceSource({ seedUrls })).toBe('inventory-discovered'); + }); + + it('ignores the seed set contents — never returns inventory-seed', () => { + // Even an empty seed set yields inventory-discovered, mirroring the + // `derivePageSource` contract that membership only matters for pages. 
+ expect(deriveResourceSource({ seedUrls: new Set() })).toBe('inventory-discovered'); + }); +}); diff --git a/packages/@nitpicker/crawler/src/crawler/derive-resource-source.ts b/packages/@nitpicker/crawler/src/crawler/derive-resource-source.ts new file mode 100644 index 0000000..a7e0b65 --- /dev/null +++ b/packages/@nitpicker/crawler/src/crawler/derive-resource-source.ts @@ -0,0 +1,31 @@ +import type { InventoryMode } from './types.js'; +import type { PageSource } from '../archive/types.js'; + +/** + * Decide which {@link PageSource} label a newly-captured sub-resource row + * (CSS / JS / image / fetch response) should carry. + * + * Sub-resources are NEVER themselves seeds — even when puppeteer is + * rendering a page that *is* an inventory seed, the assets it pulls in + * are downstream and must be labelled `'inventory-discovered'`. The seed + * label is reserved for URLs that were explicitly handed in by the user + * via the `--inventory` file. + * + * Outside inventory mode (`inventoryMode === null`) this returns + * `undefined` so the caller emits no `source` and the DB DEFAULT + * (`'crawled'`) lands on the row — keeps the normal crawl path + * untouched. This is the sub-resource counterpart of + * {@link import('./derive-page-source.js').derivePageSource}; the two + * helpers exist as a pair so a future addition to {@link PageSource} + * forces a parallel update. + * @param inventoryMode - Inventory-mode config from `CrawlerOptions.inventoryMode`, or `null` outside `--inventory`. + * @returns The label to write to `resources.source`, or `undefined` for the DB default. 
+ */ +export function deriveResourceSource( + inventoryMode: InventoryMode | null, +): PageSource | undefined { + if (inventoryMode === null) { + return undefined; + } + return 'inventory-discovered'; +} diff --git a/packages/@nitpicker/crawler/src/crawler/types.ts b/packages/@nitpicker/crawler/src/crawler/types.ts index f15aac7..36ac96e 100644 --- a/packages/@nitpicker/crawler/src/crawler/types.ts +++ b/packages/@nitpicker/crawler/src/crawler/types.ts @@ -1,3 +1,4 @@ +import type { PageSource } from '../archive/types.js'; import type { PageData, CrawlerError, Resource } from '../utils/types/types.js'; import type { ChangePhaseEvent, ScrapeResult } from '@d-zero/beholder'; import type { ParseURLOptions } from '@d-zero/shared/parse-url'; @@ -85,6 +86,30 @@ export interface CrawlerOptions extends Required< * resource-reuse optimization. See {@link ResourceLookup}. */ lookupResource: ResourceLookup | null; + + /** + * When non-null, the crawler is running in `--inventory` mode. New page + * rows whose URL matches `seedUrls` are labelled `'inventory-seed'`; + * every other newly-inserted page or sub-resource is labelled + * `'inventory-discovered'`. When `null`, no source label is emitted — + * the DB DEFAULT `'crawled'` applies. + */ + inventoryMode: InventoryMode | null; +} + +/** + * Inventory-mode runtime configuration. Passed from + * `CrawlerOrchestrator.inventory` into the Crawler so the emit pipeline can + * label new rows with the correct {@link PageSource}. + */ +export interface InventoryMode { + /** + * URLs explicitly listed in the user-supplied URL file, keyed by their + * `withoutHashAndAuth` form (so credentials in the URL don't break the + * match). Membership decides `inventory-seed` vs `inventory-discovered` + * for HTML pages. + */ + seedUrls: ReadonlySet; } /** @@ -147,6 +172,12 @@ export interface CrawlerEventTypes { page: { /** The scraped page data including HTML, metadata, anchors, and images. 
*/ result: PageData; + /** + * Inventory provenance to write to `pages.source` when this row is new. + * `undefined` means the DB default (`'crawled'`) applies, which is the + * common case outside `crawl --inventory`. See {@link PageSource}. + */ + source?: PageSource; }; /** @@ -155,6 +186,8 @@ export interface CrawlerEventTypes { externalPage: { /** The scraped page data for the external page. */ result: PageData; + /** Inventory provenance for new rows — see {@link CrawlerEventTypes.page.source}. */ + source?: PageSource; }; /** @@ -176,6 +209,14 @@ export interface CrawlerEventTypes { response: { /** The captured resource data including URL, status, content type, and headers. */ resource: Resource; + /** + * Inventory provenance to write to `resources.source` when this row is new. + * Sub-resources discovered while puppeteer renders an inventory-seed + * page are always `'inventory-discovered'` (a sub-resource is never + * itself a seed). `undefined` means the DB default (`'crawled'`) + * applies. See {@link PageSource}. 
+ */ + source?: PageSource; }; /** diff --git a/packages/@nitpicker/mcp-server/src/mcp-server.spec.ts b/packages/@nitpicker/mcp-server/src/mcp-server.spec.ts index 99b8b5f..566649b 100644 --- a/packages/@nitpicker/mcp-server/src/mcp-server.spec.ts +++ b/packages/@nitpicker/mcp-server/src/mcp-server.spec.ts @@ -221,9 +221,9 @@ describe('createServer', () => { rmSync(workingDir, { recursive: true, force: true }); }); - it('ListTools で22個のツールが返される (v2: + 8 new Wappalyzer / JSON-LD tools)', async () => { + it('ListTools で24個のツールが返される (v2: + 8 new Wappalyzer / JSON-LD tools, + 2 inventory tools)', async () => { const result = await listTools(server); - expect(result.tools).toHaveLength(22); + expect(result.tools).toHaveLength(24); const names = result.tools.map((t) => t.name); expect(names).toContain('open_archive'); expect(names).toContain('close_archive'); @@ -237,6 +237,9 @@ describe('createServer', () => { expect(names).toContain('count_pages_by_tag'); expect(names).toContain('count_pages_by_jsonld_type'); expect(names).toContain('get_page_jsonld_overview'); + // New in inventory feature: + expect(names).toContain('list_isolated_pages'); + expect(names).toContain('list_unused_resources'); }); it('toolDefinitions の数と ListTools の数が一致する', async () => { diff --git a/packages/@nitpicker/mcp-server/src/mcp-server.ts b/packages/@nitpicker/mcp-server/src/mcp-server.ts index da0deff..4958e2d 100644 --- a/packages/@nitpicker/mcp-server/src/mcp-server.ts +++ b/packages/@nitpicker/mcp-server/src/mcp-server.ts @@ -21,11 +21,13 @@ import { getTagInventory, getViolations, listImages, + listIsolatedPages, listLinks, listPages, listPagesByJsonLdType, listPagesByTag, listResources, + listUnusedResources, } from '@nitpicker/query'; import { toolDefinitions } from './tool-definitions.js'; @@ -269,6 +271,24 @@ export function createServer() { const accessor = manager.get(requireString(args, 'archiveId')); return jsonResult(await checkHeaders(accessor, omit(args, 'archiveId'))); } + case 
'list_isolated_pages': { + const accessor = manager.get(requireString(args, 'archiveId')); + return jsonResult( + await listIsolatedPages(accessor, { + limit: optionalNumber(args, 'limit'), + offset: optionalNumber(args, 'offset'), + }), + ); + } + case 'list_unused_resources': { + const accessor = manager.get(requireString(args, 'archiveId')); + return jsonResult( + await listUnusedResources(accessor, { + limit: optionalNumber(args, 'limit'), + offset: optionalNumber(args, 'offset'), + }), + ); + } case 'list_pages_by_tag': { const accessor = manager.get(requireString(args, 'archiveId')); return jsonResult( diff --git a/packages/@nitpicker/mcp-server/src/tool-definitions.ts b/packages/@nitpicker/mcp-server/src/tool-definitions.ts index b190d25..1a76064 100644 --- a/packages/@nitpicker/mcp-server/src/tool-definitions.ts +++ b/packages/@nitpicker/mcp-server/src/tool-definitions.ts @@ -509,4 +509,38 @@ export const toolDefinitions: Tool[] = [ required: ['archiveId'], }, }, + { + name: 'list_isolated_pages', + description: + 'List internal HTML pages that no other archived page anchors to (excluding archived roots). These are the "orphan landing pages" the recursive crawl could not reach via the link graph. Combine with `crawl --inventory` to surface pages that only exist on the server but are unreachable from the site map. Each row carries a `source` badge (`crawled` / `inventory-seed` / `inventory-discovered`) indicating how the page entered the archive.', + inputSchema: { + type: 'object' as const, + properties: { + archiveId: { + type: 'string', + description: 'The archive ID returned by open_archive', + }, + limit: { type: 'number', description: 'Max results (default: 100)' }, + offset: { type: 'number', description: 'Results to skip (default: 0)' }, + }, + required: ['archiveId'], + }, + }, + { + name: 'list_unused_resources', + description: + 'List internal sub-resources that no archived page references — candidates for deletion from the server. 
Each row carries a `source` badge so callers can distinguish files registered via `crawl --inventory` (no page ever loaded them) from files that were once referenced but lost their last referrer.', + inputSchema: { + type: 'object' as const, + properties: { + archiveId: { + type: 'string', + description: 'The archive ID returned by open_archive', + }, + limit: { type: 'number', description: 'Max results (default: 100)' }, + offset: { type: 'number', description: 'Results to skip (default: 0)' }, + }, + required: ['archiveId'], + }, + }, ]; diff --git a/packages/@nitpicker/query/src/list-isolated-pages.spec.ts b/packages/@nitpicker/query/src/list-isolated-pages.spec.ts new file mode 100644 index 0000000..0c00ea7 --- /dev/null +++ b/packages/@nitpicker/query/src/list-isolated-pages.spec.ts @@ -0,0 +1,245 @@ +import path from 'node:path'; + +import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url'; +import { Archive } from '@nitpicker/crawler'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; + +import { listIsolatedPages } from './list-isolated-pages.js'; + +const __filename = new URL(import.meta.url).pathname; +const __dirname = path.dirname(__filename); +const workingDir = path.resolve(__dirname, '__test_fixtures_list_isolated_pages__'); + +/** + * Minimal Meta object for `setPage`, mirroring what beholder produces for + * pages with no `` tags. Spelled out here so each test reads as + * "isolation depends on link graph, not metadata". 
+ */ +const EMPTY_META = { + lang: null, + title: null, + description: null, + keywords: null, + noindex: false, + nofollow: false, + noarchive: false, + canonical: null, + alternate: null, + 'og:type': null, + 'og:title': null, + 'og:site_name': null, + 'og:description': null, + 'og:url': null, + 'og:image': null, + 'twitter:card': null, +}; + +describe('listIsolatedPages', () => { + let archive: InstanceType; + const archiveFilePath = path.resolve(workingDir, 'isolated-pages-test.nitpicker'); + + beforeAll(async () => { + const { mkdirSync } = await import('node:fs'); + mkdirSync(workingDir, { recursive: true }); + + archive = await Archive.create({ + filePath: archiveFilePath, + cwd: workingDir, + }); + + await archive.setConfig({ + baseUrl: 'https://example.com', + name: 'test', + version: '0.10.0', + recursive: true, + interval: 0, + image: true, + fetchExternal: false, + parallels: 1, + roots: ['https://example.com'], + excludes: [], + excludeKeywords: [], + excludeUrls: [], + maxExcludedDepth: 0, + retry: 3, + fromList: false, + disableQueries: false, + userAgent: 'test', + ignoreRobots: false, + }); + + // Root page — linked from nothing, BUT is an archived root, so it must + // NOT be reported as isolated (roots are seeds by definition). + await archive.setPage({ + url: parseUrl('https://example.com')!, + redirectPaths: [], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...EMPTY_META, title: 'Home' }, + anchorList: [ + { + href: parseUrl('https://example.com/about')!, + isExternal: false, + title: null, + textContent: 'About', + hash: null, + }, + ], + imageList: [], + isSkipped: false, + }); + + // About — linked from Home, so NOT isolated. 
+ await archive.setPage({ + url: parseUrl('https://example.com/about')!, + redirectPaths: [], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...EMPTY_META, title: 'About' }, + anchorList: [], + imageList: [], + isSkipped: false, + }); + + // Hidden — no inbound anchors, not a root. EXPECTED isolated row. + await archive.setPage({ + url: parseUrl('https://example.com/hidden')!, + redirectPaths: [], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...EMPTY_META, title: 'Hidden LP' }, + anchorList: [], + imageList: [], + isSkipped: false, + }); + + // External page — no inbound anchors, not a root, but isExternal=1. + // MUST be filtered out (inventory targets in-scope pages only). + await archive.setExternalPage({ + url: parseUrl('https://external.example.net/page')!, + redirectPaths: [], + isExternal: true, + isTarget: false, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...EMPTY_META, title: 'External Page' }, + anchorList: [], + imageList: [], + isSkipped: false, + }); + + // Inventory-seed page — no inbound anchors, not a root. MUST appear + // with source='inventory-seed' on the row. + await archive.setPage( + { + url: parseUrl('https://example.com/inventory-seed-page')!, + redirectPaths: [], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...EMPTY_META, title: 'Inventory Seed Page' }, + anchorList: [], + imageList: [], + isSkipped: false, + }, + 'inventory-seed', + ); + + // Inventory-discovered page — same isolation status, but `source` label + // must come through differently. 
+ await archive.setPage( + { + url: parseUrl('https://example.com/inventory-discovered-page')!, + redirectPaths: [], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...EMPTY_META, title: 'Inventory Discovered Page' }, + anchorList: [], + imageList: [], + isSkipped: false, + }, + 'inventory-discovered', + ); + }); + + afterAll(async () => { + if (archive) { + await archive.close(); + } + const { rmSync } = await import('node:fs'); + rmSync(workingDir, { recursive: true, force: true }); + }); + + it('reports internal HTML pages with no inbound anchors, excluding archived roots and external pages', async () => { + const result = await listIsolatedPages(archive); + + // Expected: /hidden + /inventory-seed-page + /inventory-discovered-page + // (3 internal, no-inbound, non-root pages). + // Excluded: / (root), /about (linked), external page. + const urls = result.items.map((row) => row.url); + expect(urls.toSorted()).toEqual([ + 'https://example.com/hidden', + 'https://example.com/inventory-discovered-page', + 'https://example.com/inventory-seed-page', + ]); + expect(urls).not.toContain('https://example.com'); + expect(urls).not.toContain('https://example.com/about'); + expect(urls).not.toContain('https://external.example.net/page'); + }); + + it('returns the source badge from the DB column (crawled / inventory-seed / inventory-discovered)', async () => { + const result = await listIsolatedPages(archive); + const bySource: Record = {}; + for (const row of result.items) { + bySource[row.url] = row.source; + } + expect(bySource['https://example.com/hidden']).toBe('crawled'); + expect(bySource['https://example.com/inventory-seed-page']).toBe('inventory-seed'); + expect(bySource['https://example.com/inventory-discovered-page']).toBe( + 'inventory-discovered', + ); + }); + + it('respects limit and offset across the full isolated set', async () => { + // 3 
isolated rows total. Walk the pagination manually. + const first = await listIsolatedPages(archive, { limit: 2, offset: 0 }); + expect(first.items).toHaveLength(2); + const second = await listIsolatedPages(archive, { limit: 2, offset: 2 }); + expect(second.items).toHaveLength(1); + const third = await listIsolatedPages(archive, { limit: 2, offset: 3 }); + expect(third.items).toHaveLength(0); + }); +}); diff --git a/packages/@nitpicker/query/src/list-isolated-pages.ts b/packages/@nitpicker/query/src/list-isolated-pages.ts new file mode 100644 index 0000000..bbac89d --- /dev/null +++ b/packages/@nitpicker/query/src/list-isolated-pages.ts @@ -0,0 +1,84 @@ +import type { IsolatedPageEntry, ListIsolatedPagesOptions, PageSource } from './types.js'; +import type { ArchiveAccessor } from '@nitpicker/crawler'; +import type { Knex } from 'knex'; + +/** + * List internal HTML pages that no other page links to — "orphan" pages + * not reachable from the recursive crawl graph. + * + * Isolation is judged purely by the link graph: `anchors.hrefId IS NULL` + * means no page anchors point at this row. The `pages.source` value is + * IGNORED in the WHERE clause and returned only as a per-row badge, so + * orphans first discovered by `crawl --inventory` and orphans discovered + * by the original crawl both surface here equally. + * + * Archived roots (`info.roots`) are excluded — those are seeds by + * definition and would otherwise dominate the result set. Only HTML + * pages count: PDFs / images / non-HTML rows are out of scope (consumers + * who care about unused non-HTML assets use + * {@link import('./list-unused-resources.js').listUnusedResources} + * instead). + * + * Read-only — safe against viewer / stub-mode archives. + * @param accessor - The archive accessor to query. + * @param options - Pagination options. + * @returns Paginated list of isolated pages with their `source` badge. 
+ */ +export async function listIsolatedPages( + accessor: ArchiveAccessor, + options: ListIsolatedPagesOptions = {}, +): Promise<{ items: IsolatedPageEntry[]; total: number }> { + const knex = accessor.getKnex(); + const limit = options.limit ?? 100; + const offset = options.offset ?? 0; + + // Pull archived roots so they can be excluded — they are seeds by + // definition and would otherwise drown out real orphans. + const infoRow = (await knex('info').select('roots').first()) as + | { roots: string | null } + | undefined; + const roots: string[] = infoRow?.roots ? (JSON.parse(infoRow.roots) as string[]) : []; + + const baseWhere = (qb: Knex.QueryBuilder): Knex.QueryBuilder => { + const filtered = qb + .leftJoin('anchors', 'pages.id', '=', 'anchors.hrefId') + .whereNull('anchors.id') + .where({ + 'pages.scraped': 1, + 'pages.isExternal': 0, + 'pages.contentType': 'text/html', + }) + .whereNull('pages.redirectDestId'); + return roots.length > 0 ? filtered.whereNotIn('pages.url', roots) : filtered; + }; + + const countResult = (await baseWhere(knex('pages')).count('pages.id as total')) as { + total: number; + }[]; + const total = countResult[0]?.total ?? 0; + + const rows = (await baseWhere(knex('pages')) + .select('pages.url', 'pages.status', 'pages.title', 'pages.source') + .orderBy('pages.url') + .limit(limit) + .offset(offset)) as { + url: string; + status: number | null; + title: string | null; + source: string | null; + }[]; + + const items: IsolatedPageEntry[] = rows.map((row) => ({ + url: row.url, + title: row.title, + status: row.status, + // Tolerate pre-migration archives where the column is absent — + // `?? 'crawled'` mirrors the DB DEFAULT. + source: (row.source ?? 
'crawled') as PageSource, + })); + + return { + items, + total: Number(total), + }; +} diff --git a/packages/@nitpicker/query/src/list-unused-resources.spec.ts b/packages/@nitpicker/query/src/list-unused-resources.spec.ts new file mode 100644 index 0000000..079fdd4 --- /dev/null +++ b/packages/@nitpicker/query/src/list-unused-resources.spec.ts @@ -0,0 +1,208 @@ +import path from 'node:path'; + +import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url'; +import { Archive } from '@nitpicker/crawler'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; + +import { listUnusedResources } from './list-unused-resources.js'; + +const __filename = new URL(import.meta.url).pathname; +const __dirname = path.dirname(__filename); +const workingDir = path.resolve(__dirname, '__test_fixtures_list_unused_resources__'); + +describe('listUnusedResources', () => { + let archive: InstanceType; + const archiveFilePath = path.resolve(workingDir, 'unused-resources-test.nitpicker'); + + beforeAll(async () => { + const { mkdirSync } = await import('node:fs'); + mkdirSync(workingDir, { recursive: true }); + + archive = await Archive.create({ + filePath: archiveFilePath, + cwd: workingDir, + }); + + await archive.setConfig({ + baseUrl: 'https://example.com', + name: 'test', + version: '0.10.0', + recursive: true, + interval: 0, + image: true, + fetchExternal: false, + parallels: 1, + roots: ['https://example.com'], + excludes: [], + excludeKeywords: [], + excludeUrls: [], + maxExcludedDepth: 0, + retry: 3, + fromList: false, + disableQueries: false, + userAgent: 'test', + ignoreRobots: false, + }); + + // One internal resource that NO page references (the canonical + // "unused" case). 
+ await archive.setResources({ + url: parseUrl('https://example.com/orphan.pdf')!, + isExternal: false, + isError: false, + status: 200, + statusText: 'OK', + contentType: 'application/pdf', + contentLength: 1000, + compress: false, + cdn: false, + headers: {}, + }); + + // One referenced resource — must NOT appear in the result. + await archive.setResources({ + url: parseUrl('https://example.com/used.css')!, + isExternal: false, + isError: false, + status: 200, + statusText: 'OK', + contentType: 'text/css', + contentLength: 500, + compress: false, + cdn: false, + headers: {}, + }); + // Register a referrer for used.css — needs a page row first. + await archive.setPage({ + url: parseUrl('https://example.com/page-using-css')!, + redirectPaths: [], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { + lang: null, + title: 'Page with CSS', + description: null, + keywords: null, + noindex: false, + nofollow: false, + noarchive: false, + canonical: null, + alternate: null, + 'og:type': null, + 'og:title': null, + 'og:site_name': null, + 'og:description': null, + 'og:url': null, + 'og:image': null, + 'twitter:card': null, + }, + anchorList: [], + imageList: [], + isSkipped: false, + }); + await archive.setResourcesReferrers({ + url: 'https://example.com/page-using-css', + src: 'https://example.com/used.css', + }); + + // One external resource — must NOT appear (inventory targets only + // in-scope files). + await archive.setResources({ + url: parseUrl('https://cdn.example.net/external.js')!, + isExternal: true, + isError: false, + status: 200, + statusText: 'OK', + contentType: 'application/javascript', + contentLength: 200, + compress: false, + cdn: false, + headers: {}, + }); + + // Inventory-seed resource — must appear with the matching source label. 
+ await archive.setResources( + { + url: parseUrl('https://example.com/inventory-seed.pdf')!, + isExternal: false, + isError: false, + status: 200, + statusText: 'OK', + contentType: 'application/pdf', + contentLength: 1500, + compress: false, + cdn: false, + headers: {}, + }, + 'inventory-seed', + ); + + // Inventory-discovered resource — same isolation, different label. + await archive.setResources( + { + url: parseUrl('https://example.com/inventory-discovered.png')!, + isExternal: false, + isError: false, + status: 200, + statusText: 'OK', + contentType: 'image/png', + contentLength: 500, + compress: false, + cdn: false, + headers: {}, + }, + 'inventory-discovered', + ); + }); + + afterAll(async () => { + if (archive) { + await archive.close(); + } + const { rmSync } = await import('node:fs'); + rmSync(workingDir, { recursive: true, force: true }); + }); + + it('reports only internal resources with no referrers, excluding external resources and referenced ones', async () => { + const result = await listUnusedResources(archive); + + const urls = result.items.map((row) => row.url); + // Expected: orphan.pdf + inventory-seed.pdf + inventory-discovered.png. + // Excluded: used.css (has a referrer), external.js (isExternal=1). 
+ expect(urls.toSorted()).toEqual([ + 'https://example.com/inventory-discovered.png', + 'https://example.com/inventory-seed.pdf', + 'https://example.com/orphan.pdf', + ]); + expect(urls).not.toContain('https://example.com/used.css'); + expect(urls).not.toContain('https://cdn.example.net/external.js'); + }); + + it('returns the source badge from the DB column (crawled / inventory-seed / inventory-discovered)', async () => { + const result = await listUnusedResources(archive); + const bySource: Record = {}; + for (const row of result.items) { + bySource[row.url] = row.source; + } + expect(bySource['https://example.com/orphan.pdf']).toBe('crawled'); + expect(bySource['https://example.com/inventory-seed.pdf']).toBe('inventory-seed'); + expect(bySource['https://example.com/inventory-discovered.png']).toBe( + 'inventory-discovered', + ); + }); + + it('respects limit and offset across the full unused set', async () => { + const first = await listUnusedResources(archive, { limit: 2, offset: 0 }); + expect(first.items).toHaveLength(2); + const second = await listUnusedResources(archive, { limit: 2, offset: 2 }); + expect(second.items).toHaveLength(1); + const third = await listUnusedResources(archive, { limit: 2, offset: 3 }); + expect(third.items).toHaveLength(0); + }); +}); diff --git a/packages/@nitpicker/query/src/list-unused-resources.ts b/packages/@nitpicker/query/src/list-unused-resources.ts new file mode 100644 index 0000000..1736dd2 --- /dev/null +++ b/packages/@nitpicker/query/src/list-unused-resources.ts @@ -0,0 +1,88 @@ +import type { + ListUnusedResourcesOptions, + PageSource, + UnusedResourceEntry, +} from './types.js'; +import type { ArchiveAccessor } from '@nitpicker/crawler'; +import type { Knex } from 'knex'; + +/** + * List internal sub-resources that no archived page references — "unused" + * server-side files (CSS / JS / images / PDFs / fonts / …) that the crawl + * touched the URL of (or that `crawl --inventory` registered) but no page + * actually 
loads. + * + * "Unused" is judged purely by the referrer table: + * `resources-referrers.resourceId IS NULL` means no `page → resource` edge + * exists. The `resources.source` value is IGNORED in the WHERE clause and + * returned only as a per-row badge, so a `'crawled'` resource that has lost + * all referrers and an `'inventory-seed'` resource that never gained one + * both surface here equally. + * + * External resources are excluded — only files served from the archived + * scope are inventory candidates for "candidates to delete". + * + * Read-only — safe against viewer / stub-mode archives. + * @param accessor - The archive accessor to query. + * @param options - Pagination options. + * @returns Paginated list of unused resources with their `source` badge. + */ +export async function listUnusedResources( + accessor: ArchiveAccessor, + options: ListUnusedResourcesOptions = {}, +): Promise<{ items: UnusedResourceEntry[]; total: number }> { + const knex = accessor.getKnex(); + const limit = options.limit ?? 100; + const offset = options.offset ?? 0; + + const baseWhere = (qb: Knex.QueryBuilder): Knex.QueryBuilder => + qb + .leftJoin( + 'resources-referrers', + 'resources.id', + '=', + 'resources-referrers.resourceId', + ) + .whereNull('resources-referrers.id') + .where('resources.isExternal', 0); + + const countResult = (await baseWhere(knex('resources')).count( + 'resources.id as total', + )) as { + total: number; + }[]; + const total = countResult[0]?.total ?? 
0; + + const rows = (await baseWhere(knex('resources')) + .select( + 'resources.url', + 'resources.status', + 'resources.contentType', + 'resources.contentLength', + 'resources.source', + ) + .orderBy('resources.url') + .limit(limit) + .offset(offset)) as { + url: string; + status: number | null; + contentType: string | null; + contentLength: number | null; + source: string | null; + }[]; + + const items: UnusedResourceEntry[] = rows.map((row) => ({ + url: row.url, + status: row.status, + contentType: row.contentType, + contentLength: row.contentLength, + // Tolerate pre-migration archives where the column is absent — + // `?? 'crawled'` mirrors the DB DEFAULT. + source: (row.source ?? 'crawled') as PageSource, + })); + + return { + items, + total: Number(total), + }; +} diff --git a/packages/@nitpicker/query/src/query.ts b/packages/@nitpicker/query/src/query.ts index fc412ed..2ba123c 100644 --- a/packages/@nitpicker/query/src/query.ts +++ b/packages/@nitpicker/query/src/query.ts @@ -29,10 +29,12 @@ export { getSummary } from './get-summary.js'; export { getTagInventory } from './get-tag-inventory.js'; export { getViolations } from './get-violations.js'; export { listImages } from './list-images.js'; +export { listIsolatedPages } from './list-isolated-pages.js'; export { listLinks } from './list-links.js'; export { listPageLinks } from './list-page-links.js'; export { listPages } from './list-pages.js'; export { listPagesByJsonLdType } from './list-pages-by-jsonld-type.js'; export { listPagesByTag } from './list-pages-by-tag.js'; export { listResources } from './list-resources.js'; +export { listUnusedResources } from './list-unused-resources.js'; export * from './types.js'; diff --git a/packages/@nitpicker/query/src/types.ts b/packages/@nitpicker/query/src/types.ts index 4cfeae3..79fcf59 100644 --- a/packages/@nitpicker/query/src/types.ts +++ b/packages/@nitpicker/query/src/types.ts @@ -9,6 +9,63 @@ */ export type ArchiveMode = 'archive' | 'stub'; +// Re-export 
the canonical PageSource owned by the crawler package — keeps +// query consumers (CLI / MCP / viewer) from reaching across packages for +// the same enum. +export type { PageSource } from '@nitpicker/crawler'; + +/** + * One row of {@link listIsolatedPages} output — an HTML page that no + * other archived page anchors to (excluding archived roots). + */ +export interface IsolatedPageEntry { + /** Page URL. */ + url: string; + /** `` value, or `null` if absent. */ + title: string | null; + /** HTTP status of the page, or `null` if not yet known. */ + status: number | null; + /** Provenance label — see {@link PageSource}. Shown as a viewer badge. */ + source: import('@nitpicker/crawler').PageSource; +} + +/** + * Pagination options for {@link listIsolatedPages}. + */ +export interface ListIsolatedPagesOptions { + /** Maximum rows to return. Defaults to 100. */ + limit?: number; + /** Rows to skip from the start. Defaults to 0. */ + offset?: number; +} + +/** + * One row of {@link listUnusedResources} output — an internal sub-resource + * with zero referrers. + */ +export interface UnusedResourceEntry { + /** Resource URL. */ + url: string; + /** HTTP status of the resource, or `null` if not yet known. */ + status: number | null; + /** Content-Type header value, or `null` if unknown. */ + contentType: string | null; + /** Content-Length header value in bytes, or `null` if unknown. */ + contentLength: number | null; + /** Provenance label — see {@link PageSource}. Shown as a viewer badge. */ + source: import('@nitpicker/crawler').PageSource; +} + +/** + * Pagination options for {@link listUnusedResources}. + */ +export interface ListUnusedResourcesOptions { + /** Maximum rows to return. Defaults to 100. */ + limit?: number; + /** Rows to skip from the start. Defaults to 0. */ + offset?: number; +} + /** * Options for opening a .nitpicker archive file. 
*/ diff --git a/packages/@nitpicker/viewer/e2e/viewer.spec.ts b/packages/@nitpicker/viewer/e2e/viewer.spec.ts index 7554493..47d9cac 100644 --- a/packages/@nitpicker/viewer/e2e/viewer.spec.ts +++ b/packages/@nitpicker/viewer/e2e/viewer.spec.ts @@ -26,7 +26,9 @@ test.describe('Nitpicker Viewer', () => { test('サイドバーから各ビューへ遷移できる', async ({ page }) => { await page.goto('/'); - await page.getByRole('link', { name: 'Resources' }).click(); + // `exact: true` so "Resources" does not also match the "Unused + // Resources" nav entry (substring match would resolve to 2 links). + await page.getByRole('link', { name: 'Resources', exact: true }).click(); await expect( page.getByRole('heading', { name: 'Resources', level: 1 }), ).toBeVisible(); diff --git a/packages/@nitpicker/viewer/src/create-app.ts b/packages/@nitpicker/viewer/src/create-app.ts index 9929a43..352ce10 100644 --- a/packages/@nitpicker/viewer/src/create-app.ts +++ b/packages/@nitpicker/viewer/src/create-app.ts @@ -9,6 +9,7 @@ import { registerErrorKindsRoute } from './routes/register-error-kinds-route.js' import { registerGraphRoute } from './routes/register-graph-route.js'; import { registerHeadersRoute } from './routes/register-headers-route.js'; import { registerImagesRoute } from './routes/register-images-route.js'; +import { registerIsolatedPagesRoute } from './routes/register-isolated-pages-route.js'; import { registerLinksRoute } from './routes/register-links-route.js'; import { registerMismatchesRoute } from './routes/register-mismatches-route.js'; import { registerPageDetailRoute } from './routes/register-page-detail-route.js'; @@ -18,6 +19,7 @@ import { registerPagesRoute } from './routes/register-pages-route.js'; import { registerResourceReferrersRoute } from './routes/register-resource-referrers-route.js'; import { registerResourcesRoute } from './routes/register-resources-route.js'; import { registerSummaryRoute } from './routes/register-summary-route.js'; +import { registerUnusedResourcesRoute } 
from './routes/register-unused-resources-route.js'; import { registerViolationsRoute } from './routes/register-violations-route.js'; import { sanitizeErrorMessage } from './sanitize-error-message.js'; @@ -50,6 +52,8 @@ export function createApp(options: CreateAppOptions): Hono { registerPageLinksRoute(app, context); registerArchiveInfoRoute(app, context); registerErrorKindsRoute(app, context); + registerIsolatedPagesRoute(app, context); + registerUnusedResourcesRoute(app, context); app.onError((error, c) => { const raw = error instanceof Error ? error.message : String(error); diff --git a/packages/@nitpicker/viewer/src/routes/register-isolated-pages-route.ts b/packages/@nitpicker/viewer/src/routes/register-isolated-pages-route.ts new file mode 100644 index 0000000..ce40ab5 --- /dev/null +++ b/packages/@nitpicker/viewer/src/routes/register-isolated-pages-route.ts @@ -0,0 +1,25 @@ +import type { ArchiveContext } from '../types.js'; +import type { Hono } from 'hono'; + +import { listIsolatedPages } from '@nitpicker/query'; + +import { toNumber } from '../query-params/to-number.js'; + +/** + * Registers `GET /api/isolated-pages` — internal HTML pages with no + * inbound anchors, excluding archived roots. Used by the viewer's + * "orphan LP" surface to highlight pages that the recursive crawl + * could not reach via the link graph. + * @param app - The Hono application. + * @param context - The opened archive context. 
+ */ +export function registerIsolatedPagesRoute(app: Hono, context: ArchiveContext): void { + app.get('/api/isolated-pages', async (c) => { + const accessor = context.manager.get(context.archiveId); + const result = await listIsolatedPages(accessor, { + limit: toNumber(c.req.query('limit')), + offset: toNumber(c.req.query('offset')), + }); + return c.json(result); + }); +} diff --git a/packages/@nitpicker/viewer/src/routes/register-unused-resources-route.ts b/packages/@nitpicker/viewer/src/routes/register-unused-resources-route.ts new file mode 100644 index 0000000..07ab9c0 --- /dev/null +++ b/packages/@nitpicker/viewer/src/routes/register-unused-resources-route.ts @@ -0,0 +1,24 @@ +import type { ArchiveContext } from '../types.js'; +import type { Hono } from 'hono'; + +import { listUnusedResources } from '@nitpicker/query'; + +import { toNumber } from '../query-params/to-number.js'; + +/** + * Registers `GET /api/unused-resources` — internal sub-resources that no + * archived page references. Used by the viewer's "unused file" surface + * to highlight candidates for deletion from the server. + * @param app - The Hono application. + * @param context - The opened archive context. 
+ */ +export function registerUnusedResourcesRoute(app: Hono, context: ArchiveContext): void { + app.get('/api/unused-resources', async (c) => { + const accessor = context.manager.get(context.archiveId); + const result = await listUnusedResources(accessor, { + limit: toNumber(c.req.query('limit')), + offset: toNumber(c.req.query('offset')), + }); + return c.json(result); + }); +} diff --git a/packages/@nitpicker/viewer/web/api/use-isolated-pages.ts b/packages/@nitpicker/viewer/web/api/use-isolated-pages.ts new file mode 100644 index 0000000..f0f0167 --- /dev/null +++ b/packages/@nitpicker/viewer/web/api/use-isolated-pages.ts @@ -0,0 +1,31 @@ +import type { IsolatedPageEntry } from '@nitpicker/query'; + +import { useQuery } from '@tanstack/react-query'; + +import { apiGet } from './api-client.js'; + +/** + * Result shape returned by the viewer's `/api/isolated-pages` endpoint. + * Mirrors `listIsolatedPages` from `@nitpicker/query` — duplicated here + * only so the React component can type the response without pulling + * the knex-bound query module client-side. + */ +export interface IsolatedPagesResult { + items: IsolatedPageEntry[]; + total: number; +} + +/** + * Fetches HTML pages with no inbound anchors (orphan LPs). Limit and offset + * pass through to {@link import('@nitpicker/query').listIsolatedPages}. + * @param limit - Maximum number of pages to return. + * @param offset - Number of pages to skip from the start. + * @returns The TanStack Query result for the isolated pages list. 
+ */ +export function useIsolatedPages(limit = 100, offset = 0) { + return useQuery({ + queryKey: ['isolated-pages', limit, offset], + queryFn: () => + apiGet<IsolatedPagesResult>(`/api/isolated-pages?limit=${limit}&offset=${offset}`), + }); +} diff --git a/packages/@nitpicker/viewer/web/api/use-unused-resources.ts b/packages/@nitpicker/viewer/web/api/use-unused-resources.ts new file mode 100644 index 0000000..e34f7ce --- /dev/null +++ b/packages/@nitpicker/viewer/web/api/use-unused-resources.ts @@ -0,0 +1,31 @@ +import type { UnusedResourceEntry } from '@nitpicker/query'; + +import { useQuery } from '@tanstack/react-query'; + +import { apiGet } from './api-client.js'; + +/** + * Result shape returned by the viewer's `/api/unused-resources` endpoint. + * Mirrors `listUnusedResources` from `@nitpicker/query`. + */ +export interface UnusedResourcesResult { + items: UnusedResourceEntry[]; + total: number; +} + +/** + * Fetches internal sub-resources with zero referrers — candidates for + * deletion from the server. + * @param limit - Maximum rows to return. + * @param offset - Number of rows to skip from the start. + * @returns The TanStack Query result for the unused resources list. 
+ */ +export function useUnusedResources(limit = 100, offset = 0) { + return useQuery({ + queryKey: ['unused-resources', limit, offset], + queryFn: () => + apiGet<UnusedResourcesResult>( + `/api/unused-resources?limit=${limit}&offset=${offset}`, + ), + }); +} diff --git a/packages/@nitpicker/viewer/web/app.tsx b/packages/@nitpicker/viewer/web/app.tsx index d9b9b5c..71c289d 100644 --- a/packages/@nitpicker/viewer/web/app.tsx +++ b/packages/@nitpicker/viewer/web/app.tsx @@ -11,6 +11,7 @@ import { ErrorsView } from './routes/errors-view.js'; import { GraphView } from './routes/graph-view.js'; import { HeadersView } from './routes/headers-view.js'; import { ImagesView } from './routes/images-view.js'; +import { IsolatedPagesView } from './routes/isolated-pages-view.js'; import { LinksView } from './routes/links-view.js'; import { MismatchesView } from './routes/mismatches-view.js'; import { PageDetailView } from './routes/page-detail-view.js'; @@ -18,6 +19,7 @@ import { PageLinksView } from './routes/page-links-view.js'; import { PagesView } from './routes/pages-view.js'; import { ResourcesView } from './routes/resources-view.js'; import { SummaryView } from './routes/summary-view.js'; +import { UnusedResourcesView } from './routes/unused-resources-view.js'; import { ViolationsView } from './routes/violations-view.js'; /** Shared TanStack Query client. Server data is read-only and rarely changes. 
*/ @@ -64,6 +66,8 @@ export function App() { <Route path="/mismatches" element={<MismatchesView />} /> <Route path="/headers" element={<HeadersView />} /> <Route path="/errors" element={<ErrorsView />} /> + <Route path="/isolated-pages" element={<IsolatedPagesView />} /> + <Route path="/unused-resources" element={<UnusedResourcesView />} /> <Route path="*" element={<Navigate to="/" replace />} /> </Routes> </main> diff --git a/packages/@nitpicker/viewer/web/components/nav-sidebar.tsx b/packages/@nitpicker/viewer/web/components/nav-sidebar.tsx index ef07025..2011bbc 100644 --- a/packages/@nitpicker/viewer/web/components/nav-sidebar.tsx +++ b/packages/@nitpicker/viewer/web/components/nav-sidebar.tsx @@ -18,6 +18,8 @@ const NAV_ITEMS: NavItem[] = [ { path: '/mismatches', labelKey: 'nav.mismatches' }, { path: '/headers', labelKey: 'nav.headers' }, { path: '/errors', labelKey: 'nav.errors' }, + { path: '/isolated-pages', labelKey: 'nav.isolatedPages' }, + { path: '/unused-resources', labelKey: 'nav.unusedResources' }, ]; /** diff --git a/packages/@nitpicker/viewer/web/components/source-badge.tsx b/packages/@nitpicker/viewer/web/components/source-badge.tsx new file mode 100644 index 0000000..43c925e --- /dev/null +++ b/packages/@nitpicker/viewer/web/components/source-badge.tsx @@ -0,0 +1,44 @@ +import type { PageSource } from '@nitpicker/query'; + +/** + * Visual badge for the `pages.source` / `resources.source` provenance label + * returned by `listIsolatedPages` and `listUnusedResources`. + * + * Three values map to three visually distinct styles: + * + * - `crawled` — the row came from the recursive crawl; neutral styling so + * it does not draw the eye, since it is the common case. + * - `inventory-seed` — the URL was explicitly handed in by + * `crawl --inventory`; accent styling so the audit operator can quickly + * tell "this row is on my server file list". 
+ * - `inventory-discovered` — the URL was found by following links from an + * inventory-seed page (or loaded by puppeteer as a sub-resource of one); + * muted accent so it groups visually with `inventory-seed` without + * dominating it. + * + * The badge's text is intentionally compact (`crawled` / `inv:seed` / + * `inv:disc`) so a tight table column does not wrap. Full meaning lives in + * the column header tooltip. + * @param props + * @param props.source - The {@link PageSource} value from the query result. + */ +export function SourceBadge({ source }: { source: PageSource }) { + const label = + source === 'inventory-seed' + ? 'inv:seed' + : source === 'inventory-discovered' + ? 'inv:disc' + : 'crawled'; + // Three distinct modifier classes so the audit operator can tell + // inventory-seed (the URL came straight off the list) from + // inventory-discovered (puppeteer-following from a seed) at a glance — + // keeping them on the same hue but at different intensities preserves + // the visual grouping ("both belong to the inventory pass") while + // still being distinguishable. 
+ const className = `source-badge source-badge--${source}`; + return ( + <span className={className} title={source}> + {label} + </span> + ); +} diff --git a/packages/@nitpicker/viewer/web/i18n/translations.ts b/packages/@nitpicker/viewer/web/i18n/translations.ts index f9926b2..30093c7 100644 --- a/packages/@nitpicker/viewer/web/i18n/translations.ts +++ b/packages/@nitpicker/viewer/web/i18n/translations.ts @@ -32,6 +32,8 @@ export const translations: Record<Locale, Record<string, unknown>> = { graph: 'Graph', pageLinks: 'Page Links', errors: 'Errors', + isolatedPages: 'Isolated Pages', + unusedResources: 'Unused Resources', }, common: { loading: 'Loading…', @@ -231,6 +233,30 @@ export const translations: Record<Locale, Record<string, unknown>> = { hosts: 'Hosts', sampleUrls: 'Sample URLs', }, + isolatedPages: { + title: 'Isolated Pages', + description: + 'Internal HTML pages no other page links to (archived roots excluded). Combine with `crawl --inventory` to surface pages that only exist on the server.', + total: '{total} isolated pages', + empty: + 'No isolated pages — every internal HTML page has at least one inbound link.', + url: 'URL', + pageTitle: 'Title', + status: 'Status', + source: 'Source', + }, + unusedResources: { + title: 'Unused Resources', + description: + 'Internal sub-resources no archived page references — candidates for deletion. 
The source badge marks rows registered via `crawl --inventory`.', + total: '{total} unused resources', + empty: 'No unused resources — every internal resource has at least one referrer.', + url: 'URL', + status: 'Status', + contentType: 'Content-Type', + contentLength: 'Content-Length', + source: 'Source', + }, }, }, ja: { @@ -258,6 +284,8 @@ export const translations: Record<Locale, Record<string, unknown>> = { graph: 'グラフ', pageLinks: 'ページリンク', errors: 'エラー', + isolatedPages: '孤立ページ', + unusedResources: '未使用リソース', }, common: { loading: '読み込み中…', @@ -457,6 +485,31 @@ export const translations: Record<Locale, Record<string, unknown>> = { hosts: 'ホスト', sampleUrls: 'サンプル URL', }, + isolatedPages: { + title: '孤立ページ', + description: + '他のページからリンクされていない内部 HTML ページ(クロール起点は除外)。`crawl --inventory` と組み合わせて、サーバー上にしか存在しないページを浮かび上がらせます。', + total: '孤立ページ {total} 件', + empty: + '孤立ページなし — すべての内部 HTML ページに少なくとも 1 つの被リンクがあります。', + url: 'URL', + pageTitle: 'タイトル', + status: 'ステータス', + source: 'ソース', + }, + unusedResources: { + title: '未使用リソース', + description: + 'どのページからも参照されていない内部サブリソース — 削除候補。ソースバッジは `crawl --inventory` 経由で登録された行を示します。', + total: '未使用リソース {total} 件', + empty: + '未使用リソースなし — すべての内部リソースが少なくとも 1 つのページから参照されています。', + url: 'URL', + status: 'ステータス', + contentType: 'Content-Type', + contentLength: 'Content-Length', + source: 'ソース', + }, }, }, }; diff --git a/packages/@nitpicker/viewer/web/routes/isolated-pages-view.tsx b/packages/@nitpicker/viewer/web/routes/isolated-pages-view.tsx new file mode 100644 index 0000000..6f5da90 --- /dev/null +++ b/packages/@nitpicker/viewer/web/routes/isolated-pages-view.tsx @@ -0,0 +1,65 @@ +import { useIsolatedPages } from '../api/use-isolated-pages.js'; +import { SourceBadge } from '../components/source-badge.js'; +import { ViewHeader } from '../components/view-header.js'; +import { useI18n } from '../i18n/use-i18n.js'; + +/** + * Isolated pages view — lists internal HTML pages that no other page + * anchors to, excluding archived 
roots. The `source` badge on each row + * tells the operator whether a page was discovered by the original + * crawl (its links went missing later) or supplied via + * `crawl --inventory` (the URL only exists on the server, never linked). + * @returns The isolated pages view element. + */ +export function IsolatedPagesView() { + const { t } = useI18n(); + const { data, isLoading, error } = useIsolatedPages(); + + if (isLoading) { + return <div className="state">{t('common.loading')}</div>; + } + if (error) { + return <div className="state state-error">{error.message}</div>; + } + if (!data) { + return null; + } + + return ( + <div> + <ViewHeader + titleKey="views.isolatedPages.title" + descriptionKey="views.isolatedPages.description" + /> + <p className="state">{t('views.isolatedPages.total', { total: data.total })}</p> + {data.items.length === 0 ? ( + <div className="state">{t('views.isolatedPages.empty')}</div> + ) : ( + <table className="data-table"> + <thead> + <tr> + <th>{t('views.isolatedPages.url')}</th> + <th>{t('views.isolatedPages.pageTitle')}</th> + <th>{t('views.isolatedPages.status')}</th> + <th>{t('views.isolatedPages.source')}</th> + </tr> + </thead> + <tbody> + {data.items.map((row) => ( + <tr key={row.url}> + <td> + <code>{row.url}</code> + </td> + <td>{row.title ?? '—'}</td> + <td>{row.status ?? 
'—'}</td> + <td> + <SourceBadge source={row.source} /> + </td> + </tr> + ))} + </tbody> + </table> + )} + </div> + ); +} diff --git a/packages/@nitpicker/viewer/web/routes/unused-resources-view.tsx b/packages/@nitpicker/viewer/web/routes/unused-resources-view.tsx new file mode 100644 index 0000000..3dfd7e3 --- /dev/null +++ b/packages/@nitpicker/viewer/web/routes/unused-resources-view.tsx @@ -0,0 +1,67 @@ +import { useUnusedResources } from '../api/use-unused-resources.js'; +import { SourceBadge } from '../components/source-badge.js'; +import { ViewHeader } from '../components/view-header.js'; +import { useI18n } from '../i18n/use-i18n.js'; + +/** + * Unused resources view — lists internal sub-resources that no archived + * page references. The `source` badge on each row tells the operator + * whether a resource was once referenced and lost its referrers + * (`crawled`) or was registered straight from the server file list + * (`inventory-seed`) — useful when deciding what to delete. + * @returns The unused resources view element. + */ +export function UnusedResourcesView() { + const { t } = useI18n(); + const { data, isLoading, error } = useUnusedResources(); + + if (isLoading) { + return <div className="state">{t('common.loading')}</div>; + } + if (error) { + return <div className="state state-error">{error.message}</div>; + } + if (!data) { + return null; + } + + return ( + <div> + <ViewHeader + titleKey="views.unusedResources.title" + descriptionKey="views.unusedResources.description" + /> + <p className="state">{t('views.unusedResources.total', { total: data.total })}</p> + {data.items.length === 0 ? 
( + <div className="state">{t('views.unusedResources.empty')}</div> + ) : ( + <table className="data-table"> + <thead> + <tr> + <th>{t('views.unusedResources.url')}</th> + <th>{t('views.unusedResources.status')}</th> + <th>{t('views.unusedResources.contentType')}</th> + <th>{t('views.unusedResources.contentLength')}</th> + <th>{t('views.unusedResources.source')}</th> + </tr> + </thead> + <tbody> + {data.items.map((row) => ( + <tr key={row.url}> + <td> + <code>{row.url}</code> + </td> + <td>{row.status ?? '—'}</td> + <td>{row.contentType ?? '—'}</td> + <td>{row.contentLength ?? '—'}</td> + <td> + <SourceBadge source={row.source} /> + </td> + </tr> + ))} + </tbody> + </table> + )} + </div> + ); +} diff --git a/packages/@nitpicker/viewer/web/styles.css b/packages/@nitpicker/viewer/web/styles.css index 6de3358..b5c64c0 100644 --- a/packages/@nitpicker/viewer/web/styles.css +++ b/packages/@nitpicker/viewer/web/styles.css @@ -1000,3 +1000,33 @@ button.bar-row:hover { transition: none; } } + +/* + * Provenance badge used by the Isolated Pages / Unused Resources views to + * mark which channel inserted a row. Three styles so an operator can + * scan a long table and spot inventory-sourced rows at a glance. 
+ */ +.source-badge { + display: inline flow-root; + padding-block: 1px; + padding-inline: 6px; + font-family: monospace; + font-size: var(--font-size-xs); + white-space: nowrap; + border-radius: 3px; +} + +.source-badge--crawled { + color: var(--text-dim); + background: color-mix(in srgb, var(--text-dim) 14%, transparent); +} + +.source-badge--inventory-seed { + color: var(--badge-warn-fg); + background: var(--badge-warn-bg); +} + +.source-badge--inventory-discovered { + color: var(--badge-warn-fg); + background: color-mix(in srgb, var(--badge-warn-fg) 10%, transparent); +} diff --git a/packages/test-server/src/__tests__/e2e/inventory.e2e.ts b/packages/test-server/src/__tests__/e2e/inventory.e2e.ts new file mode 100644 index 0000000..19e0942 --- /dev/null +++ b/packages/test-server/src/__tests__/e2e/inventory.e2e.ts @@ -0,0 +1,128 @@ +import crypto from 'node:crypto'; +import fs from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; + +import { Archive, CrawlerOrchestrator } from '@nitpicker/crawler'; +import { listIsolatedPages, listUnusedResources } from '@nitpicker/query'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; + +/** + * Run a baseline crawl rooted at the test-server top page, then close the + * archive so the caller can re-open it via `Archive.open` / `inventory()`. + * The baseline reaches `/`, `/about`, and a handful of resource URLs — but + * NOT the `/inventory/*` fixture routes (they are deliberately unlinked). + * @param urls - One or more URLs to crawl. + * @returns Paths to the produced archive and its cwd. 
+ */ +async function crawlAndPersist( + urls: string[], +): Promise<{ filePath: string; cwd: string }> { + const cwd = path.join(os.tmpdir(), `nitpicker-inventory-${crypto.randomUUID()}`); + await fs.mkdir(cwd, { recursive: true }); + + const orchestrator = await CrawlerOrchestrator.crawling(urls, { + cwd, + interval: 0, + parallels: 1, + image: false, + fetchExternal: false, + }); + const filePath = orchestrator.archive.filePath; + await orchestrator.write(); + await orchestrator.archive.close(); + orchestrator.garbageCollect(); + + return { filePath, cwd }; +} + +describe('Inventory crawl', () => { + let filePath: string; + let cwd: string; + let accessor: Archive; + + beforeAll(async () => { + // 1) Baseline crawl reaches only the top page and its anchors — + // /inventory/* routes are never linked from the crawl-reachable graph. + const baseline = await crawlAndPersist(['http://localhost:8010/']); + filePath = baseline.filePath; + cwd = baseline.cwd; + + // 2) Inventory pass with both an HTML URL and a non-HTML URL the + // crawl could not reach. The HTML one (hidden-lp) should be + // rendered + drive recursive discovery of inner-link; the PDF + // should be HEAD-only and land directly in resources. + const orchestrator = await CrawlerOrchestrator.inventory( + filePath, + [ + 'http://localhost:8010/inventory/hidden-lp', + 'http://localhost:8010/inventory/orphan.pdf', + ], + { cwd }, + ); + await orchestrator.write(); + await orchestrator.archive.close(); + orchestrator.garbageCollect(); + + // Re-open for read-side assertions via the writer-side `Archive.open` + // (it expands the tar into a working tmpDir, then we close it after + // the suite). `Archive.connect` only works on stub directories, not + // on packed `.nitpicker` tars. 
+ accessor = await Archive.open({ filePath, cwd }); + }, 120_000); + + afterAll(async () => { + if (accessor) { + await accessor.close(); + } + await fs.rm(cwd, { recursive: true, force: true }); + }); + + it('labels the URL-list HTML page as inventory-seed', async () => { + const rows = await listIsolatedPages(accessor, { limit: 50 }); + const hidden = rows.items.find( + (row) => row.url === 'http://localhost:8010/inventory/hidden-lp', + ); + expect(hidden, 'hidden-lp must be present in isolated pages').toBeDefined(); + expect(hidden?.source).toBe('inventory-seed'); + }); + + it('labels pages discovered by following links from a seed as inventory-discovered', async () => { + // inner-link is anchored from hidden-lp, so by definition it is NOT + // isolated — the listIsolatedPages helper rightly filters it out. + // Probe the raw `pages` row directly to verify the source label was + // written; this is the only assertion that nails down the + // `derivePageSource` → `Archive.setPage` → DB INSERT path for the + // inventory-discovered case. + const knex = accessor.getKnex(); + const [row] = (await knex('pages') + .select('source') + .where('url', 'http://localhost:8010/inventory/inner-link')) as { + source: string; + }[]; + expect( + row, + 'inner-link must have been inserted by the recursive crawl', + ).toBeDefined(); + expect(row?.source).toBe('inventory-discovered'); + }); + + it('records a non-HTML URL from the inventory list directly in resources as inventory-seed', async () => { + const rows = await listUnusedResources(accessor, { limit: 50 }); + const orphan = rows.items.find( + (row) => row.url === 'http://localhost:8010/inventory/orphan.pdf', + ); + expect(orphan, 'orphan.pdf must be present in unused resources').toBeDefined(); + expect(orphan?.source).toBe('inventory-seed'); + expect(orphan?.contentType).toContain('pdf'); + }); + + // Note: a "second inventory pass keeps the existing source label" + // scenario is intentionally left out here. 
The non-destructive property + // is enforced at the SQL layer — `#getIdByUrl`'s `ON CONFLICT IGNORE` + // path is exercised by the migration / database specs, and the + // existing-URL filter (`getExistingPageUrls`) keeps the second pass + // from even reaching the INSERT — so an E2E re-pass would only retest + // what those unit tests already cover, at the cost of running a full + // browser-render crawl twice in CI. +}); diff --git a/packages/test-server/src/routes/inventory.ts b/packages/test-server/src/routes/inventory.ts new file mode 100644 index 0000000..9eec2b7 --- /dev/null +++ b/packages/test-server/src/routes/inventory.ts @@ -0,0 +1,40 @@ +import type { Hono } from 'hono'; + +/** + * Registers "inventory fixture" routes — pages and files that exist on the + * server but no crawl-visible page links to. The `crawl --inventory` + * feature is built to surface these: + * + * - `/inventory/hidden-lp` — an HTML landing page no anchor points at. + * Used by the orchestrator E2E to verify `'inventory-seed'` labelling. + * - `/inventory/inner-link` — linked only from `/inventory/hidden-lp`, + * so it surfaces ONLY when the hidden LP is rendered as a seed and the + * recursive crawl follows the link. Used to verify + * `'inventory-discovered'` labelling. + * - `/inventory/orphan.pdf` — a non-HTML asset no page references. Used + * to verify that HEAD-only inventory inserts surface as + * `'inventory-seed'` in `resources`. + * @param app - The Hono application instance to register routes on. + */ +export function inventoryRoutes(app: Hono) { + app.get('/inventory/hidden-lp', (c) => + c.html( + `<!doctype html><html lang="en"><head><title>Hidden LP +Inner link +`, + ), + ); + + app.get('/inventory/inner-link', (c) => + c.html( + `Inner Link Page +

Discovered via the hidden LP.

+`, + ), + ); + + app.get('/inventory/orphan.pdf', (c) => { + c.header('Content-Type', 'application/pdf'); + return c.body('%PDF-1.4 stub body for tests'); + }); +} diff --git a/packages/test-server/src/server.ts b/packages/test-server/src/server.ts index 2864812..777daf3 100644 --- a/packages/test-server/src/server.ts +++ b/packages/test-server/src/server.ts @@ -7,6 +7,7 @@ import { basicRoutes } from './routes/basic.js'; import { errorStatusRoutes } from './routes/error-status.js'; import { excludeRoutes } from './routes/exclude.js'; import { flakyRoutes } from './routes/flaky.js'; +import { inventoryRoutes } from './routes/inventory.js'; import { metaRoutes } from './routes/meta.js'; import { optionsRoutes } from './routes/options.js'; import { paginationRoutes } from './routes/pagination.js'; @@ -35,6 +36,7 @@ export function createApp() { scrollJackRoutes(app); resourceReuseRoutes(app); flakyRoutes(app); + inventoryRoutes(app); return app; }