Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 42 additions & 4 deletions packages/@d-zero/beholder/src/scraper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,18 @@
const log = scraperLog.extend(pid);
const rLog = resourceLog.extend(pid);

/**
* Upper bound for `document.body.scrollHeight` tolerated by `#fetchImages`.
* Pages exceeding this at a given device preset are skipped to keep
* `scrollAllOver` from running long enough to outlast the `@retryable`

Check warning on line 44 in packages/@d-zero/beholder/src/scraper.ts

View workflow job for this annotation

GitHub Actions / test (macOS-latest, 24)

Unexpected inline JSDoc tag. Did you mean to use {@retryable}, \@retryable, or `@retryable`?

Check warning on line 44 in packages/@d-zero/beholder/src/scraper.ts

View workflow job for this annotation

GitHub Actions / test (windows-latest, 24)

Unexpected inline JSDoc tag. Did you mean to use {@retryable}, \@retryable, or `@retryable`?
* timeout and collide with a follow-up retry on the same Puppeteer page.
*
* 1,000,000 px is roughly 3× the worst real-world value we have measured
* (a responsive data-table page reached ~321k px at 320px viewport), so
* normal responsive sites complete well within the 20 min retry budget.
*/
const MAX_SCROLL_HEIGHT = 1_000_000; // pixels; handed to beforePageScan as `maxScrollHeight`

/**
* Page-level scraper that extracts data from a single browser page.
*
Expand Down Expand Up @@ -698,9 +710,23 @@
* changes and triggers a reload. Isolating each device preset allows partial
* results — if one viewport fails, the other can still succeed.
*
* WHY retryable with 5-min timeout and `fallback: []`: Image extraction is
* WHY retryable with 20-min timeout and `fallback: []`: Image extraction is
* best-effort. If all retries fail, an empty array is returned rather than
* failing the entire page scrape.
* failing the entire page scrape. The 20-min wall clock accommodates pages
* whose mobile-small `scrollHeight` reaches ~300k px (observed on
* responsive data tables, which take ~5 min to scroll). A shorter timeout
* causes a second retry to start while the previous attempt's
* `scrollAllOver` is still running its `page.evaluate` calls in the
* background — `Promise.race` in `retry.ts` does not cancel `fn()`. The
* collision then surfaces as "Attempted to use detached Frame" or
* "Session closed" when the new attempt's reload / setViewport runs on
* the same page as the old attempt's pending evaluates.
*
* WHY pass `maxScrollHeight`: Even 20 min is not enough for pathological
* pages whose layout explodes at narrow viewports. Skipping the device
* preset entirely keeps the timeout-vs-background-evaluate collision from
* ever being triggered, at the cost of losing that viewport's image data
* for those pages. See {@link MAX_SCROLL_HEIGHT} for the chosen threshold.
* @param page - Puppeteer page instance
* @param url - The page URL string (without hash and auth)
* @param isExternal - Whether the page is external
Expand All @@ -709,7 +735,7 @@
* @returns Array of image elements from all device presets (may be partial if some viewports failed)
*/
@retryable({
timeout: 5 * 60 * 1000,
timeout: 20 * 60 * 1000,
fallback: [],
onWait(this: Scraper, determinedInterval, retryCount, methodName, error) {
void this.emit('changePhase', {
Expand Down Expand Up @@ -754,14 +780,26 @@
message: `📷 ${key} ↔️ ${preset.width}px`,
});

await beforePageScan(page, url, {
const scanResult = await beforePageScan(page, url, {
name: key,
width: preset.width,
resolution: preset.resolution,
listener,
timeout: 5000,
maxScrollHeight: MAX_SCROLL_HEIGHT,
});

if (!scanResult.scrolled) {
void this.emit('changePhase', {
pid: process.pid,
name: 'retryExhausted',
url: null,
isExternal: false,
message: `📷 ${key}: skipped — scrollHeight ${scanResult.scrollHeight} exceeds limit ${MAX_SCROLL_HEIGHT}`,
});
continue;
}

void this.emit('changePhase', {
pid: process.pid,
name: 'waitImageLoad',
Expand Down
104 changes: 101 additions & 3 deletions packages/@d-zero/puppeteer-page-scan/src/before-page-scan.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,15 @@ vi.mock('@d-zero/puppeteer-scroll', () => ({

/**
*
* @param scrollHeight
*/
function createMockPage(): Page {
function createMockPage(scrollHeight = 0): Page {
return {
url: vi.fn(() => 'about:blank'),
setViewport: vi.fn(() => Promise.resolve()),
goto: vi.fn(() => Promise.resolve()),
reload: vi.fn(() => Promise.resolve()),
evaluate: vi.fn(() => Promise.resolve()),
evaluate: vi.fn(() => Promise.resolve(scrollHeight)),
} as unknown as Page;
}

Expand Down Expand Up @@ -130,7 +131,7 @@ describe('beforePageScan → hooks の呼び出し', () => {
name: 'test',
width: 1024,
}),
).resolves.toBeUndefined();
).resolves.toEqual({ scrolled: true, scrollHeight: 0 });
});

it('hooks の途中で throw した場合、後続の hook は呼ばれず例外が伝搬する', async () => {
Expand Down Expand Up @@ -171,3 +172,100 @@ describe('beforePageScan → hooks の呼び出し', () => {
});
});
});

/**
 * Specs for the `maxScrollHeight` guard in `beforePageScan`.
 *
 * The mock page resolves every `page.evaluate` call with the scrollHeight
 * passed to `createMockPage`, so each case controls the measured height
 * directly and then checks (a) the returned `{ scrolled, scrollHeight }`
 * and (b) whether the mocked `scrollAllOver` was invoked.
 */
describe('beforePageScan → maxScrollHeight ガード', () => {
	beforeEach(() => {
		// Reset recorded calls so every case asserts only on its own invocations.
		vi.mocked(scrollAllOver).mockClear();
	});

	it('scrollHeight が maxScrollHeight を超えるとき scrollAllOver を呼ばず scrolled:false を返す', async () => {
		// Over the limit: the scroll is skipped and the listener is told why.
		const page = createMockPage(2_000_000);
		const listener = vi.fn();

		const result = await beforePageScan(page, 'https://example.com', {
			name: 'mobile-small',
			width: 320,
			maxScrollHeight: 1_000_000,
			listener,
		});

		expect(result).toEqual({ scrolled: false, scrollHeight: 2_000_000 });
		expect(scrollAllOver).not.toHaveBeenCalled();
		expect(listener).toHaveBeenCalledWith('hook', {
			name: 'mobile-small',
			message: 'Skipped scroll: scrollHeight 2000000 exceeds limit 1000000',
		});
	});

	it('scrollHeight が maxScrollHeight 以下のとき scrollAllOver を呼んで scrolled:true を返す', async () => {
		// Under the limit: the scroll runs as usual.
		const page = createMockPage(500_000);

		const result = await beforePageScan(page, 'https://example.com', {
			name: 'mobile-small',
			width: 320,
			maxScrollHeight: 1_000_000,
		});

		expect(result).toEqual({ scrolled: true, scrollHeight: 500_000 });
		expect(scrollAllOver).toHaveBeenCalledTimes(1);
	});

	it('maxScrollHeight 未指定のときは scrollHeight にかかわらず scrollAllOver を呼ぶ', async () => {
		// No limit supplied: the guard is disabled regardless of page height.
		const page = createMockPage(99_999_999);

		const result = await beforePageScan(page, 'https://example.com', {
			name: 'test',
			width: 1024,
		});

		expect(result).toEqual({ scrolled: true, scrollHeight: 99_999_999 });
		expect(scrollAllOver).toHaveBeenCalledTimes(1);
	});

	it('scrollHeight が maxScrollHeight と等しいとき(境界)は scroll する', async () => {
		// Boundary: the comparison is strict, so a height equal to the limit scrolls.
		const page = createMockPage(1_000_000);

		const result = await beforePageScan(page, 'https://example.com', {
			name: 'test',
			width: 320,
			maxScrollHeight: 1_000_000,
		});

		expect(result).toEqual({ scrolled: true, scrollHeight: 1_000_000 });
		expect(scrollAllOver).toHaveBeenCalledTimes(1);
	});

	it('maxScrollHeight: 0 を指定すると undefined と区別され、scrollHeight が 0 のときのみ scroll する', async () => {
		// Phase 1: a height of 0 does not exceed a limit of 0, so the scroll runs.
		const emptyPage = createMockPage(0);

		const scrolledResult = await beforePageScan(emptyPage, 'https://example.com', {
			name: 'test',
			width: 320,
			maxScrollHeight: 0,
		});

		expect(scrolledResult).toEqual({ scrolled: true, scrollHeight: 0 });
		expect(scrollAllOver).toHaveBeenCalledTimes(1);

		// Phase 2: any positive height must be skipped when the limit is 0.
		// This is the half that actually separates `0` from `undefined`; phase 1
		// alone would also pass if the guard treated a falsy limit as "no limit".
		vi.mocked(scrollAllOver).mockClear();
		const nonEmptyPage = createMockPage(1);

		const skippedResult = await beforePageScan(nonEmptyPage, 'https://example.com', {
			name: 'test',
			width: 320,
			maxScrollHeight: 0,
		});

		expect(skippedResult).toEqual({ scrolled: false, scrollHeight: 1 });
		expect(scrollAllOver).not.toHaveBeenCalled();
	});

	it('page.evaluate が reject した場合 beforePageScan も reject し、scrollAllOver は呼ばれない', async () => {
		// A failing measurement must propagate instead of being treated as height 0.
		const page = {
			url: vi.fn(() => 'about:blank'),
			setViewport: vi.fn(() => Promise.resolve()),
			goto: vi.fn(() => Promise.resolve()),
			reload: vi.fn(() => Promise.resolve()),
			evaluate: vi.fn(() =>
				Promise.reject(new Error("Attempted to use detached Frame 'XXX'.")),
			),
		} as unknown as Page;

		await expect(
			beforePageScan(page, 'https://example.com', {
				name: 'mobile-small',
				width: 320,
				maxScrollHeight: 1_000_000,
			}),
		).rejects.toThrow("Attempted to use detached Frame 'XXX'.");
		expect(scrollAllOver).not.toHaveBeenCalled();
	});
});
49 changes: 46 additions & 3 deletions packages/@d-zero/puppeteer-page-scan/src/before-page-scan.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,27 @@ type Options = {
openDisclosures?: boolean;
scrollInterval?: number | DelayOptions;
scrollDistance?: number | DelayOptions;
/**
* Maximum `document.body.scrollHeight` (px) tolerated before `scrollAllOver`
* is skipped. Pages whose post-load scrollHeight exceeds this threshold
* return `{ scrolled: false, scrollHeight }` without scrolling, so callers
* can decide to abandon the device preset rather than letting the scroll
* run unbounded. Omit to disable the check (legacy behavior).
*/
maxScrollHeight?: number;
} & Size;

/**
 * Value resolved by `beforePageScan`, telling the caller whether the
 * full-page scroll ran and what page height the decision was based on.
 */
export type BeforePageScanResult = {
/**
* `true` when `scrollAllOver` ran to completion (or to a stuck bail-out).
* `false` when the scroll was skipped because `scrollHeight` exceeded
* `maxScrollHeight`.
*/
scrolled: boolean;
/**
* `document.body.scrollHeight` measured once, before the `maxScrollHeight`
* check and before any scrolling. Populated in both outcomes; it is not
* re-measured afterwards, so growth that happens during the scroll is not
* reflected here.
*/
scrollHeight: number;
};

/**
* Open all disclosure elements on the page
* This function loops until all disclosure elements are expanded,
Expand Down Expand Up @@ -88,12 +107,17 @@ async function openAllDisclosures(
* @param url
* @param options
*/
export async function beforePageScan(page: Page, url: string, options?: Options) {
export async function beforePageScan(
page: Page,
url: string,
options?: Options,
): Promise<BeforePageScanResult> {
const listener = options?.listener;
const name = options?.name ?? 'default';
const width = options?.width ?? 1400;
const resolution = options?.resolution;
const timeout = options?.timeout || 5000;
const maxScrollHeight = options?.maxScrollHeight;
const countDownId = `${name}${url}_timeout`;

listener?.('setViewport', { name, width, resolution });
Expand Down Expand Up @@ -131,6 +155,23 @@ export async function beforePageScan(page: Page, url: string, options?: Options)
});
}

// WHY measure before scrollAllOver: pathological pages can have a
// post-load scrollHeight of millions of pixels (e.g. responsive data
// tables that expand to ~321k px at 320px viewport, and worse cases exist).
// `scrollAllOver` has no upper bound, so without this guard it can run
// for tens of minutes — long enough to exceed any reasonable retry
// timeout, leaving the scroll's page.evaluate calls executing in the
// background while the next retry attempts to use the same page.
const scrollHeight = await page.evaluate(() => document.body.scrollHeight);

if (maxScrollHeight !== undefined && scrollHeight > maxScrollHeight) {
listener?.('hook', {
name,
message: `Skipped scroll: scrollHeight ${scrollHeight} exceeds limit ${maxScrollHeight}`,
});
return { scrolled: false, scrollHeight };
}

listener?.('scroll', {
name,
scrollY: 0,
Expand All @@ -140,9 +181,11 @@ export async function beforePageScan(page: Page, url: string, options?: Options)
await scrollAllOver(page, {
interval: options?.scrollInterval,
distance: options?.scrollDistance,
logger: (scrollY, scrollHeight, message) =>
listener?.('scroll', { name, scrollY, scrollHeight, message }),
logger: (scrollY, scrollHeightCurrent, message) =>
listener?.('scroll', { name, scrollY, scrollHeight: scrollHeightCurrent, message }),
});

return { scrolled: true, scrollHeight };
}

/**
Expand Down
1 change: 1 addition & 0 deletions packages/@d-zero/puppeteer-page-scan/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
export { beforePageScan } from './before-page-scan.js';
export type { BeforePageScanResult } from './before-page-scan.js';
export {
defaultSizes,
devicePresets,
Expand Down
Loading