diff --git a/packages/@d-zero/beholder/README.md b/packages/@d-zero/beholder/README.md
index 5a462994..290e7fd2 100644
--- a/packages/@d-zero/beholder/README.md
+++ b/packages/@d-zero/beholder/README.md
@@ -32,3 +32,29 @@ if (result.type === 'success') {
```
設計判断(イベントではなく戻り値で返す理由、`page` のライフサイクル責務、リトライ機構など)は `src/scraper.ts` の JSDoc を参照。
+
+## DOM 文字列からメタ抽出(Puppeteer なし)
+
+HTML 文字列を jsdom などでパースしてから `Meta` を取り出したい場合、`extractMetaFromDocument` を使う。`Scraper` が内部で呼ぶ `collectHead → detectTags → classify` パイプラインと同じ実装を再利用するため、戻り値の `Meta` 形状は `scrapeStart` と同一。DOM ライブラリ(jsdom 等)はユーザランドの責務。
+
+```ts
+import { extractMetaFromDocument } from '@d-zero/beholder';
+import { JSDOM } from 'jsdom';
+
+const url = 'https://example.com/';
+const html = await (await fetch(url)).text();
+const dom = new JSDOM(html, { url });
+
+// `as unknown as Window` は jsdom の `DOMWindow` 型が lib.dom の `Window` と
+// 構造的に完全一致しないための型キャスト。ランタイムでは互換。
+const meta = await extractMetaFromDocument(dom.window as unknown as Window, {
+ url,
+ html,
+});
+
+console.log(meta.title);
+console.log(meta.og?.image);
+console.log(meta.tags.entries);
+```
+
+`context.html` を省略すると、代わりに `window.document.documentElement.outerHTML` が使われる。ただし Wappalyzer の HTML パターンはスクリプト実行前の生 HTML に合わせて作られているので、可能なら取得直後の HTML 文字列を明示的に渡す方が検出が安定する。
diff --git a/packages/@d-zero/beholder/package.json b/packages/@d-zero/beholder/package.json
index bd2712a1..25fdc88b 100644
--- a/packages/@d-zero/beholder/package.json
+++ b/packages/@d-zero/beholder/package.json
@@ -27,7 +27,9 @@
"simple-wappalyzer": "1.1.99"
},
"devDependencies": {
- "@types/debug": "4.1.12"
+ "@types/debug": "4.1.12",
+ "@types/jsdom": "28.0.3",
+ "jsdom": "29.1.1"
},
"repository": {
"type": "git",
diff --git a/packages/@d-zero/beholder/src/dom-evaluation.ts b/packages/@d-zero/beholder/src/dom-evaluation.ts
index 6efa8c93..ddbf8c78 100644
--- a/packages/@d-zero/beholder/src/dom-evaluation.ts
+++ b/packages/@d-zero/beholder/src/dom-evaluation.ts
@@ -22,6 +22,7 @@ import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
import { domDetailsLog, domLog } from './debug.js';
import { classify, emptyMeta } from './meta/classify.js';
+import { WINDOW_GLOBALS_TO_CHECK, collectHeadFromDocument } from './meta/collect-head.js';
import { detectTags } from './meta/tag-detection.js';
import { parseUrl } from './parse-url.js';
@@ -515,46 +516,6 @@ export type GetMetaContext = {
readonly includeRaw?: boolean;
};
-const WINDOW_GLOBALS_TO_CHECK: readonly string[] = [
- 'dataLayer',
- 'gtag',
- 'ga',
- '_gaq',
- 'fbq',
- '_fbq',
- 'clarity',
- '_hjSettings',
- '_hjid',
- 'twq',
- 'ttq',
- '_linkedin_partner_id',
- 'pintrk',
- 'amplitude',
- 'mixpanel',
- 'analytics',
- 'heap',
- 'posthog',
- 'plausible',
- 'fathom',
- '_paq',
- 's_account',
- 's',
- 'ym',
- 'UET',
- 'optimizely',
- '_hsq',
- 'Sentry',
- 'Intercom',
- 'intercomSettings',
- 'drift',
- 'Tawk_API',
- 'zE',
- 'OneTrust',
- 'Cookiebot',
- 'Stripe',
- 'grecaptcha',
-];
-
/**
* Extracts comprehensive metadata from the page.
*
@@ -639,129 +600,27 @@ async function runGetMeta(page: Page, context: GetMetaContext): Promise` entries from a Puppeteer page by injecting
+ * {@link collectHeadFromDocument} into the page realm.
*
- * @param page
+ * WHY string-eval instead of `page.evaluate(fn, args)`: the shared
+ * implementation (`collectHeadFromDocument`) is imported from
+ * `./meta/collect-head.js`, and a
+ * `page.evaluate(() => collectHeadFromDocument(window, …))` wrapper cannot
+ * reach that imported binding inside the page realm — only the wrapper's
+ * own source crosses the CDP boundary. Serializing the implementation with
+ * `Function.prototype.toString` and running it via `page.evaluate(string)`
+ * keeps the Puppeteer path and the jsdom path on one source of truth.
+ *
+ * The same {@link collectHeadFromDocument} function is also exposed via
+ * {@link ./extract-meta.ts | extractMetaFromDocument} for jsdom/Node callers,
+ * so the two paths cannot drift apart.
+ * @param page - The Puppeteer page whose document will be inspected.
*/
async function collectHeadOnPage(page: Page): Promise {
- const raw = await page
- .evaluate((knownGlobals: readonly string[]) => {
- /* global document, HTMLLinkElement, HTMLMetaElement, HTMLBaseElement,
- HTMLScriptElement, HTMLIFrameElement */
- type Out = unknown;
- const entries: Out[] = [];
-
- const html = document.documentElement;
- entries.push(
- {
- kind: 'html',
- lang: html.lang || undefined,
- dir: html.dir || undefined,
- xmlns: html.getAttribute('xmlns') ?? undefined,
- prefix: html.getAttribute('prefix') ?? undefined,
- vocab: html.getAttribute('vocab') ?? undefined,
- typeOf: html.getAttribute('typeof') ?? undefined,
- itemscope: html.hasAttribute('itemscope') || undefined,
- itemtype: html.getAttribute('itemtype') ?? undefined,
- amp: html.hasAttribute('amp') || undefined,
- lightning: html.hasAttribute('⚡') || undefined,
- },
- { kind: 'title', content: document.title },
- );
-
- for (const base of document.querySelectorAll('base')) {
- if (!(base instanceof HTMLBaseElement)) continue;
- entries.push({
- kind: 'base',
- href: base.getAttribute('href') ?? undefined,
- target: base.getAttribute('target') ?? undefined,
- });
- }
-
- for (const meta of document.querySelectorAll('meta')) {
- if (!(meta instanceof HTMLMetaElement)) continue;
- const name = meta.getAttribute('name');
- const property = meta.getAttribute('property');
- const httpEquiv = meta.getAttribute('http-equiv');
- const itemprop = meta.getAttribute('itemprop');
- const charset = meta.getAttribute('charset');
- const content = meta.getAttribute('content');
- const media = meta.getAttribute('media');
- entries.push({
- kind: 'meta',
- name: name ? name.toLowerCase() : undefined,
- property: property ? property.toLowerCase() : undefined,
- httpEquiv: httpEquiv ? httpEquiv.toLowerCase() : undefined,
- itemprop: itemprop ?? undefined,
- charset: charset ?? undefined,
- content: content ?? undefined,
- media: media ?? undefined,
- });
- }
-
- for (const link of document.querySelectorAll('link[href]')) {
- if (!(link instanceof HTMLLinkElement)) continue;
- const relRaw = link.getAttribute('rel') ?? '';
- const rel = relRaw.toLowerCase().split(/\s+/u).filter(Boolean);
- entries.push({
- kind: 'link',
- rel,
- href: link.getAttribute('href') ?? '',
- type: link.getAttribute('type') ?? undefined,
- media: link.getAttribute('media') ?? undefined,
- sizes: link.getAttribute('sizes') ?? undefined,
- title: link.getAttribute('title') ?? undefined,
- hreflang: link.getAttribute('hreflang') ?? undefined,
- as: link.getAttribute('as') ?? undefined,
- crossorigin: link.getAttribute('crossorigin') ?? undefined,
- color: link.getAttribute('color') ?? undefined,
- blocking: link.getAttribute('blocking') ?? undefined,
- imagesrcset: link.getAttribute('imagesrcset') ?? undefined,
- });
- }
-
- const STRUCTURED_TYPES = new Set([
- 'application/ld+json',
- 'speculationrules',
- 'application/json+oembed',
- 'application/xml+oembed',
- ]);
- for (const script of document.querySelectorAll('script[type]')) {
- if (!(script instanceof HTMLScriptElement)) continue;
- const scriptType = (script.getAttribute('type') ?? '').toLowerCase();
- if (!STRUCTURED_TYPES.has(scriptType)) continue;
- const src = script.getAttribute('src') ?? undefined;
- const text = script.textContent ?? '';
- const inHead = !!script.closest('head');
- const inNoscript = !!script.closest('noscript');
- const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
- entries.push({
- kind: 'script',
- scriptType,
- content: text || undefined,
- src,
- location,
- });
- }
-
- for (const iframe of document.querySelectorAll('iframe[src]')) {
- if (!(iframe instanceof HTMLIFrameElement)) continue;
- const src = iframe.getAttribute('src') ?? '';
- if (!src) continue;
- const inHead = !!iframe.closest('head');
- const inNoscript = !!iframe.closest('noscript');
- const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
- entries.push({ kind: 'iframe', src, location });
- }
-
- const win = window as unknown as Record;
- const presentGlobals = knownGlobals.filter((name) => win[name] !== undefined);
- if (presentGlobals.length > 0) {
- entries.push({ kind: 'window-global', names: presentGlobals });
- }
-
- return entries;
- }, WINDOW_GLOBALS_TO_CHECK)
- .catch(() => [] as unknown[]);
-
+ const fnSource = collectHeadFromDocument.toString();
+ const globalsLiteral = JSON.stringify(WINDOW_GLOBALS_TO_CHECK);
+ const expr = `(${fnSource})(window, ${globalsLiteral})`;
+ const raw = await page.evaluate(expr).catch(() => [] as unknown[]);
return raw as RawHeadEntry[];
}
diff --git a/packages/@d-zero/beholder/src/extract-meta.spec.ts b/packages/@d-zero/beholder/src/extract-meta.spec.ts
new file mode 100644
index 00000000..582a5edb
--- /dev/null
+++ b/packages/@d-zero/beholder/src/extract-meta.spec.ts
@@ -0,0 +1,247 @@
+import { JSDOM } from 'jsdom';
+import { describe, expect, it } from 'vitest';
+
+import { extractMetaFromDocument } from './extract-meta.js';
+
+const URL = 'https://example.com/';
+
+/**
+ * Test helper: parses an HTML string into a jsdom instance whose document URL is the shared `URL` constant.
+ * @param html - HTML markup handed to the `JSDOM` constructor.
+ */
+function mkDom(html: string): JSDOM {
+ return new JSDOM(html, { url: URL });
+}
+
+/**
+ * Test helper: returns `dom.window` typed as the lib.dom `Window`, double-casting through `unknown` (type-level only).
+ * @param dom - The jsdom instance whose window is returned.
+ */
+function asWindow(dom: JSDOM): Window {
+ return dom.window as unknown as Window;
+}
+
+describe('extractMetaFromDocument', () => {
+ it('extracts , lang and basic ', async () => {
+ const html = `
+
+
+ Example Title
+
+
+
+
+ `;
+ const dom = mkDom(html);
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+
+ expect(meta.title).toBe('Example Title');
+ expect(meta.lang).toBe('ja');
+ expect(meta.description).toBe('An example page');
+ expect(meta.keywords).toBe('a, b, c');
+ });
+
+ it('parses og:* and twitter:* meta tags', async () => {
+ const html = `
+
+
+ OG
+
+
+
+
+
+
+
+
+ `;
+ const dom = mkDom(html);
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+
+ expect(meta.og?.title).toBe('OG Title');
+ expect(meta.og?.type).toBe('article');
+ expect(meta.og?.image).toEqual([
+ 'https://example.com/a.png',
+ 'https://example.com/b.png',
+ ]);
+ expect(meta.twitter?.card).toBe('summary_large_image');
+ expect(meta.twitter?.site).toBe('@example');
+ });
+
+ it('parses viewport, robots and theme-color (with media branches)', async () => {
+ const html = `
+
+
+ X
+
+
+
+
+
+
+ `;
+ const dom = mkDom(html);
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+
+ expect(meta.viewport?.width).toBe('device-width');
+ expect(meta.viewport?.initialScale).toBe(1);
+ expect(meta.robots?.noindex).toBe(true);
+ expect(meta.robots?.nofollow).toBe(true);
+ expect(meta.themeColor).toBe('#000000');
+ expect(meta.themeColorDark).toBe('#111111');
+ expect(meta.themeColorLight).toBe('#eeeeee');
+ });
+
+ it('captures and alternate hreflang', async () => {
+ const html = `
+
+
+ L
+
+
+
+
+ `;
+ const dom = mkDom(html);
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+
+ expect(meta.link?.canonical).toBe('https://example.com/canonical');
+ const hreflangs = meta.link?.alternateHreflang.map((e) => e.hreflang) ?? [];
+ expect(hreflangs).toEqual(['en', 'ja']);
+ });
+
+ it('parses inline JSON-LD scripts', async () => {
+ const data = { '@context': 'https://schema.org', '@type': 'WebPage', name: 'X' };
+ const html = `
+
+
+ J
+
+
+ `;
+ const dom = mkDom(html);
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+
+ expect(meta.jsonLd).toHaveLength(1);
+ const first = meta.jsonLd[0];
+ expect(first?.parsed).toEqual(data);
+ });
+
+ it('captures itemtype/itemscope (microdata) and prefix/vocab (RDFa) from ', async () => {
+ const html = `
+
+ M
+ `;
+ const dom = mkDom(html);
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+
+ expect(meta.microdata?.itemscope).toBe(true);
+ expect(meta.microdata?.itemtype).toBe('https://schema.org/WebPage');
+ expect(meta.rdfa?.prefix).toBe('og: https://ogp.me/ns#');
+ expect(meta.rdfa?.vocab).toBe('https://schema.org/');
+ expect(meta.rdfa?.typeOf).toBe('WebPage');
+ });
+
+ it('captures and