diff --git a/cspell.json b/cspell.json index f9806324..41e8cb66 100644 --- a/cspell.json +++ b/cspell.json @@ -40,6 +40,59 @@ "elif", // code-review effort level - "xhigh" + "xhigh", + + // HTML meta tag terms (frontmatter-keys.md) + "Fediverse", + "DCTERMS", + "dcterms", + "rdfa", + "Rdfa", + "noimageindex", + "noodp", + "noydir", + "indexifembedded", + "ahrefs", + "detectify", + "seznam", + "nositelinkssearchbox", + "nopagereadaloud", + "adsense", + "nopin", + "rels", + "gbfs", + "lrdd", + + // Wappalyzer (technology fingerprint library) + "wappalyzer", + "Wappalyzer", + + // Analytics / tag provider names + identifiers + "pintrk", + "optimizely", + "Optimizely", + "Tawk", + "tawk", + "grecaptcha", + "hjid", + "hjsv", + "fbevents", + "Matomo", + "matomo", + "sdkid", + "zdassets", + "cbid", + "XYZW", + + // Microsoft verification / DCTERMS keys + "msvalidate", + "conformsto", + "isformatof", + "ispartof", + "isreferencedby", + "isreplacedby", + "isrequiredby", + "isversionof", + "tableofcontents" ] } diff --git a/packages/@d-zero/beholder/package.json b/packages/@d-zero/beholder/package.json index 7c67c7d0..bd2712a1 100644 --- a/packages/@d-zero/beholder/package.json +++ b/packages/@d-zero/beholder/package.json @@ -23,7 +23,8 @@ "@d-zero/puppeteer-page-scan": "4.5.1", "@d-zero/shared": "0.22.0", "debug": "4.4.3", - "puppeteer": "24.37.5" + "puppeteer": "24.37.5", + "simple-wappalyzer": "1.1.99" }, "devDependencies": { "@types/debug": "4.1.12" diff --git a/packages/@d-zero/beholder/src/dom-evaluation.spec.ts b/packages/@d-zero/beholder/src/dom-evaluation.spec.ts index 0921e2cb..f71cead5 100644 --- a/packages/@d-zero/beholder/src/dom-evaluation.spec.ts +++ b/packages/@d-zero/beholder/src/dom-evaluation.spec.ts @@ -12,6 +12,7 @@ import { getMeta, getProp, } from './dom-evaluation.js'; +import { emptyMeta } from './meta/classify.js'; afterEach(() => { vi.useRealTimers(); @@ -41,68 +42,30 @@ function mockElementHandle(value: unknown): ElementHandle { } 
// Failure-path coverage for `getMeta`: both cases expect the `emptyMeta()`
// fallback instead of a thrown error or a hang.
//
// NOTE(review): this suite now exercises only the rejection and timeout paths.
// The previous success-path test ("maps raw evaluation result into a Meta
// object and parses robots directives") is removed by this change and nothing
// in this file replaces it — consider adding a happy-path case for the new
// Meta shape.
describe('getMeta', () => {
	it('returns emptyMeta() when page.evaluate rejects', async () => {
		// Minimal Page stand-in: `evaluate` rejects, while `content` resolves
		// with empty HTML.
		const page = {
			evaluate: () => Promise.reject(new Error('execution context destroyed')),
			content: () => Promise.resolve(''),
		} as unknown as Page;

		const meta = await getMeta(page, { url: 'https://example.com/' });

		expect(meta).toEqual(emptyMeta());
	});

	it('returns emptyMeta() when the main thread is unresponsive (timeout)', async () => {
		// Fake timers let the 5000 ms timeout elapse without real waiting.
		vi.useFakeTimers();
		const page = {
			// Never resolves — simulates a blocked main thread.
			evaluate: () => new Promise(() => {}),
			content: () => new Promise(() => {}),
		} as unknown as Page;

		const promise = getMeta(page, { url: 'https://example.com/' }, 5000);
		await vi.advanceTimersByTimeAsync(5000);
		const meta = await promise;

		expect(meta).toEqual(emptyMeta());
		// No timer may be left pending once the timeout path has completed.
		expect(vi.getTimerCount()).toBe(0);
	});
});
/**
 * Names of global identifiers associated with third-party tags (analytics,
 * advertising pixels, chat widgets, consent managers, payments, CAPTCHA).
 *
 * NOTE(review): the code that consumes this list is outside this view;
 * presumably each name is probed on `window` during the browser-side pass —
 * confirm against the caller. The vendor groupings below are inferred from
 * the identifier names and are a reading aid only.
 */
const WINDOW_GLOBALS_TO_CHECK: readonly string[] = [
	// Google tag / Analytics
	'dataLayer',
	'gtag',
	'ga',
	'_gaq',
	// Facebook Pixel
	'fbq',
	'_fbq',
	// Microsoft Clarity
	'clarity',
	// Hotjar
	'_hjSettings',
	'_hjid',
	// Advertising pixels (Twitter, TikTok, LinkedIn, Pinterest)
	'twq',
	'ttq',
	'_linkedin_partner_id',
	'pintrk',
	// Product analytics
	'amplitude',
	'mixpanel',
	'analytics',
	'heap',
	'posthog',
	'plausible',
	'fathom',
	// Matomo
	'_paq',
	// Adobe Analytics
	's_account',
	's',
	// Yandex Metrica
	'ym',
	// Bing Universal Event Tracking
	'UET',
	'optimizely',
	// presumably HubSpot — TODO confirm
	'_hsq',
	// Error monitoring and chat widgets
	'Sentry',
	'Intercom',
	'intercomSettings',
	'drift',
	'Tawk_API',
	// presumably Zendesk — TODO confirm
	'zE',
	// Consent managers
	'OneTrust',
	'Cookiebot',
	// Payments and CAPTCHA
	'Stripe',
	'grecaptcha',
];
Browser-side `collectHead()` serializes every ``, ``, + * relevant ``; + expect(extractIds('Google Analytics', html)).toContain('G-ABCD1234XY'); + }); + + it('extracts GA4 measurement ID from script src', () => { + const html = ``; + expect(extractIds('Google Analytics', html)).toContain('G-XYZW9876AB'); + }); + + it('extracts UA tracking ID', () => { + const html = ``; + expect(extractIds('Google Analytics', html)).toContain('UA-12345678-1'); + }); + + it('extracts GTM container ID from src and inline', () => { + const html = ` + + + `; + const ids = extractIds('Google Tag Manager', html); + expect(ids).toContain('GTM-ABCD123'); + expect(ids.length).toBe(1); + }); + + it('extracts Facebook Pixel ID from fbq init', () => { + const html = ``; + expect(extractIds('Facebook Pixel', html)).toContain('123456789012345'); + }); + + it('extracts Hotjar site ID from inline', () => { + const html = ``; + expect(extractIds('Hotjar', html)).toContain('1234567'); + }); + + it('extracts Microsoft Clarity project ID from src', () => { + const html = ``; + expect(extractIds('Microsoft Clarity', html)).toContain('abc123xyz'); + }); + + it('extracts TikTok pixel ID from ttq.load', () => { + const html = ``; + expect(extractIds('TikTok Pixel', html)).toContain('ABCDEFGH12345678'); + }); + + it('deduplicates IDs across multiple patterns', () => { + const html = ` + + + `; + const ids = extractIds('Google Analytics', html); + const dupCount = ids.filter((id) => id === 'G-DUP12345A').length; + expect(dupCount).toBe(1); + }); + + it('extracts Yandex Metrica counter ID from ym init', () => { + const html = ``; + expect(extractIds('Yandex Metrica', html)).toContain('12345678'); + }); +}); diff --git a/packages/@d-zero/beholder/src/meta/id-extractors.ts b/packages/@d-zero/beholder/src/meta/id-extractors.ts new file mode 100644 index 00000000..96a8ffa0 --- /dev/null +++ b/packages/@d-zero/beholder/src/meta/id-extractors.ts @@ -0,0 +1,206 @@ +/** + * Provider-specific real-ID extraction 
/**
 * Extraction rule for a single provider: the regular expressions that
 * `extractIds` runs over the page HTML to recover that provider's real
 * account / measurement ID.
 */
export type IdExtractor = {
	/**
	 * Each regex MUST contain at most one capturing group; the captured text
	 * becomes the ID. Patterns without a capturing group fall back to
	 * `match[0]`.
	 *
	 * Patterns are applied in array order and every match of every pattern is
	 * collected, with duplicates dropped. A pattern without the `g` flag is
	 * tolerated: `extractIds` re-creates it with `g` added before matching.
	 */
	readonly patterns: readonly RegExp[];
};
/**
 * Lookup table keyed by Wappalyzer provider name.
 *
 * Each entry lists the regexes used to pull that provider's real
 * account / measurement ID out of the page HTML. Keys must match the provider
 * names reported by the detection library exactly.
 *
 * When extending: keep regexes anchored on stable, high-signal substrings
 * (the surrounding API call or URL parameter, not just the bare ID character
 * class). Otherwise the same regex will hit unrelated strings on pages that
 * happen to share the shape (e.g., AWS ARNs containing `GA-...`).
 */
export const ID_EXTRACTORS: Record<string, IdExtractor> = {
	'Google Analytics': {
		patterns: [
			/gtag\(\s*['"]config['"]\s*,\s*['"](G-[A-Z0-9]{4,20})['"]/g,
			/googletagmanager\.com\/gtag\/js\?id=(G-[A-Z0-9]{4,20})/g,
			/\bga\(\s*['"]create['"]\s*,\s*['"](UA-\d{4,10}-\d{1,4})['"]/g,
			/['"](UA-\d{4,10}-\d{1,4})['"]/g,
		],
	},
	'Google Tag Manager': {
		patterns: [
			/googletagmanager\.com\/(?:gtm|ns)\.[a-z]+\?id=(GTM-[A-Z0-9]{4,12})/g,
			/['"](GTM-[A-Z0-9]{4,12})['"]/g,
		],
	},
	'Google Ads': {
		patterns: [/['"](AW-\d{4,12})['"]/g],
	},
	'Facebook Pixel': {
		patterns: [
			/fbq\(\s*['"]init['"]\s*,\s*['"](\d{6,20})['"]/g,
			/connect\.facebook\.net\/[^"']+\/fbevents\.js\D*(\d{6,20})/g,
		],
	},
	Hotjar: {
		patterns: [
			/hjid\s*[:=]\s*(\d{4,10})/g,
			/static\.hotjar\.com\/c\/hotjar-(\d{4,10})\.js/g,
		],
	},
	'Microsoft Clarity': {
		patterns: [
			/clarity\.ms\/tag\/([a-z0-9]{6,20})/g,
			/clarity\(\s*['"]start['"]\s*,\s*['"]([a-z0-9]{6,20})['"]/gi,
		],
	},
	Mixpanel: {
		patterns: [/mixpanel\.init\(\s*['"]([a-f0-9]{16,40})['"]/g],
	},
	Segment: {
		patterns: [
			/analytics\.load\(\s*['"]([a-zA-Z0-9]{8,40})['"]/g,
			/cdn\.segment\.com\/analytics\.js\/v1\/([a-zA-Z0-9]{8,40})/g,
		],
	},
	Amplitude: {
		patterns: [
			/amplitude\.init\(\s*['"]([a-f0-9]{16,40})['"]/g,
			/getInstance\(\)\.init\(\s*['"]([a-f0-9]{16,40})['"]/g,
		],
	},
	Heap: {
		patterns: [
			/heap\.load\(\s*['"](\d{6,20})['"]/g,
			/heap\.appid\s*=\s*['"](\d{6,20})['"]/g,
		],
	},
	PostHog: {
		patterns: [/posthog\.init\(\s*['"]([\w-]{16,80})['"]/g],
	},
	Plausible: {
		patterns: [/plausible\.io\/js\/script\.js[?&]domain=([a-zA-Z0-9.,-]+)/g],
	},
	Matomo: {
		patterns: [
			/_paq\.push\(\s*\[\s*['"]setSiteId['"]\s*,\s*['"]?(\d{1,6})['"]?\s*\]/g,
			/matomo\.php\?siteId=(\d{1,6})/g,
		],
	},
	'Adobe Analytics': {
		patterns: [
			/s_account\s*=\s*['"]([a-z0-9,]{3,50})['"]/gi,
			/s\.account\s*=\s*['"]([a-z0-9,]{3,50})['"]/gi,
		],
	},
	'Yandex Metrica': {
		patterns: [/ym\(\s*(\d{6,12})\s*,\s*['"]init['"]/g],
	},
	'LinkedIn Insight Tag': {
		patterns: [/_linkedin_partner_id\s*=\s*['"](\d{4,10})['"]/g],
	},
	'Twitter Ads': {
		patterns: [/twq\(\s*['"]config['"]\s*,\s*['"]([a-z0-9]{4,12})['"]/g],
	},
	'TikTok Pixel': {
		patterns: [
			/ttq\.load\(\s*['"]([A-Z0-9]{12,30})['"]/g,
			/tiktok\.com\/i18n\/pixel\/events\.js\?sdkid=([A-Z0-9]{12,30})/g,
		],
	},
	'Pinterest Tag': {
		patterns: [/pintrk\(\s*['"]load['"]\s*,\s*['"](\d{12,20})['"]/g],
	},
	'Bing Universal Event Tracking': {
		patterns: [
			/setAttribute\(\s*['"]data-tag['"]\s*,\s*['"](\d{6,20})['"]/g,
			/UET\(\{\s*ti:\s*['"](\d{6,20})['"]/g,
		],
	},
	Optimizely: {
		patterns: [/cdn\.optimizely\.com\/js\/(\d{6,20})\.js/g],
	},
	HubSpot: {
		patterns: [
			/js\.hs-?scripts\.com\/(\d{4,12})\.js/g,
			/js\.hubspot\.com\/web-interactives\/v1\/embeds\/(\d{4,12})/g,
		],
	},
	Sentry: {
		// The second pattern is a superset of the first; identical DSNs found
		// by both are collapsed by the caller's de-duplication.
		patterns: [
			/(https:\/\/[a-f0-9]+@[a-zA-Z0-9.-]+\.ingest\.sentry\.io\/\d+)/g,
			/(https:\/\/[a-f0-9]+@[a-zA-Z0-9.-]+\.sentry\.io\/\d+)/g,
		],
	},
	Intercom: {
		patterns: [
			/intercomSettings\s*=\s*\{[^}]*?app_id:\s*['"]([a-z0-9]{4,10})['"]/g,
			/Intercom\(\s*['"]boot['"]\s*,\s*\{[^}]*?app_id:\s*['"]([a-z0-9]{4,10})['"]/g,
		],
	},
	Drift: {
		patterns: [/drift\.load\(\s*['"]([a-z0-9]{6,30})['"]/g],
	},
	'Tawk.to': {
		patterns: [/embed\.tawk\.to\/([a-f0-9]{16,40})/g],
	},
	'Zendesk Chat': {
		patterns: [/static\.zdassets\.com\/ekr\/snippet\.js\?key=([a-f0-9-]{16,40})/g],
	},
	Cookiebot: {
		patterns: [/consent\.cookiebot\.com\/uc\.js[^"']*?cbid=([a-f0-9-]{16,40})/g],
	},
	OneTrust: {
		patterns: [/dataDomain['"=]\s*['"]?([a-z0-9-]{16,80})['"]?/gi],
	},
	Stripe: {
		// NOTE(review): no capturing group, so the value surfaced is the matched
		// URL fragment (e.g. `js.stripe.com/v3/`) via the `match[0]` fallback,
		// not an account identifier — confirm this presence-marker is intended.
		patterns: [/js\.stripe\.com\/v\d+\//g],
	},
	'Google reCAPTCHA': {
		// The site key is only accepted as the value of the `render` query
		// parameter (optionally preceded by other parameters, with `&` or the
		// HTML-escaped `&amp;` as separator). Previously `render=` was optional,
		// so any 20+ character word after `api.js` — e.g. a long `onload=`
		// callback name — was reported as a site key.
		patterns: [
			/google\.com\/recaptcha\/api\.js\?(?:[^"'\s]*?&(?:amp;)?)?render=([\w-]{20,60})/g,
		],
	},
	'Facebook for WordPress': {
		patterns: [/fbq\(\s*['"]init['"]\s*,\s*['"](\d{6,20})['"]/g],
	},
};
/**
 * Extracts real IDs for `provider` from the page HTML.
 *
 * Every pattern registered for the provider in `ID_EXTRACTORS` is applied to
 * the whole of `html`; for each match the first capturing group is taken as
 * the ID, falling back to the full match when the pattern has no group.
 *
 * Returns a de-duplicated, insertion-ordered list of IDs. Returns `[]` for
 * unknown providers (so callers can compose freely), including names that
 * only exist on `Object.prototype` such as `constructor` or `__proto__`.
 * @param provider - Provider name; must match a key of `ID_EXTRACTORS` exactly.
 * @param html - Page HTML to search.
 * @returns The distinct IDs found, in order of first appearance per pattern.
 */
export function extractIds(provider: string, html: string): string[] {
	// Own-property guard: a bare `ID_EXTRACTORS[provider]` also resolves
	// inherited keys ("constructor", "toString", "__proto__") to truthy values
	// that have no `patterns`, which made the loop below throw a TypeError.
	if (!Object.prototype.hasOwnProperty.call(ID_EXTRACTORS, provider)) return [];
	const extractor = ID_EXTRACTORS[provider];
	if (!extractor) return [];

	// A Set keeps first-insertion order, giving de-duplication and stable
	// ordering in one structure.
	const ids = new Set<string>();
	for (const pattern of extractor.patterns) {
		// `matchAll` throws on a non-global regex, so add `g` when it is missing.
		const globalPattern = pattern.global
			? pattern
			: new RegExp(pattern.source, `${pattern.flags}g`);
		for (const match of html.matchAll(globalPattern)) {
			const id = match[1] ?? match[0];
			// Skip empty matches; they carry no identifier.
			if (id) ids.add(id);
		}
	}
	return [...ids];
}
*/ +export type LinkRelDef = { + /** Dot-path under `Meta.link` (e.g., `'canonical'`, `'preload'`). */ + readonly path: string; + /** + * `'single'` keeps the first; `'href-only'` stores the href string only; + * `'array'` accumulates `LinkEntry[]`; `'icon-sized'` accumulates only when + * `sizes` is set. + */ + readonly cardinality: 'single' | 'href-only' | 'array' | 'icon-sized'; +}; + +/** `` → dot-path in `Meta`. */ +export const META_NAME_MAP: Record = { + 'application-name': { paths: ['applicationName'] }, + author: { paths: ['author'] }, + description: { paths: ['description'] }, + generator: { paths: ['generator'] }, + keywords: { paths: ['keywords'] }, + creator: { paths: ['creator'] }, + publisher: { paths: ['publisher'] }, + 'theme-color': { paths: ['themeColor'] }, + 'color-scheme': { paths: ['colorScheme'] }, + 'supported-color-schemes': { paths: ['supportedColorSchemes'] }, + googlebot: { paths: ['googlebot'] }, + 'googlebot-news': { paths: ['googlebotNews'] }, + 'googlebot-image': { paths: ['googlebotImage'] }, + 'googlebot-video': { paths: ['googlebotVideo'] }, + bingbot: { paths: ['bingbot'] }, + slurp: { paths: ['slurp'] }, + duckduckbot: { paths: ['duckduckbot'] }, + yandex: { paths: ['yandex'] }, + baiduspider: { paths: ['baiduspider'] }, + ia_archiver: { paths: ['iaArchiver'] }, + 'revisit-after': { paths: ['revisitAfter'] }, + rating: { paths: ['rating'] }, + distribution: { paths: ['distribution'] }, + classification: { paths: ['classification'] }, + category: { paths: ['category'] }, + subject: { paths: ['subject'] }, + topic: { paths: ['topic'] }, + summary: { paths: ['summary'] }, + abstract: { paths: ['abstract'] }, + audience: { paths: ['audience'] }, + target: { paths: ['target'] }, + copyright: { paths: ['copyright'] }, + designer: { paths: ['designer'] }, + owner: { paths: ['owner'] }, + 'reply-to': { paths: ['replyTo'] }, + contact: { paths: ['contact'] }, + 'identifier-url': { paths: ['identifierUrl'] }, + language: { paths: 
['language'] }, + revision: { paths: ['revision'] }, + build: { paths: ['build'] }, + version: { paths: ['version'] }, + handheldfriendly: { + paths: ['handheldFriendly', 'mobile.handheldFriendly', 'legacy.handheldFriendly'], + }, + mobileoptimized: { + paths: ['mobileOptimized', 'mobile.mobileOptimized', 'legacy.mobileOptimized'], + }, + 'mobile-web-app-capable': { paths: ['mobileWebAppCapable'] }, + 'application-url': { paths: ['applicationUrl'] }, + theme: { paths: ['theme'] }, + + // Apple iOS + 'apple-mobile-web-app-capable': { + paths: ['apple.mobileWebAppCapable'], + transform: 'boolean-yes', + }, + 'apple-mobile-web-app-status-bar-style': { + paths: ['apple.mobileWebAppStatusBarStyle'], + }, + 'apple-mobile-web-app-title': { paths: ['apple.mobileWebAppTitle'] }, + 'apple-touch-fullscreen': { + paths: ['apple.touchFullscreen'], + transform: 'boolean-yes', + }, + 'apple-itunes-app': { paths: ['apple.itunesApp'] }, + 'apple-mobile-web-app-orientations': { paths: ['apple.mobileWebAppOrientations'] }, + 'apple-touch-icon-title': { paths: ['apple.touchIconTitle'] }, + 'apple-touch-startup-image': { paths: ['apple.touchStartupImage'] }, + + // Microsoft + 'msapplication-tilecolor': { paths: ['msapplication.tileColor'] }, + 'msapplication-tileimage': { paths: ['msapplication.tileImage'] }, + 'msapplication-config': { paths: ['msapplication.config', 'msapplication.configFile'] }, + 'msapplication-navbutton-color': { paths: ['msapplication.navbuttonColor'] }, + 'msapplication-square70x70logo': { paths: ['msapplication.square70x70logo'] }, + 'msapplication-square150x150logo': { paths: ['msapplication.square150x150logo'] }, + 'msapplication-square310x310logo': { paths: ['msapplication.square310x310logo'] }, + 'msapplication-wide310x150logo': { paths: ['msapplication.wide310x150logo'] }, + 'msapplication-starturl': { paths: ['msapplication.starturl'] }, + 'msapplication-window': { paths: ['msapplication.window'] }, + 'msapplication-task': { paths: 
['msapplication.task'], multi: true }, + 'msapplication-task-separator': { paths: ['msapplication.taskSeparator'] }, + 'msapplication-tooltip': { paths: ['msapplication.tooltip'] }, + 'msapplication-notification': { paths: ['msapplication.notification'] }, + 'msapplication-badge': { paths: ['msapplication.badge'] }, + 'msapplication-tap-highlight': { paths: ['msapplication.tapHighlight'] }, + 'msapplication-allowdomainapicalls': { paths: ['msapplication.allowDomainApiCalls'] }, + 'msapplication-allowdomainmetatags': { paths: ['msapplication.allowDomainMetaTags'] }, + mssmarttagspreventparsing: { + paths: ['msapplication.smartTagsPreventParsing', 'legacy.msSmartTagsPreventParsing'], + }, + ie_rm_off: { paths: ['msapplication.ieRmOff'] }, + + // Verification + 'google-site-verification': { paths: ['verification.google'] }, + 'msvalidate.01': { paths: ['verification.bing'] }, + 'yandex-verification': { paths: ['verification.yandex'] }, + 'baidu-site-verification': { paths: ['verification.baidu'] }, + 'naver-site-verification': { paths: ['verification.naver'] }, + 'p:domain_verify': { paths: ['verification.pinterest'] }, + 'facebook-domain-verification': { paths: ['verification.facebook'] }, + alexaverifyid: { paths: ['verification.alexa'] }, + 'norton-safeweb-site-verification': { paths: ['verification.norton'] }, + 'ahrefs-site-verification': { paths: ['verification.ahrefs'] }, + 'detectify-verification': { paths: ['verification.detectify'] }, + 'zoho-verification': { paths: ['verification.zoho'] }, + 'wot-verification': { paths: ['verification.wot'] }, + 'seznam-wmt': { paths: ['verification.seznam'] }, + 'shopify-checkout-api-token': { paths: ['verification.shopify'] }, + 'brave-rewards-verification': { paths: ['verification.brave'] }, + + // Google-specific + 'google-translate-customization': { paths: ['google.translateCustomization'] }, + 'google-adsense-account': { paths: ['google.adsenseAccount'] }, + 'google-play-app': { paths: ['google.playApp'] }, + + // 
Dublin Core + 'dc.title': { paths: ['dc.title'] }, + 'dc.creator': { paths: ['dc.creator'] }, + 'dc.subject': { paths: ['dc.subject'] }, + 'dc.description': { paths: ['dc.description'] }, + 'dc.publisher': { paths: ['dc.publisher'] }, + 'dc.contributor': { paths: ['dc.contributor'] }, + 'dc.date': { paths: ['dc.date'] }, + 'dc.type': { paths: ['dc.type'] }, + 'dc.format': { paths: ['dc.format'] }, + 'dc.identifier': { paths: ['dc.identifier'] }, + 'dc.source': { paths: ['dc.source'] }, + 'dc.language': { paths: ['dc.language'] }, + 'dc.relation': { paths: ['dc.relation'] }, + 'dc.coverage': { paths: ['dc.coverage'] }, + 'dc.rights': { paths: ['dc.rights'] }, + + // DC Terms + 'dcterms.abstract': { paths: ['dcterms.abstract'] }, + 'dcterms.accessrights': { paths: ['dcterms.accessRights'] }, + 'dcterms.accrualmethod': { paths: ['dcterms.accrualMethod'] }, + 'dcterms.accrualperiodicity': { paths: ['dcterms.accrualPeriodicity'] }, + 'dcterms.accrualpolicy': { paths: ['dcterms.accrualPolicy'] }, + 'dcterms.alternative': { paths: ['dcterms.alternative'] }, + 'dcterms.audience': { paths: ['dcterms.audience'] }, + 'dcterms.available': { paths: ['dcterms.available'] }, + 'dcterms.bibliographiccitation': { paths: ['dcterms.bibliographicCitation'] }, + 'dcterms.conformsto': { paths: ['dcterms.conformsTo'] }, + 'dcterms.created': { paths: ['dcterms.created'] }, + 'dcterms.dateaccepted': { paths: ['dcterms.dateAccepted'] }, + 'dcterms.datecopyrighted': { paths: ['dcterms.dateCopyrighted'] }, + 'dcterms.datesubmitted': { paths: ['dcterms.dateSubmitted'] }, + 'dcterms.educationlevel': { paths: ['dcterms.educationLevel'] }, + 'dcterms.extent': { paths: ['dcterms.extent'] }, + 'dcterms.hasformat': { paths: ['dcterms.hasFormat'] }, + 'dcterms.haspart': { paths: ['dcterms.hasPart'] }, + 'dcterms.hasversion': { paths: ['dcterms.hasVersion'] }, + 'dcterms.instructionalmethod': { paths: ['dcterms.instructionalMethod'] }, + 'dcterms.isformatof': { paths: ['dcterms.isFormatOf'] }, + 
'dcterms.ispartof': { paths: ['dcterms.isPartOf'] }, + 'dcterms.isreferencedby': { paths: ['dcterms.isReferencedBy'] }, + 'dcterms.isreplacedby': { paths: ['dcterms.isReplacedBy'] }, + 'dcterms.isrequiredby': { paths: ['dcterms.isRequiredBy'] }, + 'dcterms.issued': { paths: ['dcterms.issued'] }, + 'dcterms.isversionof': { paths: ['dcterms.isVersionOf'] }, + 'dcterms.license': { paths: ['dcterms.license'] }, + 'dcterms.mediator': { paths: ['dcterms.mediator'] }, + 'dcterms.medium': { paths: ['dcterms.medium'] }, + 'dcterms.modified': { paths: ['dcterms.modified'] }, + 'dcterms.provenance': { paths: ['dcterms.provenance'] }, + 'dcterms.references': { paths: ['dcterms.references'] }, + 'dcterms.replaces': { paths: ['dcterms.replaces'] }, + 'dcterms.requires': { paths: ['dcterms.requires'] }, + 'dcterms.rightsholder': { paths: ['dcterms.rightsHolder'] }, + 'dcterms.spatial': { paths: ['dcterms.spatial'] }, + 'dcterms.tableofcontents': { paths: ['dcterms.tableOfContents'] }, + 'dcterms.temporal': { paths: ['dcterms.temporal'] }, + 'dcterms.valid': { paths: ['dcterms.valid'] }, + + // Geo + 'geo.region': { paths: ['geo.region'] }, + 'geo.placename': { paths: ['geo.placename'] }, + 'geo.position': { paths: ['geo.position'] }, + 'geo.country': { paths: ['geo.country'] }, + 'geo.a1': { paths: ['geo.a1'] }, + 'geo.a2': { paths: ['geo.a2'] }, + 'geo.a3': { paths: ['geo.a3'] }, + 'geo.lmk': { paths: ['geo.lmk'] }, + icbm: { paths: ['icbm'] }, + + // Citation + citation_title: { paths: ['citation.title'] }, + citation_author: { paths: ['citation.author'], multi: true }, + citation_author_email: { paths: ['citation.authorEmail'], multi: true }, + citation_author_institution: { paths: ['citation.authorInstitution'], multi: true }, + citation_publication_date: { paths: ['citation.publicationDate'] }, + citation_date: { paths: ['citation.date'] }, + citation_journal_title: { paths: ['citation.journalTitle'] }, + citation_journal_abbrev: { paths: ['citation.journalAbbrev'] }, + 
citation_conference_title: { paths: ['citation.conferenceTitle'] }, + citation_publisher: { paths: ['citation.publisher'] }, + citation_volume: { paths: ['citation.volume'] }, + citation_issue: { paths: ['citation.issue'] }, + citation_firstpage: { paths: ['citation.firstpage'] }, + citation_lastpage: { paths: ['citation.lastpage'] }, + citation_doi: { paths: ['citation.doi'] }, + citation_isbn: { paths: ['citation.isbn'] }, + citation_issn: { paths: ['citation.issn'] }, + citation_language: { paths: ['citation.language'] }, + citation_keywords: { paths: ['citation.keywords'] }, + citation_pdf_url: { paths: ['citation.pdfUrl'] }, + citation_fulltext_html_url: { paths: ['citation.fulltextHtmlUrl'] }, + citation_dissertation_institution: { paths: ['citation.dissertationInstitution'] }, + citation_technical_report_institution: { + paths: ['citation.technicalReportInstitution'], + }, + citation_technical_report_number: { paths: ['citation.technicalReportNumber'] }, + + // CSRF + 'csrf-param': { paths: ['csrfParam'] }, + 'csrf-token': { paths: ['csrfToken'] }, + + // Misc + 'go-import': { paths: ['goImport'] }, + bitcoin: { paths: ['bitcoin'] }, + 'origin-trial': { paths: ['originTrial'], multi: true }, + monetization: { paths: ['monetization'] }, + 'payment-pointer': { paths: ['paymentPointer'] }, + 'amp-experiments-opt-in': { paths: ['ampExperimentsOptIn', 'amp.experimentsOptIn'] }, + 'amp-google-client-id-api': { paths: ['ampGoogleClientIdApi'] }, + + // Pinterest + 'pinterest-rich-pin': { paths: ['pinterest.richPin'], transform: 'boolean-true' }, + pinterest: { paths: ['pinterest.nopin'], transform: 'boolean-true' }, + + // Legacy + imagetoolbar: { paths: ['legacy.imagetoolbar'] }, + 'page-version': { paths: ['legacy.pageVersion'] }, + 'resource-type': { paths: ['legacy.resourceType'] }, + 'doc-class': { paths: ['legacy.docClass'] }, + 'doc-rights': { paths: ['legacy.docRights'] }, + 'doc-type': { paths: ['legacy.docType'] }, + + // Mobile-specific + 'mobile-agent': 
{ paths: ['mobile.mobileAgent'] }, + 'full-screen': { paths: ['mobile.fullScreen'] }, + browsermode: { paths: ['mobile.browsermode'] }, + 'x5-orientation': { paths: ['mobile.x5Orientation'] }, + 'x5-fullscreen': { paths: ['mobile.x5Fullscreen'] }, + 'x5-page-mode': { paths: ['mobile.x5PageMode'] }, + 'screen-orientation': { paths: ['mobile.screenOrientation'] }, + layoutmode: { paths: ['mobile.layoutmode'] }, + imagemode: { paths: ['mobile.imagemode'] }, + + // Twitter Cards (treated as name in HTML even though logically property-like) + 'twitter:card': { paths: ['twitter.card'] }, + 'twitter:site': { paths: ['twitter.site'] }, + 'twitter:site:id': { paths: ['twitter.siteId'] }, + 'twitter:creator': { paths: ['twitter.creator'] }, + 'twitter:creator:id': { paths: ['twitter.creatorId'] }, + 'twitter:title': { paths: ['twitter.title'] }, + 'twitter:description': { paths: ['twitter.description'] }, + 'twitter:image': { paths: ['twitter.image'] }, + 'twitter:image:src': { paths: ['twitter.imageSrc'] }, + 'twitter:image:alt': { paths: ['twitter.imageAlt'] }, + 'twitter:image:width': { paths: ['twitter.imageWidth'] }, + 'twitter:image:height': { paths: ['twitter.imageHeight'] }, + 'twitter:url': { paths: ['twitter.url'] }, + 'twitter:domain': { paths: ['twitter.domain'] }, + 'twitter:player': { paths: ['twitter.player'] }, + 'twitter:player:width': { paths: ['twitter.playerWidth'] }, + 'twitter:player:height': { paths: ['twitter.playerHeight'] }, + 'twitter:player:stream': { paths: ['twitter.playerStream'] }, + 'twitter:player:stream:content_type': { paths: ['twitter.playerStreamContentType'] }, + 'twitter:app:name:iphone': { paths: ['twitter.appNameIphone'] }, + 'twitter:app:id:iphone': { paths: ['twitter.appIdIphone'] }, + 'twitter:app:url:iphone': { paths: ['twitter.appUrlIphone'] }, + 'twitter:app:name:ipad': { paths: ['twitter.appNameIpad'] }, + 'twitter:app:id:ipad': { paths: ['twitter.appIdIpad'] }, + 'twitter:app:url:ipad': { paths: ['twitter.appUrlIpad'] }, + 
'twitter:app:name:googleplay': { paths: ['twitter.appNameGoogleplay'] }, + 'twitter:app:id:googleplay': { paths: ['twitter.appIdGoogleplay'] }, + 'twitter:app:url:googleplay': { paths: ['twitter.appUrlGoogleplay'] }, + 'twitter:app:country': { paths: ['twitter.appCountry'] }, + 'twitter:label1': { paths: ['twitter.label1'] }, + 'twitter:data1': { paths: ['twitter.data1'] }, + 'twitter:label2': { paths: ['twitter.label2'] }, + 'twitter:data2': { paths: ['twitter.data2'] }, + 'twitter:widgets:csp': { paths: ['twitter.widgetsCsp'] }, + 'twitter:widgets:new-embed-design': { paths: ['twitter.widgetsNewEmbedDesign'] }, + 'twitter:dnt': { paths: ['twitter.dnt'] }, + + // Experimental / vendor + 'darkreader-lock': { + paths: ['experimental.darkreaderLock'], + transform: 'boolean-true', + }, + 'turbo-cache-control': { paths: ['experimental.turboCacheControl'] }, + 'turbo-visit-control': { paths: ['experimental.turboVisitControl'] }, + 'view-transition': { paths: ['experimental.viewTransition'] }, + + // Wiki + resourceloaderdynamicstyles: { paths: ['wiki.resourceLoaderDynamicStyles'] }, +}; + +/** `` → dot-path in `Meta`. 
*/ +export const META_PROPERTY_MAP: Record = { + 'og:title': { paths: ['og.title'] }, + 'og:type': { paths: ['og.type'] }, + 'og:url': { paths: ['og.url'] }, + 'og:site_name': { paths: ['og.siteName'] }, + 'og:description': { paths: ['og.description'] }, + 'og:determiner': { paths: ['og.determiner'] }, + 'og:locale': { paths: ['og.locale'] }, + 'og:locale:alternate': { paths: ['og.localeAlternate'], multi: true }, + + 'og:image': { paths: ['og.image'], multi: true }, + 'og:image:url': { paths: ['og.imageUrl'] }, + 'og:image:secure_url': { paths: ['og.imageSecureUrl'] }, + 'og:image:type': { paths: ['og.imageType'] }, + 'og:image:width': { paths: ['og.imageWidth'] }, + 'og:image:height': { paths: ['og.imageHeight'] }, + 'og:image:alt': { paths: ['og.imageAlt'] }, + + 'og:video': { paths: ['og.video'], multi: true }, + 'og:video:url': { paths: ['og.videoUrl'] }, + 'og:video:secure_url': { paths: ['og.videoSecureUrl'] }, + 'og:video:type': { paths: ['og.videoType'] }, + 'og:video:width': { paths: ['og.videoWidth'] }, + 'og:video:height': { paths: ['og.videoHeight'] }, + 'og:video:alt': { paths: ['og.videoAlt'] }, + + 'og:audio': { paths: ['og.audio'], multi: true }, + 'og:audio:url': { paths: ['og.audioUrl'] }, + 'og:audio:secure_url': { paths: ['og.audioSecureUrl'] }, + 'og:audio:type': { paths: ['og.audioType'] }, + + 'article:published_time': { paths: ['og.article.publishedTime'] }, + 'article:modified_time': { paths: ['og.article.modifiedTime'] }, + 'article:expiration_time': { paths: ['og.article.expirationTime'] }, + 'article:author': { paths: ['og.article.author'], multi: true }, + 'article:section': { paths: ['og.article.section'] }, + 'article:tag': { paths: ['og.article.tag'], multi: true }, + 'article:publisher': { paths: ['og.article.publisher'] }, + + 'book:author': { paths: ['og.book.author'], multi: true }, + 'book:isbn': { paths: ['og.book.isbn'] }, + 'book:release_date': { paths: ['og.book.releaseDate'] }, + 'book:tag': { paths: ['og.book.tag'], 
multi: true }, + + 'profile:first_name': { paths: ['og.profile.firstName'] }, + 'profile:last_name': { paths: ['og.profile.lastName'] }, + 'profile:username': { paths: ['og.profile.username'] }, + 'profile:gender': { paths: ['og.profile.gender'] }, + + 'music:duration': { paths: ['og.music.duration'] }, + 'music:album': { paths: ['og.music.album'], multi: true }, + 'music:album:disc': { paths: ['og.music.albumDisc'] }, + 'music:album:track': { paths: ['og.music.albumTrack'] }, + 'music:musician': { paths: ['og.music.musician'], multi: true }, + 'music:song': { paths: ['og.music.song'], multi: true }, + 'music:song:disc': { paths: ['og.music.songDisc'] }, + 'music:song:track': { paths: ['og.music.songTrack'] }, + 'music:release_date': { paths: ['og.music.releaseDate'] }, + 'music:creator': { paths: ['og.music.creator'], multi: true }, + + 'video:actor': { paths: ['og.videoNs.actor'], multi: true }, + 'video:actor:role': { paths: ['og.videoNs.actorRole'] }, + 'video:director': { paths: ['og.videoNs.director'], multi: true }, + 'video:writer': { paths: ['og.videoNs.writer'], multi: true }, + 'video:duration': { paths: ['og.videoNs.duration'] }, + 'video:release_date': { paths: ['og.videoNs.releaseDate'] }, + 'video:tag': { paths: ['og.videoNs.tag'], multi: true }, + 'video:series': { paths: ['og.videoNs.series'] }, + + 'fb:app_id': { paths: ['fb.appId'] }, + 'fb:admins': { paths: ['fb.admins'], multi: true }, + 'fb:pages': { paths: ['fb.pages'], multi: true }, + + 'fediverse:creator': { paths: ['fediverse.creator'] }, +}; + +/** `` → dot-path in `Meta.httpEquiv`. 
*/ +export const HTTP_EQUIV_MAP: Record = { + 'content-type': { paths: ['httpEquiv.contentType'] }, + 'content-language': { paths: ['httpEquiv.contentLanguage'] }, + 'default-style': { paths: ['httpEquiv.defaultStyle'] }, + refresh: { paths: ['httpEquiv.refresh'] }, + 'x-ua-compatible': { paths: ['httpEquiv.xUaCompatible'] }, + 'content-security-policy': { paths: ['httpEquiv.contentSecurityPolicy'] }, + 'content-security-policy-report-only': { + paths: ['httpEquiv.contentSecurityPolicyReportOnly'], + }, + 'set-cookie': { paths: ['httpEquiv.setCookie'] }, + pragma: { paths: ['httpEquiv.pragma'] }, + 'cache-control': { paths: ['httpEquiv.cacheControl'] }, + expires: { paths: ['httpEquiv.expires'] }, + 'accept-ch': { paths: ['httpEquiv.acceptCh'] }, + 'delegate-ch': { paths: ['httpEquiv.delegateCh'] }, + 'permissions-policy': { + paths: ['httpEquiv.permissionsPolicy', 'httpEquiv.permissionsPolicyValue'], + }, + 'origin-trial': { + paths: ['httpEquiv.originTrial', 'httpEquiv.originTrialToken'], + multi: true, + }, + 'x-dns-prefetch-control': { paths: ['httpEquiv.xDnsPrefetchControl'] }, + 'window-target': { paths: ['httpEquiv.windowTarget'] }, + imagetoolbar: { paths: ['httpEquiv.imagetoolbar'] }, + cleartype: { paths: ['httpEquiv.cleartype', 'msapplication.cleartype'] }, +}; + +/** `` → dot-path in `Meta.itemprop`. */ +export const ITEMPROP_MAP: Record = { + name: { paths: ['itemprop.name'] }, + description: { paths: ['itemprop.description'] }, + image: { paths: ['itemprop.image'] }, +}; + +/** `` → dot-path in `Meta.link`. 
*/ +export const LINK_REL_MAP: Record = { + canonical: { path: 'canonical', cardinality: 'href-only' }, + alternate: { path: 'alternateHreflang', cardinality: 'array' }, + amphtml: { path: 'amphtml', cardinality: 'href-only' }, + author: { path: 'author', cardinality: 'href-only' }, + bookmark: { path: 'bookmark', cardinality: 'href-only' }, + help: { path: 'help', cardinality: 'href-only' }, + license: { path: 'license', cardinality: 'href-only' }, + next: { path: 'next', cardinality: 'href-only' }, + prev: { path: 'prev', cardinality: 'href-only' }, + previous: { path: 'previous', cardinality: 'href-only' }, + first: { path: 'first', cardinality: 'href-only' }, + last: { path: 'last', cardinality: 'href-only' }, + up: { path: 'up', cardinality: 'href-only' }, + index: { path: 'index', cardinality: 'href-only' }, + contents: { path: 'contents', cardinality: 'href-only' }, + start: { path: 'start', cardinality: 'href-only' }, + search: { path: 'search', cardinality: 'single' }, + tag: { path: 'tag', cardinality: 'array' }, + archives: { path: 'archives', cardinality: 'array' }, + publisher: { path: 'publisher', cardinality: 'href-only' }, + 'privacy-policy': { path: 'privacyPolicy', cardinality: 'href-only' }, + 'terms-of-service': { path: 'termsOfService', cardinality: 'href-only' }, + copyright: { path: 'copyright', cardinality: 'href-only' }, + appendix: { path: 'appendix', cardinality: 'array' }, + chapter: { path: 'chapter', cardinality: 'array' }, + section: { path: 'section', cardinality: 'array' }, + subsection: { path: 'subsection', cardinality: 'array' }, + glossary: { path: 'glossary', cardinality: 'href-only' }, + profile: { path: 'profile', cardinality: 'array' }, + edituri: { path: 'editUri', cardinality: 'href-only' }, + pingback: { path: 'pingback', cardinality: 'href-only' }, + webmention: { path: 'webmention', cardinality: 'href-only' }, + micropub: { path: 'micropub', cardinality: 'href-only' }, + microsub: { path: 'microsub', cardinality: 
'href-only' }, + me: { path: 'me', cardinality: 'array' }, + authorization_endpoint: { path: 'authorizationEndpoint', cardinality: 'href-only' }, + token_endpoint: { path: 'tokenEndpoint', cardinality: 'href-only' }, + 'indieauth-metadata': { path: 'indieauthMetadata', cardinality: 'href-only' }, + 'openid.server': { path: 'openidServer', cardinality: 'href-only' }, + 'openid.delegate': { path: 'openidDelegate', cardinality: 'href-only' }, + 'openid2.provider': { path: 'openid2Provider', cardinality: 'href-only' }, + 'openid2.local_id': { path: 'openid2LocalId', cardinality: 'href-only' }, + hub: { path: 'hub', cardinality: 'href-only' }, + self: { path: 'self', cardinality: 'href-only' }, + payment: { path: 'payment', cardinality: 'href-only' }, + enclosure: { path: 'enclosure', cardinality: 'array' }, + external: { path: 'external', cardinality: 'array' }, + nofollow: { path: 'nofollow', cardinality: 'array' }, + sponsored: { path: 'sponsored', cardinality: 'array' }, + ugc: { path: 'ugc', cardinality: 'array' }, + noopener: { path: 'noopener', cardinality: 'array' }, + noreferrer: { path: 'noreferrer', cardinality: 'array' }, + opener: { path: 'opener', cardinality: 'array' }, + image_src: { path: 'imageSrc', cardinality: 'href-only' }, + shortlink: { path: 'shortlink', cardinality: 'href-only' }, + 'dns-prefetch': { path: 'dnsPrefetch', cardinality: 'array' }, + preconnect: { path: 'preconnect', cardinality: 'array' }, + prefetch: { path: 'prefetch', cardinality: 'array' }, + prerender: { path: 'prerender', cardinality: 'array' }, + preload: { path: 'preload', cardinality: 'array' }, + modulepreload: { path: 'modulepreload', cardinality: 'array' }, + expect: { path: 'expect', cardinality: 'array' }, + stylesheet: { path: 'stylesheet', cardinality: 'array' }, + manifest: { path: 'manifest', cardinality: 'href-only' }, + serviceworker: { path: 'serviceworker', cardinality: 'href-only' }, + dpp: { path: 'dpp', cardinality: 'href-only' }, + gbfs: { path: 'gbfs', 
cardinality: 'href-only' }, + syndication: { path: 'syndication', cardinality: 'array' }, + 'api-catalog': { path: 'apiCatalog', cardinality: 'href-only' }, + memento: { path: 'memento', cardinality: 'href-only' }, + timegate: { path: 'timegate', cardinality: 'href-only' }, + timemap: { path: 'timemap', cardinality: 'href-only' }, + 'version-history': { path: 'versionHistory', cardinality: 'href-only' }, + 'latest-version': { path: 'latestVersion', cardinality: 'href-only' }, + 'predecessor-version': { path: 'predecessorVersion', cardinality: 'href-only' }, + 'successor-version': { path: 'successorVersion', cardinality: 'href-only' }, + 'working-copy': { path: 'workingCopy', cardinality: 'href-only' }, + 'working-copy-of': { path: 'workingCopyOf', cardinality: 'href-only' }, + describedby: { path: 'describedby', cardinality: 'href-only' }, + describes: { path: 'describes', cardinality: 'href-only' }, + via: { path: 'via', cardinality: 'href-only' }, + related: { path: 'related', cardinality: 'array' }, + 'cite-as': { path: 'citeAs', cardinality: 'href-only' }, + disclosure: { path: 'disclosure', cardinality: 'href-only' }, + status: { path: 'status', cardinality: 'href-only' }, + sunset: { path: 'sunset', cardinality: 'href-only' }, + deprecation: { path: 'deprecation', cardinality: 'href-only' }, + lrdd: { path: 'lrdd', cardinality: 'href-only' }, + hosts: { path: 'hosts', cardinality: 'href-only' }, + service: { path: 'service', cardinality: 'href-only' }, + 'service-desc': { path: 'serviceDesc', cardinality: 'href-only' }, + 'service-doc': { path: 'serviceDoc', cardinality: 'href-only' }, + 'service-meta': { path: 'serviceMeta', cardinality: 'href-only' }, + 'c2pa-manifest': { path: 'c2paManifest', cardinality: 'href-only' }, + 'compression-dictionary': { path: 'compressionDictionary', cardinality: 'href-only' }, + + icon: { path: 'icon', cardinality: 'single' }, + 'shortcut icon': { path: 'shortcutIcon', cardinality: 'href-only' }, + 'apple-touch-icon': { path: 
'appleTouchIcon', cardinality: 'single' }, + 'apple-touch-icon-precomposed': { + path: 'appleTouchIconPrecomposed', + cardinality: 'array', + }, + 'apple-touch-startup-image': { path: 'appleTouchStartupImage', cardinality: 'array' }, + 'mask-icon': { path: 'maskIcon', cardinality: 'single' }, + 'fluid-icon': { path: 'fluidIcon', cardinality: 'single' }, + + 'security.txt': { path: 'securityTxt', cardinality: 'href-only' }, +}; diff --git a/packages/@d-zero/beholder/src/meta/parsers.spec.ts b/packages/@d-zero/beholder/src/meta/parsers.spec.ts new file mode 100644 index 00000000..5c580b69 --- /dev/null +++ b/packages/@d-zero/beholder/src/meta/parsers.spec.ts @@ -0,0 +1,178 @@ +import { describe, expect, it } from 'vitest'; + +import { + capJsonLdContent, + JSON_LD_PER_ENTRY_LIMIT, + normalizeValue, + parseFormatDetection, + parseJsonLd, + parseRefresh, + parseReferrer, + parseRobots, + parseViewport, +} from './parsers.js'; + +describe('parseViewport', () => { + it('parses width=device-width and initial-scale', () => { + const result = parseViewport('width=device-width, initial-scale=1.0'); + expect(result.raw).toBe('width=device-width, initial-scale=1.0'); + expect(result.width).toBe('device-width'); + expect(result.initialScale).toBe(1); + }); + + it('parses user-scalable=no as boolean false', () => { + const result = parseViewport('user-scalable=no'); + expect(result.userScalable).toBe(false); + }); + + it('parses minimum-scale/maximum-scale as numbers', () => { + const result = parseViewport('minimum-scale=0.5, maximum-scale=2'); + expect(result.minimumScale).toBe(0.5); + expect(result.maximumScale).toBe(2); + }); + + it('preserves viewport-fit and interactive-widget literally', () => { + const result = parseViewport('viewport-fit=cover, interactive-widget=resizes-visual'); + expect(result.viewportFit).toBe('cover'); + expect(result.interactiveWidget).toBe('resizes-visual'); + }); + + it('keeps raw on unrecognizable input', () => { + const result = 
parseViewport('garbage'); + expect(result.raw).toBe('garbage'); + expect(result.width).toBeUndefined(); + }); +}); + +describe('parseRobots', () => { + it('flags noindex/nofollow/noarchive', () => { + const result = parseRobots('noindex, NOFOLLOW, noarchive'); + expect(result.noindex).toBe(true); + expect(result.nofollow).toBe(true); + expect(result.noarchive).toBe(true); + }); + + it('extracts max-snippet, max-image-preview, max-video-preview', () => { + const result = parseRobots( + 'max-snippet:50, max-image-preview:large, max-video-preview:120', + ); + expect(result.maxSnippet).toBe(50); + expect(result.maxImagePreview).toBe('large'); + expect(result.maxVideoPreview).toBe(120); + }); + + it('extracts unavailable_after', () => { + const result = parseRobots('unavailable_after:2026-12-31'); + expect(result.unavailableAfter).toBe('2026-12-31'); + }); + + it('flags index/follow positives', () => { + const result = parseRobots('index, follow'); + expect(result.index).toBe(true); + expect(result.follow).toBe(true); + }); +}); + +describe('parseReferrer', () => { + it('flags strict-origin-when-cross-origin', () => { + const result = parseReferrer('strict-origin-when-cross-origin'); + expect(result.strictOriginWhenCrossOrigin).toBe(true); + }); + + it('flags no-referrer', () => { + const result = parseReferrer('no-referrer'); + expect(result.noReferrer).toBe(true); + }); +}); + +describe('parseFormatDetection', () => { + it('parses telephone=no, address=no', () => { + const result = parseFormatDetection('telephone=no, address=no'); + expect(result.telephone).toBe(false); + expect(result.address).toBe(false); + }); + + it('parses date=no via semicolon separator', () => { + const result = parseFormatDetection('telephone=no; date=no'); + expect(result.telephone).toBe(false); + expect(result.date).toBe(false); + }); +}); + +describe('parseRefresh', () => { + it('parses seconds and url', () => { + const result = parseRefresh('5; url=https://example.com/'); + 
expect(result.seconds).toBe(5); + expect(result.url).toBe('https://example.com/'); + }); + + it('handles missing url', () => { + const result = parseRefresh('30'); + expect(result.seconds).toBe(30); + expect(result.url).toBeUndefined(); + }); + + it('strips surrounding quotes in url', () => { + const result = parseRefresh(`0; url='https://example.com/'`); + expect(result.url).toBe('https://example.com/'); + }); +}); + +describe('parseJsonLd', () => { + it('returns parsed object on valid JSON', () => { + const entry = parseJsonLd('{"@type":"WebSite","name":"Site"}'); + expect(entry.parsed).toEqual({ '@type': 'WebSite', name: 'Site' }); + expect(entry.parseError).toBeUndefined(); + }); + + it('records parseError on invalid JSON', () => { + const entry = parseJsonLd('{ not valid }'); + expect(entry.parsed).toBeUndefined(); + expect(entry.parseError).toBeDefined(); + }); +}); + +describe('normalizeValue', () => { + it('passes through string by default', () => { + expect(normalizeValue('hello')).toBe('hello'); + expect(normalizeValue('hello', 'string')).toBe('hello'); + }); + + it('boolean-yes maps yes/no', () => { + expect(normalizeValue('yes', 'boolean-yes')).toBe(true); + expect(normalizeValue('no', 'boolean-yes')).toBe(false); + expect(normalizeValue('unknown', 'boolean-yes')).toBe('unknown'); + }); + + it('boolean-on maps on/off/true/false/1/0', () => { + expect(normalizeValue('on', 'boolean-on')).toBe(true); + expect(normalizeValue('off', 'boolean-on')).toBe(false); + expect(normalizeValue('true', 'boolean-on')).toBe(true); + expect(normalizeValue('0', 'boolean-on')).toBe(false); + }); + + it('boolean-true maps true/false only', () => { + expect(normalizeValue('true', 'boolean-true')).toBe(true); + expect(normalizeValue('false', 'boolean-true')).toBe(false); + expect(normalizeValue('1', 'boolean-true')).toBe('1'); + }); + + it('number parses floats and falls back to raw', () => { + expect(normalizeValue('3.14', 'number')).toBe(3.14); + 
expect(normalizeValue('NaN-ish', 'number')).toBe('NaN-ish'); + }); +}); + +describe('capJsonLdContent', () => { + it('returns content unchanged when under the limit', () => { + const result = capJsonLdContent('{}'); + expect(result).toEqual({ content: '{}', truncated: false }); + }); + + it('truncates content over the per-entry limit', () => { + const big = 'a'.repeat(JSON_LD_PER_ENTRY_LIMIT + 100); + const result = capJsonLdContent(big); + expect(result.truncated).toBe(true); + expect(result.content.length).toBe(JSON_LD_PER_ENTRY_LIMIT); + }); +}); diff --git a/packages/@d-zero/beholder/src/meta/parsers.ts b/packages/@d-zero/beholder/src/meta/parsers.ts new file mode 100644 index 00000000..07ef920c --- /dev/null +++ b/packages/@d-zero/beholder/src/meta/parsers.ts @@ -0,0 +1,304 @@ +/** + * Value normalizers used by `classify()` to turn raw `content` strings into + * structured objects (viewport, robots, format-detection, etc.). + * + * Each parser is a pure function that takes the raw `content` string and + * returns a normalized structure. They never throw; on unrecognizable input + * they fall back to keeping the `raw` field only. + * @module + */ + +import type { KeyTransform } from './keys.js'; +import type { + FormatDetectionMeta, + HttpEquivRefresh, + JsonLdEntry, + ReferrerMeta, + RobotsMeta, + ViewportMeta, +} from './types.js'; + +/** + * Parses `` content into a structured `ViewportMeta`. + * @param raw + * @example parseViewport('width=device-width, initial-scale=1.0') + * → { raw: '...', width: 'device-width', initialScale: 1 } + */ +export function parseViewport(raw: string): ViewportMeta { + const meta: ViewportMeta = { raw }; + for (const part of raw.split(',')) { + const split = part.split('='); + const keyRaw = split[0] ?? ''; + const valueRaw = split[1] ?? 
''; + const key = keyRaw.trim().toLowerCase(); + const value = valueRaw.trim(); + if (!key) continue; + switch (key) { + case 'width': { + meta.width = value; + break; + } + case 'height': { + meta.height = value; + break; + } + case 'initial-scale': { + const n = Number.parseFloat(value); + if (!Number.isNaN(n)) meta.initialScale = n; + break; + } + case 'minimum-scale': { + const n = Number.parseFloat(value); + if (!Number.isNaN(n)) meta.minimumScale = n; + break; + } + case 'maximum-scale': { + const n = Number.parseFloat(value); + if (!Number.isNaN(n)) meta.maximumScale = n; + break; + } + case 'user-scalable': { + const lower = value.toLowerCase(); + if (lower === 'no' || lower === '0') meta.userScalable = false; + else if (lower === 'yes' || lower === '1') meta.userScalable = true; + else meta.userScalable = value; + break; + } + case 'viewport-fit': { + meta.viewportFit = value; + break; + } + case 'interactive-widget': { + meta.interactiveWidget = value; + break; + } + } + } + return meta; +} + +const ROBOTS_BOOLEAN_FLAGS = new Set([ + 'index', + 'noindex', + 'follow', + 'nofollow', + 'none', + 'all', + 'noarchive', + 'nosnippet', + 'noimageindex', + 'nocache', + 'notranslate', + 'noodp', + 'noydir', + 'indexifembedded', +]); + +/** + * Parses `` content into a structured `RobotsMeta`. 
+ * @param raw + * @example parseRobots('noindex, max-snippet:50, unavailable_after:2026-01-01') + * → { raw: '...', noindex: true, maxSnippet: 50, unavailableAfter: '2026-01-01' } + */ +export function parseRobots(raw: string): RobotsMeta { + const meta: RobotsMeta = { raw }; + for (const token of raw.split(',')) { + const trimmed = token.trim().toLowerCase(); + if (!trimmed) continue; + + if (ROBOTS_BOOLEAN_FLAGS.has(trimmed as keyof RobotsMeta)) { + (meta as Record)[trimmed] = true; + continue; + } + + const colonIndex = trimmed.indexOf(':'); + if (colonIndex === -1) { + continue; + } + const key = trimmed.slice(0, colonIndex).trim(); + const value = token.slice(token.indexOf(':') + 1).trim(); + switch (key) { + case 'max-snippet': { + const n = Number.parseInt(value, 10); + if (!Number.isNaN(n)) meta.maxSnippet = n; + break; + } + case 'max-image-preview': { + meta.maxImagePreview = value; + break; + } + case 'max-video-preview': { + const n = Number.parseInt(value, 10); + if (!Number.isNaN(n)) meta.maxVideoPreview = n; + break; + } + case 'unavailable_after': + case 'unavailable-after': { + meta.unavailableAfter = value; + break; + } + } + } + return meta; +} + +const REFERRER_POLICY_KEYS: Record = { + 'no-referrer': 'noReferrer', + origin: 'origin', + 'origin-when-cross-origin': 'originWhenCrossOrigin', + 'strict-origin': 'strictOrigin', + 'strict-origin-when-cross-origin': 'strictOriginWhenCrossOrigin', + 'unsafe-url': 'unsafeUrl', + 'same-origin': 'sameOrigin', + 'no-referrer-when-downgrade': 'noReferrerWhenDowngrade', +}; + +/** + * Parses `` content into a structured `ReferrerMeta`. + * @param raw + */ +export function parseReferrer(raw: string): ReferrerMeta { + const meta: ReferrerMeta = { raw }; + const key = REFERRER_POLICY_KEYS[raw.trim().toLowerCase()]; + if (key) { + (meta as Record)[key] = true; + } + return meta; +} + +/** + * Parses `` content (e.g. `'telephone=no, address=no'`). 
+ * @param raw + */ +export function parseFormatDetection(raw: string): FormatDetectionMeta { + const meta: FormatDetectionMeta = { raw }; + for (const part of raw.split(/[,;]/)) { + const split = part.split('='); + const keyRaw = split[0] ?? ''; + const valueRaw = split[1] ?? ''; + const key = keyRaw.trim().toLowerCase(); + const value = valueRaw.trim().toLowerCase(); + if (!key) continue; + const enabled = value !== 'no' && value !== 'false' && value !== '0'; + switch (key) { + case 'telephone': { + meta.telephone = enabled; + break; + } + case 'email': { + meta.email = enabled; + break; + } + case 'address': { + meta.address = enabled; + break; + } + case 'date': { + meta.date = enabled; + break; + } + } + } + return meta; +} + +/** + * Parses `` content (e.g. `'5; url=https://...'`). + * @param raw + */ +export function parseRefresh(raw: string): HttpEquivRefresh { + const refresh: HttpEquivRefresh = { raw }; + const split = raw.split(';'); + const secondsRaw = split[0] ?? ''; + const rest = split.slice(1).join(';'); + const seconds = Number.parseFloat(secondsRaw.trim()); + if (!Number.isNaN(seconds)) { + refresh.seconds = seconds; + } + const urlMatch = /url\s*=\s*(.+)/i.exec(rest); + if (urlMatch?.[1]) { + refresh.url = urlMatch[1].trim().replaceAll(/^['"]|['"]$/g, ''); + } + return refresh; +} + +/** + * Parses a ``; + const tags = assembleTagsMeta( + [ + { + name: 'Google Analytics', + categories: [{ name: 'Analytics' }], + }, + ], + html, + ); + expect(tags.detected.Analytics?.['Google Analytics']?.ids).toEqual([ + 'G-XYZ123', + 'G-AAA999', + ]); + const providerEntries = tags.entries.filter((e) => e.provider === 'Google Analytics'); + expect(providerEntries.map((e) => e.id)).toEqual(['G-XYZ123', 'G-AAA999']); + }); + + it('emits one entry without id when no IDs are extracted', () => { + const tags = assembleTagsMeta( + [ + { + name: 'jQuery', + version: '3.6.0', + categories: [{ name: 'JavaScript Libraries' }], + }, + ], + '', + ); + 
expect(tags.entries).toHaveLength(1); + expect(tags.entries[0]?.id).toBeUndefined(); + expect(tags.entries[0]?.version).toBe('3.6.0'); + }); + + it('falls back to "Other" category when no categories are present', () => { + const tags = assembleTagsMeta([{ name: 'Unknown', categories: [] }], ''); + expect(tags.detected['Other']?.['Unknown']).toBeDefined(); + }); + + it('skips detections without a name', () => { + const tags = assembleTagsMeta( + [{ name: '', categories: [{ name: 'Analytics' }] }], + '', + ); + expect(tags.entries).toHaveLength(0); + expect(Object.keys(tags.detected)).toHaveLength(0); + }); +}); + +describe('detectTags', () => { + it('falls back to empty TagsMeta when simple-wappalyzer throws', async () => { + wappalyzerMock.mockRejectedValueOnce(new Error('wappalyzer boom')); + const result = await detectTags({ + url: 'https://example.com/', + html: '', + }); + expect(result).toEqual(EMPTY_TAGS_META); + }); + + it('falls back to empty TagsMeta when simple-wappalyzer returns non-array', async () => { + wappalyzerMock.mockResolvedValueOnce(null as unknown as never); + const result = await detectTags({ + url: 'https://example.com/', + html: '', + }); + expect(result).toEqual(EMPTY_TAGS_META); + }); + + it('passes detections through assembleTagsMeta', async () => { + wappalyzerMock.mockResolvedValueOnce([ + { + name: 'Google Analytics', + version: 'GA4', + categories: [{ name: 'Analytics' }], + }, + ] as never); + const result = await detectTags({ + url: 'https://example.com/', + html: ``, + }); + expect(result.entries).toHaveLength(1); + expect(result.entries[0]?.id).toBe('G-XYZ123'); + }); + + it('normalizes headers to lowercase before calling wappalyzer', async () => { + wappalyzerMock.mockResolvedValueOnce([] as never); + await detectTags({ + url: 'https://example.com/', + html: '', + headers: { 'Content-Type': 'text/html', 'X-Custom': ['a', 'b'] }, + }); + const arg = wappalyzerMock.mock.calls.at(-1)?.[0] as { + headers?: Record; + }; + 
expect(arg?.headers?.['content-type']).toBe('text/html'); + expect(arg?.headers?.['x-custom']).toBe('a, b'); + }); +}); diff --git a/packages/@d-zero/beholder/src/meta/tag-detection.ts b/packages/@d-zero/beholder/src/meta/tag-detection.ts new file mode 100644 index 00000000..b7f9c330 --- /dev/null +++ b/packages/@d-zero/beholder/src/meta/tag-detection.ts @@ -0,0 +1,161 @@ +/** + * Third-party tag detection layer. + * + * Combines two signals to populate {@link TagsMeta}: + * 1. `simple-wappalyzer` runs over the page HTML + headers to identify + * the technologies present (and their Wappalyzer categories). + * 2. {@link extractIds} from `./id-extractors.js` finds the real account + * / measurement IDs (e.g. `G-XXXXXXXX`, `GTM-XXXXX`) for each detected + * provider. + * + * Returned shape is documented on {@link TagsMeta} in `./types.ts`. + * @module + */ + +import type { TagDetail, TagEntry, TagsMeta } from './types.js'; + +import wappalyzer from 'simple-wappalyzer'; + +import { domLog } from '../debug.js'; + +import { extractIds } from './id-extractors.js'; + +const log = domLog.extend(`${process.pid}`); + +/** + * Shape of a single technology entry returned by `simple-wappalyzer`. + * Mirrors the subset of fields we use; everything else is ignored. + */ +interface WappalyzerTech { + readonly name: string; + readonly version?: string; + readonly confidence?: number; + readonly categories?: ReadonlyArray<{ readonly name?: string; readonly id?: number }>; +} + +/** + * Inputs required to drive `simple-wappalyzer`. + * + * `headers` keys should be lowercase; `simple-wappalyzer` is case-insensitive + * but normalizing up front avoids ambiguity. + */ +export type DetectTagsInput = { + readonly url: string; + readonly html: string; + readonly statusCode?: number; + readonly headers?: Record; +}; + +const EMPTY_TAGS: TagsMeta = { detected: {}, entries: [] }; + +/** + * Drives `simple-wappalyzer` and post-processes the result with the + * provider-specific ID extractors. 
Failures fall back to an empty `TagsMeta` + * rather than throwing, so the caller does not need to wrap the call. + * @param input + */ +export async function detectTags(input: DetectTagsInput): Promise { + const headers = normalizeHeaders(input.headers); + let detections: WappalyzerTech[]; + try { + const result = (await wappalyzer({ + url: input.url, + html: input.html, + headers, + })) as unknown; + detections = Array.isArray(result) ? (result as WappalyzerTech[]) : []; + } catch (error) { + log( + 'detectTags: simple-wappalyzer failed; returning empty TagsMeta. Error: %O', + error, + ); + return cloneEmpty(); + } + return assembleTagsMeta(detections, input.html); +} + +/** + * Builds a `TagsMeta` from the raw `simple-wappalyzer` output and the page + * HTML used for ID extraction. + * + * Exported for unit tests that bypass `simple-wappalyzer` and feed + * pre-recorded detections directly. + * @param detections + * @param html + */ +export function assembleTagsMeta( + detections: readonly WappalyzerTech[], + html: string, +): TagsMeta { + const detected: Record> = {}; + const entries: TagEntry[] = []; + + for (const tech of detections) { + if (!tech.name) continue; + const ids = extractIds(tech.name, html); + const categories = + tech.categories + ?.map((c) => c.name) + .filter((name): name is string => typeof name === 'string') ?? []; + const detail: TagDetail = { + ids, + ...(tech.version === undefined ? {} : { version: tech.version }), + ...(tech.confidence === undefined ? {} : { confidence: tech.confidence }), + }; + for (const category of categories.length > 0 ? categories : ['Other']) { + if (detected[category] === undefined) { + detected[category] = {}; + } + detected[category][tech.name] = detail; + } + + const baseSources = [{ type: 'html' as const }]; + if (ids.length === 0) { + entries.push({ + provider: tech.name, + categories, + ...(tech.version === undefined ? {} : { version: tech.version }), + ...(tech.confidence === undefined ? 
{} : { confidence: tech.confidence }), + sources: baseSources, + }); + } else { + for (const id of ids) { + entries.push({ + provider: tech.name, + categories, + id, + ...(tech.version === undefined ? {} : { version: tech.version }), + ...(tech.confidence === undefined ? {} : { confidence: tech.confidence }), + sources: baseSources, + }); + } + } + } + + return { detected, entries }; +} + +/** + * + */ +function cloneEmpty(): TagsMeta { + return { detected: {}, entries: [] }; +} + +/** + * + * @param headers + */ +function normalizeHeaders(headers: DetectTagsInput['headers']): Record { + if (!headers) return {}; + const out: Record = {}; + for (const [key, value] of Object.entries(headers)) { + if (value === undefined) continue; + const flat = Array.isArray(value) ? value.join(', ') : value; + out[key.toLowerCase()] = flat; + } + return out; +} + +/** Singleton empty `TagsMeta` value (exported for tests). */ +export const EMPTY_TAGS_META = EMPTY_TAGS; diff --git a/packages/@d-zero/beholder/src/meta/types.ts b/packages/@d-zero/beholder/src/meta/types.ts new file mode 100644 index 00000000..6947633b --- /dev/null +++ b/packages/@d-zero/beholder/src/meta/types.ts @@ -0,0 +1,949 @@ +/** + * Type definitions for the `Meta` data extracted from a page's `` and full document. + * + * Structure follows the reference table in `frontmatter-keys.md`, with one dot-path + * field per category. Optional fields are absent when not detected on the page. + * Array fields are required and default to `[]` so consumers can iterate without + * null-checks. + * @see {@link ./classify.ts} for the function that builds `Meta` from raw head entries + * @see {@link ./parsers.ts} for the value normalizers used by `classify` + * @module + */ + +/** + * Top-level metadata extracted from a page's `` and surrounding markup + * (``, ``, `