From 625f2f1fa7c897ad155154ae312263c757f7091e Mon Sep 17 00:00:00 2001 From: khaiwang Date: Tue, 9 Jun 2026 14:42:18 -0400 Subject: [PATCH 1/2] fix(modeling): allow GPU-less meta-load of remote models that hard-import CUDA kernels Remote modeling code for some HuggingFace models (e.g. Nemotron-H / nvidia/NVIDIA-Nemotron-3-*) imports CUDA-only kernel packages such as mamba_ssm and causal_conv1d at module-import time. Those packages cannot be installed or imported without a CUDA toolchain, so a GPU-less nnsight/NDIF client could not even construct the meta model, and therefore could not build an intervention graph or run remotely. A meta model never executes a forward pass on the client (the forward runs on the GPU host), so these kernels are only imported, never called. Add meta_kernel_shim(), a context manager that registers inert stand-ins for the kernel packages around the meta from_config() -- only when they are not really installed, and removed immediately after. Each stub member raises if called, so a dispatched/real run can never silently use a stub instead of the real kernel. Also default trust_remote_code=True consistently across _load_config, _load_meta and _load so the config class, the meta intervention tree, and the dispatched/served model are all the same implementation. Previously only the meta from_config() forced trust_remote_code, which could build the client tree from a different implementation than the one dispatched, and left _load_config using a native config that cannot parse newer remote-only config fields (e.g. Nemotron-H hybrid_override_pattern block types). Verified: GPU-less meta-load now succeeds for Nemotron-3 4B and 120B; the meta tree built without kernels is byte-identical (same module/param signature) to the tree built with kernels; a real 4B load+trace matches plain-transformers logits and is byte-identical between this path and the prior CUDA path; interventions behave identically; gpt2 load/trace is unchanged. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/nnsight/modeling/_kernel_shim.py | 136 +++++++++++++++++++++++++++ src/nnsight/modeling/transformers.py | 26 ++++- 2 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 src/nnsight/modeling/_kernel_shim.py diff --git a/src/nnsight/modeling/_kernel_shim.py b/src/nnsight/modeling/_kernel_shim.py new file mode 100644 index 00000000..1ecfcbb6 --- /dev/null +++ b/src/nnsight/modeling/_kernel_shim.py @@ -0,0 +1,136 @@ +"""Meta-load shim for CUDA-only kernel packages. + +Some HuggingFace models ship custom remote modeling code (``trust_remote_code``) +that *hard-imports* CUDA/Triton kernel packages at module top level — e.g. +``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B`` does:: + + from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn # raises on CPU + +Those packages (``mamba_ssm``, ``causal_conv1d``) cannot even be *installed* +without a CUDA toolchain, so a GPU-less nnsight/NDIF client cannot import the +model class — which means it cannot build the meta module tree it needs to +compile an intervention graph, and therefore cannot use ``remote=True`` at all. + +On a meta-only client the model never runs a forward pass (the real forward +happens on the GPU host), so these kernels are never *called* — they are only +*imported*. This module installs inert stand-ins for the kernel packages around +the meta ``from_config`` call so the import resolves; every stand-in member +raises if it is ever actually invoked, so a dispatched/real run can never +silently use a stub instead of the real kernel. + +The shim is a **no-op when the real kernel is installed** (e.g. on a GPU host), +so it never shadows a genuine implementation. +""" + +from __future__ import annotations + +import contextlib +import importlib.machinery +import importlib.util +import os +import sys +import types +from typing import Dict, List + +# top-level package -> {fully-qualified submodule it exposes: [members to stub]}. 
+# Only the members that remote modeling files import *unconditionally* strictly +# need stubbing; the rest are harmless and make the shim robust across the +# several Mamba-family remote files that copy this import block. +_KERNEL_STUBS: Dict[str, Dict[str, List[str]]] = { + "mamba_ssm": { + "mamba_ssm.ops.triton.layernorm_gated": ["rmsnorm_fn"], + "mamba_ssm.ops.triton.selective_state_update": ["selective_state_update"], + "mamba_ssm.ops.triton.ssd_combined": [ + "mamba_chunk_scan_combined", + "mamba_split_conv1d_scan_combined", + ], + }, + "causal_conv1d": { + "causal_conv1d": ["causal_conv1d_fn", "causal_conv1d_update"], + }, +} + +# Set NNSIGHT_FORCE_META_KERNEL_SHIM=1 to install the stubs even when the real +# kernels are importable. Used to exercise the GPU-less client path on a machine +# that happens to have the kernels installed; not needed in normal operation. +_FORCE_ENV = "NNSIGHT_FORCE_META_KERNEL_SHIM" + + +def _make_stub_member(qualified_name: str): + def _stub(*args, **kwargs): + raise RuntimeError( + f"{qualified_name} is an nnsight meta-load stub standing in for a " + f"CUDA kernel that is not installed. It must not be called: build the " + f"model on a meta-only client (no forward), or dispatch it on a CUDA " + f"host with the real kernel package installed to actually run it." 
+ ) + + _stub.__name__ = qualified_name.rsplit(".", 1)[-1] + return _stub + + +def _register_module(name: str, is_package: bool, members: List[str], added: List[str]) -> types.ModuleType: + module = types.ModuleType(name) + spec = importlib.machinery.ModuleSpec(name, loader=None) + if is_package: + spec.submodule_search_locations = [] + module.__path__ = [] # marks it importable as a package + module.__spec__ = spec + for member in members: + setattr(module, member, _make_stub_member(f"{name}.{member}")) + sys.modules[name] = module + added.append(name) + return module + + +@contextlib.contextmanager +def meta_kernel_shim(force: bool | None = None): + """Temporarily satisfy CUDA-only kernel imports for meta construction. + + Installs lightweight stand-ins for any package in :data:`_KERNEL_STUBS` that + is not already importable, then removes them on exit. A no-op for packages + that are genuinely installed (unless ``force`` / the force env var is set). + """ + + if force is None: + force = os.environ.get(_FORCE_ENV) == "1" + + added: List[str] = [] + try: + for top, submodules in _KERNEL_STUBS.items(): + if top in sys.modules: + continue # already present (real or previously stubbed) + if not force and importlib.util.find_spec(top) is not None: + continue # real kernel installed -> use it, don't shadow + + # create every dotted prefix as a package, then the leaf submodules + packages_needed = set() + for full in submodules: + parts = full.split(".") + for i in range(1, len(parts)): + packages_needed.add(".".join(parts[:i])) + for pkg in sorted(packages_needed, key=lambda s: s.count(".")): + if pkg not in sys.modules: + # a pure-package prefix carries no members of its own unless + # it is also a declared leaf (handled below) + _register_module(pkg, is_package=True, members=[], added=added) + for full, members in submodules.items(): + is_pkg = full in packages_needed # leaf that is also a package prefix + if full in sys.modules: + # already created as a bare package 
prefix; attach members + for member in members: + setattr(sys.modules[full], member, _make_stub_member(f"{full}.{member}")) + else: + _register_module(full, is_package=is_pkg, members=members, added=added) + + # wire parent.child attributes so ``from a.b.c import x`` resolves + for name in list(sys.modules): + if name == top or name.startswith(top + "."): + if "." in name: + parent, child = name.rsplit(".", 1) + if parent in sys.modules: + setattr(sys.modules[parent], child, sys.modules[name]) + yield + finally: + for name in added: + sys.modules.pop(name, None) diff --git a/src/nnsight/modeling/transformers.py b/src/nnsight/modeling/transformers.py index 1cc2998a..0218fc1f 100755 --- a/src/nnsight/modeling/transformers.py +++ b/src/nnsight/modeling/transformers.py @@ -1,4 +1,5 @@ from .huggingface import HuggingFaceModel +from ._kernel_shim import meta_kernel_shim from torch.nn.modules import Module from transformers import AutoConfig, PreTrainedModel, PretrainedConfig @@ -52,6 +53,12 @@ def _load_config(self, repo_id: str, revision: Optional[str] = None, **kwargs): if self.config is None: + # Default to trusting remote code so the config class matches the + # remote modeling code used for meta/dispatch. Some remote configs + # understand fields the native class does not (e.g. newer Nemotron-H + # ``hybrid_override_pattern`` block types). + kwargs.setdefault("trust_remote_code", True) + self.__dict__["config"] = AutoConfig.from_pretrained( repo_id, revision=revision, **kwargs ) @@ -65,7 +72,20 @@ def _load_meta( self._load_config(repo_id, revision=revision, **kwargs) - model = self.automodel.from_config(self.config, trust_remote_code=True) + # Keep the meta implementation consistent with the dispatched one: both + # default to trusting remote code so the intervention tree the client + # builds matches the model that is actually loaded/served (e.g. the + # Nemotron-H remote code, whose per-expert layout differs from the native + # transformers class). 
+ trust_remote_code = kwargs.get("trust_remote_code", True) + + # Some remote modeling files hard-import CUDA-only kernels (mamba_ssm, + # causal_conv1d) at module import time. A meta model never runs a forward, + # so satisfy those imports with inert stubs and keep the client GPU-free. + with meta_kernel_shim(): + model = self.automodel.from_config( + self.config, trust_remote_code=trust_remote_code + ) self.__dict__["config"] = model.config @@ -80,6 +100,10 @@ def _load( self._load_config(repo_id, revision=revision, **kwargs) + # Mirror the meta path's default so dispatch loads the same implementation + # the intervention tree was built against. + kwargs.setdefault("trust_remote_code", True) + model = self.automodel.from_pretrained(repo_id, revision=revision, **kwargs) self.__dict__["config"] = model.config From 56219a1e22f423d6520b062968af25e6391036b4 Mon Sep 17 00:00:00 2001 From: khaiwang Date: Thu, 11 Jun 2026 21:35:50 -0400 Subject: [PATCH 2/2] refactor(kernel-shim): trim to the minimal stub the current offenders need This shim is a local workaround for remote modeling code that unguardedly imports CUDA kernels at module level (Nemotron-H's rmsnorm_fn import); the real fix belongs upstream. Drop the speculative generality accordingly: - Remove the parent.child attribute wiring: the known files only use the `from a.b.c import x` form, which resolves via the IMPORT_FROM sys.modules fallback without it. A comment marks where to re-add wiring if a file ever uses dotted attribute access (`import mamba_ssm; mamba_ssm.ops...`). - Trim the stub table to the one unconditional import. The sibling kernel imports sit behind transformers availability guards that answer False when the package isn't installed, so their stubs were never consumed. 
- Fix a latent crash present since the original shim: on a CUDA-visible machine without the kernels, is_mamba_2_ssm_available() finds the stub via find_spec, finds no pip metadata, falls back to __version__, and crashed on version.parse("N/A"). Stamp __version__ = "0.0.0" on the stub so every version gate parses and correctly answers "not available". Verified GPU-less (Nemotron-3 4B + 120B + gpt2 meta-load, sys.modules clean after) and CUDA-visible without kernels (previously crashed, now loads). Co-Authored-By: Claude Fable 5 --- src/nnsight/modeling/_kernel_shim.py | 49 ++++++++++++++++++---------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/src/nnsight/modeling/_kernel_shim.py b/src/nnsight/modeling/_kernel_shim.py index 1ecfcbb6..4e7c0f50 100644 --- a/src/nnsight/modeling/_kernel_shim.py +++ b/src/nnsight/modeling/_kernel_shim.py @@ -20,6 +20,14 @@ The shim is a **no-op when the real kernel is installed** (e.g. on a GPU host), so it never shadows a genuine implementation. + +This is a deliberately narrow, local workaround — not a general compatibility +layer. The actual defect is in the remote modeling code: optional CUDA kernels +should be imported behind an availability guard (NVIDIA's adjacent kernel +imports already are; ``rmsnorm_fn`` is not). The shim fakes only as much of the +import machinery as the known offending files exercise; see the note at the end +of :func:`meta_kernel_shim` for what would break first if a new file imports +these packages differently. """ from __future__ import annotations @@ -33,20 +41,17 @@ from typing import Dict, List # top-level package -> {fully-qualified submodule it exposes: [members to stub]}. -# Only the members that remote modeling files import *unconditionally* strictly -# need stubbing; the rest are harmless and make the shim robust across the -# several Mamba-family remote files that copy this import block. 
+# Deliberately minimal: only the import the known remote files perform +# *unconditionally* (Nemotron-H's ``rmsnorm_fn``). Their sibling kernel imports +# (mamba_ssm selective_state_update / ssd_combined, causal_conv1d) are behind +# availability guards that stay False when the package isn't truly installed, +# so they never reach the stub. If a future remote file imports one of those +# unconditionally — or a transformers version starts answering its availability +# guard from find_spec alone (which the stub satisfies) — the meta load will +# fail loudly with ImportError/ModuleNotFoundError; add that entry here then. _KERNEL_STUBS: Dict[str, Dict[str, List[str]]] = { "mamba_ssm": { "mamba_ssm.ops.triton.layernorm_gated": ["rmsnorm_fn"], - "mamba_ssm.ops.triton.selective_state_update": ["selective_state_update"], - "mamba_ssm.ops.triton.ssd_combined": [ - "mamba_chunk_scan_combined", - "mamba_split_conv1d_scan_combined", - ], - }, - "causal_conv1d": { - "causal_conv1d": ["causal_conv1d_fn", "causal_conv1d_update"], }, } @@ -123,13 +128,21 @@ def meta_kernel_shim(force: bool | None = None): else: _register_module(full, is_package=is_pkg, members=members, added=added) - # wire parent.child attributes so ``from a.b.c import x`` resolves - for name in list(sys.modules): - if name == top or name.startswith(top + "."): - if "." in name: - parent, child = name.rsplit(".", 1) - if parent in sys.modules: - setattr(sys.modules[parent], child, sys.modules[name]) + # transformers availability guards (is_mamba_2_ssm_available etc.) + # see the stub via find_spec, find no pip metadata, and fall back to + # parsing the package's __version__ — which must therefore exist and + # be parseable. "0.0.0" fails every minimum-version comparison, so + # all guards correctly answer "not available". 
+ sys.modules[top].__version__ = "0.0.0" + + # NOTE: we do NOT set parent.child attributes (mamba_ssm.ops, + # mamba_ssm.ops.triton, etc.), which a real import would set as the final step of loading a + # submodule. The known offending files only use the + # ``from a.b.c import x`` form, which resolves via the IMPORT_FROM + # sys.modules fallback even without those attributes. A remote file + # that instead does ``import mamba_ssm`` and later dereferences + # ``mamba_ssm.ops...`` would raise AttributeError here — if that ever + # appears, wire the parent attributes at this point. yield finally: for name in added: