diff --git a/reproject/_array_utils.py b/reproject/_array_utils.py
index 26a6d2686..7691f573c 100644
--- a/reproject/_array_utils.py
+++ b/reproject/_array_utils.py
@@ -340,6 +340,16 @@ def aligned_chunks(lo, hi, edges=edges):
 
 
 class ArrayWrapper:
+    """
+    A minimal getitem-only wrapper hiding an array from dask's tokenizer.
+
+    Passing a Numpy array (in particular a memmap) directly to
+    ``da.from_array`` can make dask hash the whole buffer to compute the array
+    name, which silently loads the entire file into memory (see
+    https://github.com/dask/dask/issues/11850). Wrapping the array so that
+    dask can only access it through ``__getitem__``, combined with an explicit
+    ``name=``, guarantees the data is only ever read chunk by chunk.
+    """
 
     def __init__(self, array):
         self._array = array
diff --git a/reproject/_common.py b/reproject/_common.py
index f8a52b54c..6530c3016 100644
--- a/reproject/_common.py
+++ b/reproject/_common.py
@@ -1,4 +1,5 @@
 import logging
+import mmap
 import os
 import tempfile
 import uuid
@@ -101,9 +102,10 @@ def _reproject_dispatcher(
         given as a tuple of sequential integers starting from zero (e.g.
         ``(0,)`` or ``(0, 1)``). If `None` (the default), any leading dimensions
         for which the WCS has fewer dimensions than the data are treated this
-        way. Reprojecting fewer dimensions than the WCS currently requires a
-        ``block_size`` that matches the output shape along the reprojected
-        dimensions.
+        way. Reprojecting fewer dimensions than the WCS currently requires an
+        explicit ``block_size``; its entries along the reprojected dimensions
+        may either match the output shape or be smaller, in which case each
+        non-reprojected slice is reprojected in sub-tiles of that size.
     array_out : `~numpy.ndarray`, optional
         An array in which to store the reprojected data.  This can be any numpy
         array including a memory map, which may be helpful when dealing with
@@ -184,15 +186,15 @@ def _reproject_dispatcher(
                 "non_reprojected_dims should leave at least one dimension to be " "reprojected"
             )
 
-    # If we are reprojecting fewer dimensions than the input or output WCS has,
-    # the WCS needs to be sliced down to the reprojected dimensions for each
-    # non-reprojected slice. This is currently only done when parallelizing over
-    # the non-reprojected (broadcasted) dimensions, so any other code path would
-    # silently reproject the dimensions that should have been left untouched.
-    # This is gated on non_reprojected_dims being set since that is the only way
-    # to opt into reprojecting fewer dimensions than the WCS; a plain mismatch
-    # between the input and output WCS dimensionality is instead a validation
-    # error raised by the underlying reprojection function.
+    # ``wcs_slicing_required`` flags that we are reprojecting fewer dimensions than
+    # the input or output WCS describes, so the WCS must be sliced down to the
+    # reprojected dimensions for each non-reprojected slice. That slicing is only
+    # implemented on the path that parallelizes over the non-reprojected
+    # (broadcasted) dimensions; the other code paths raise NotImplementedError below
+    # rather than attempting it. It is gated on non_reprojected_dims being set, the
+    # only way to opt into reprojecting fewer dimensions than the WCS; a plain
+    # mismatch between the input and output WCS dimensionality is instead a
+    # validation error raised by the underlying reprojection function.
     wcs_slicing_required = non_reprojected_dims is not None and (
         n_dim_reproject < wcs_in.low_level_wcs.pixel_n_dim
         or n_dim_reproject < wcs_out.low_level_wcs.pixel_n_dim
@@ -305,28 +307,59 @@ def _reproject_dispatcher(
                 for i in range(len(block_size))
             )
 
-        # Check block size and determine whether block size indicates we should
-        # parallelize over broadcasted dimension. The logic is as follows: if
-        # the block size and output shape are the same size, then either the
-        # block size should match the output shape along the broadcasted
-        # dimensions or along the non-broadcasted dimensions. If it matches the
-        # non-broadcasted dimensions we can parallelize over the broadcasted
-        # dimensions. If the block size does not match the output shape, we
-        # don't make any assumptions for now and assume a single chunk in the
-        # missing dimensions.
+        # Decide whether the requested block size means we should parallelize over
+        # the broadcasted (non-reprojected, leading) dimensions. block_size has
+        # already been padded above to one entry per output dimension, so this is not
+        # about the number of entries but about which dimensions the block spans the
+        # full output extent along:
+        #  - if the block spans the full extent along the reprojected (trailing)
+        #    dimensions, each block is one whole non-reprojected slice, so we
+        #    parallelize over the broadcasted dimensions (one slice per block);
+        #  - if instead it spans the full extent along the broadcasted (leading)
+        #    dimensions, the block tiles the reprojected dimensions and we do not
+        #    parallelize over the broadcasted dimensions;
+        #  - if it spans the full extent along neither, we raise, unless
+        #    non_reprojected_dims requires slicing the WCS per non-reprojected slice,
+        #    in which case a block smaller than the slice sub-tiles each slice.
         broadcasted_parallelization = False
         if broadcasting and block_size is not None and block_size != "auto":
             if block_size[-n_dim_reproject:] == shape_out[-n_dim_reproject:]:
-                # TODO: maybe error if block_size was given in full and is wrong
                 broadcasted_parallelization = True
-                block_size = (1,) * (len(shape_out) - n_dim_reproject) + block_size[
-                    -n_dim_reproject:
-                ]
+            elif wcs_slicing_required:
+                # A block smaller than the output along the reprojected dimensions
+                # is only meaningful when the WCS has to be sliced per broadcasted
+                # slice (i.e. non_reprojected_dims). We parallelize one broadcasted
+                # slice per block and let dask additionally tile the reprojected
+                # dimensions according to the block size, which bounds the
+                # coordinate-transform memory (it would otherwise scale with the
+                # full slice size). Each output tile is still reprojected from the
+                # whole input slice, since any output pixel can map anywhere within
+                # it.
+                broadcasted_parallelization = True
             elif block_size[:-n_dim_reproject] != shape_out[:-n_dim_reproject]:
                 raise ValueError(
                     "block shape should either match output data shape along "
                     "reprojected dimensions or non-reprojected dimensions"
                 )
+            if broadcasted_parallelization:
+                # One broadcasted slice per block; dask tiles the reprojected
+                # dimensions using whatever block size was requested along them.
+                # The block size along the non-reprojected dimensions must be 1
+                # or span the full extent (equivalent here, since blocks are
+                # single slices either way); anything else would be silently
+                # reinterpreted, so raise instead.
+                if any(
+                    entry not in (1, shape_out[idim])
+                    for idim, entry in enumerate(block_size[: len(shape_out) - n_dim_reproject])
+                ):
+                    raise ValueError(
+                        f"block_size {block_size} should be 1 or match the output shape "
+                        "along the non-reprojected dimensions (each block covers a "
+                        "single non-reprojected slice)"
+                    )
+                block_size = (1,) * (len(shape_out) - n_dim_reproject) + block_size[
+                    -n_dim_reproject:
+                ]
 
         logger.info(
             f"{'P' if broadcasted_parallelization else 'Not p'}arallelizing along "
@@ -341,9 +374,11 @@ def _reproject_dispatcher(
             raise NotImplementedError(
                 "Reprojecting fewer dimensions than the input or output WCS "
                 "(for example using non_reprojected_dims) currently requires "
-                "passing a block_size whose entries along the reprojected "
-                "dimensions match the output shape (optionally with parallel=True "
-                "to compute the blocks concurrently)"
+                "passing an explicit block_size whose entries along the reprojected "
+                "dimensions either match the output shape or are smaller (in which "
+                "case each non-reprojected slice is reprojected in sub-tiles of "
+                "that size), "
+                "optionally with parallel=True to compute the blocks concurrently"
             )
 
         if output_footprint is None and return_footprint and return_type != "dask":
@@ -359,8 +394,8 @@ def reproject_single_block(a, array_or_path, block_info=None):
             ):
                 return np.array([a, a])
 
-            if isinstance(array_or_path, str) and array_or_path == "from-dict":
-                array_or_path = dask_arrays["array"]
+            if isinstance(array_or_path, _ArrayContainer):
+                array_or_path = array_or_path._array
 
             shape_out = block_info[None]["chunk-shape"][1:]
 
@@ -375,8 +410,16 @@ def reproject_single_block(a, array_or_path, block_info=None):
             wcs_in_cp = wcs_in.deepcopy() if isinstance(wcs_in, WCS) else wcs_in
             wcs_out_cp = wcs_out.deepcopy() if isinstance(wcs_out, WCS) else wcs_out
 
+            # Along the reprojected dimensions the input is always kept whole (any
+            # output pixel can map anywhere within it) while dask may tile the
+            # output; along the broadcasted dimensions each block is a single
+            # slice. slices_in/slices_out reduce the input/output WCS to this
+            # block; the matching broadcasted slice of the input either arrives as
+            # the aligned input block or, when the input was passed whole (lazy
+            # dask input), is read out below using slices_in_data.
             slices_in = []
             slices_out = []
+            slices_in_data = []
             for idx in range(len(shape_out)):
                 interval = block_info[None]["array-location"][idx + 1]
                 if broadcasted_parallelization and idx < len(shape_out) - n_dim_reproject:
@@ -387,9 +430,11 @@ def reproject_single_block(a, array_or_path, block_info=None):
                         )
                     slices_in.append(interval[0])
                     slices_out.append(interval[0])
+                    slices_in_data.append(slice(*interval))
                 else:
                     slices_in.append(slice(None))
                     slices_out.append(slice(*block_info[None]["array-location"][idx + 1]))
+                    slices_in_data.append(slice(None))
 
             slices_in = slices_in[-wcs_in.low_level_wcs.pixel_n_dim :]
             slices_out = slices_out[-wcs_out.low_level_wcs.pixel_n_dim :]
@@ -411,16 +456,27 @@ def reproject_single_block(a, array_or_path, block_info=None):
 
             wcs_out_sub = HighLevelWCSWrapper(low_level_wcs_out)
 
-            if isinstance(array_or_path, tuple):
+            if broadcasted_parallelization and input_aligned:
+                # The input was passed as an aligned dask array, so array_or_path
+                # is already this block's broadcasted slice of the input, kept
+                # whole along the reprojected dimensions (see above).
+                array_in = array_or_path
+            elif isinstance(array_or_path, tuple):
                 array_in = np.memmap(array_or_path[0], **array_or_path[1], mode="r")
             elif isinstance(array_or_path, str):
                 array_in = np.memmap(array_or_path, dtype=float, shape=shape_in, mode="r")
             else:
                 array_in = array_or_path
 
-            if array_or_path is None:
+            if array_in is None:
                 raise RuntimeError("array_or_path is not set")
 
+            if broadcasted_parallelization and not input_aligned:
+                # The input was passed whole as a lazy dask array; read out a lazy
+                # view of this block's broadcasted slice so a streaming
+                # reprojection core only computes the input chunks it touches.
+                array_in = array_in[tuple(slices_in_data)]
+
             array, footprint = reproject_func(
                 array_in,
                 wcs_in_sub,
@@ -432,98 +488,140 @@ def reproject_single_block(a, array_or_path, block_info=None):
 
             return np.array([array, footprint])
 
-        if broadcasted_parallelization:
-
-            array_out_dask = da.empty(shape_out, chunks=block_size)
-
-            # The input is reprojected in full for each output block, so it must
-            # not be chunked along the reprojected dimensions (which can have a
-            # different size from the output); only the broadcasted dimensions are
-            # chunked, matching array_out_dask block for block.
+        input_aligned = False
+        if broadcasted_parallelization and (
+            not isinstance(array_in, da.core.Array)
+            or dask_method != "none"
+            or all(len(chunks) == 1 for chunks in array_in.chunks[-n_dim_reproject:])
+        ):
+            # Pass the input as a second dask array with one chunk per broadcasted
+            # slice, kept whole along the reprojected dimensions (any output pixel
+            # can map anywhere within its slice). map_blocks broadcasts the single
+            # chunk along the reprojected dimensions to every output tile of that
+            # slice, so each slice is computed exactly once and streamed to the
+            # tasks that need it: dask array inputs are never materialized in
+            # full, sub-tiled slices do not recompute their input per tile, and
+            # under a distributed scheduler each task depends only on its own
+            # slice rather than embedding the whole input. The exception is a dask
+            # input with dask_method='none' that is chunked below one slice along
+            # the reprojected dimensions: materializing it here would forgo the
+            # ability of streaming reprojection cores to work chunk by chunk
+            # without ever holding a whole slice, so it is kept lazy below.
+            input_aligned = True
             input_chunks = (1,) * (array_in.ndim - n_dim_reproject) + (-1,) * n_dim_reproject
             if isinstance(array_in, da.core.Array):
-                array_in = array_in.rechunk(input_chunks)
+                array_in_dask = array_in.rechunk(input_chunks)
+                # Blockwise fusion would fold the input graph into every output
+                # tile task, recomputing each broadcasted slice once per tile of
+                # that slice; routing each slice through a delayed task pins it
+                # as a single node in the graph that all of its tiles share.
+                delayed_blocks = array_in_dask.to_delayed()
+                pieces = np.empty(delayed_blocks.shape, dtype=object)
+                for index in np.ndindex(delayed_blocks.shape):
+                    shape = tuple(
+                        array_in_dask.chunks[idim][index[idim]]
+                        for idim in range(array_in_dask.ndim)
+                    )
+                    pieces[index] = da.from_delayed(
+                        delayed_blocks[index], shape=shape, dtype=array_in_dask.dtype
+                    )
+                array_in_or_path = da.block(pieces.tolist())
             else:
-                array_in = da.asarray(
-                    ArrayWrapper(array_in), name=str(uuid.uuid4()), chunks=input_chunks
+                # ArrayWrapper (plus the explicit name) prevents dask from
+                # hashing the whole buffer to name the array, which for a memmap
+                # would silently load the entire file into memory (see
+                # https://github.com/dask/dask/issues/11850).
+                array_in_or_path = da.from_array(
+                    ArrayWrapper(array_in),
+                    name=f"reproject-input-{uuid.uuid4().hex}",
+                    chunks=input_chunks,
                 )
 
-            result = da.map_blocks(
-                reproject_single_block,
-                array_out_dask,
-                array_in,
-                dtype="<f8",
-                new_axis=0,
-                chunks=((2,),) + array_out_dask.chunks,
-            )
-
-        else:
-
-            # As we use the synchronous or threads scheduler, we don't need to worry about
-            # the data getting copied, so if the data is already a Numpy array (including
-            # a memory-mapped array) then we don't need to do anything special. However,
-            # if the input array is a dask array, we should convert it to a Numpy
-            # memory-mapped array so that it can be used by the various reprojection
-            # functions (which don't internally work with dask arrays).
-
-            if isinstance(array_in, np.memmap) and array_in.flags.c_contiguous:
-                array_in_or_path = array_in.filename, {
-                    "dtype": array_in.dtype,
-                    "shape": array_in.shape,
-                    "offset": array_in.offset,
-                }
-            elif isinstance(array_in, da.core.Array) or return_type == "dask":
-                if dask_method == "memmap":
-                    if return_type == "dask":
-                        # We should use a temporary directory that will persist beyond
-                        # the call to the reproject function.
-                        tmp_dir = tempfile.mkdtemp()
-                    else:
-                        tmp_dir = local_tmp_dir
-                    array_in_or_path = as_delayed_memmap_path(_ArrayContainer(array_in), tmp_dir)
+        elif broadcasted_parallelization:
+            # A dask input with dask_method='none' chunked below one slice along
+            # the reprojected dimensions: pass it whole as an opaque constant and
+            # let each block read out a lazy view of its own slice, so that a
+            # streaming reprojection core (e.g. interpolation via dask-image) only
+            # ever computes the input chunks that each output tile touches and a
+            # full slice need never be materialized at once. The tradeoff is that
+            # input chunks touched by several tiles are computed once per tile.
+            array_in_or_path = _ArrayContainer(array_in)
+
+        # For the remaining (non-broadcasted) cases the input is passed to
+        # map_blocks as an opaque (non-dask) argument, so that every task sees the
+        # whole input. As we use the synchronous or threads scheduler, we don't need
+        # to worry about the data getting copied, so if the data is already a Numpy
+        # array (including a memory-mapped array) then we don't need to do anything
+        # special. However, if the input array is a dask array, we should convert
+        # it to a Numpy memory-mapped array so that it can be used by the various
+        # reprojection functions (which don't internally work with dask arrays).
+
+        # Only base memmaps can be reconstructed from filename and offset: views
+        # (e.g. a slice of a memmap) keep the parent's unadjusted .offset, so
+        # reconstructing them would silently read the wrong file region. Views
+        # fall through and are passed by reference like plain arrays.
+        elif (
+            isinstance(array_in, np.memmap)
+            and array_in.flags.c_contiguous
+            and isinstance(array_in.base, mmap.mmap)
+        ):
+            array_in_or_path = array_in.filename, {
+                "dtype": array_in.dtype,
+                "shape": array_in.shape,
+                "offset": array_in.offset,
+            }
+        elif isinstance(array_in, da.core.Array) or return_type == "dask":
+            if dask_method == "memmap":
+                if return_type == "dask":
+                    # We should use a temporary directory that will persist beyond
+                    # the call to the reproject function.
+                    tmp_dir = tempfile.mkdtemp()
                 else:
-                    dask_arrays = {"array": array_in}
-                    array_in_or_path = "from-dict"
+                    tmp_dir = local_tmp_dir
+                array_in_or_path = as_delayed_memmap_path(_ArrayContainer(array_in), tmp_dir)
             else:
-                # Here we could set array_in_or_path to array_in_path if it has
-                # been set previously, but in synchronous and threaded mode it is
-                # better to simply pass a reference to the memmap array itself to
-                # avoid having to load the memmap inside each
-                # reproject_single_block call.
-                array_in_or_path = array_in
-
-            if block_size is not None and block_size != "auto":
-                array_out_dask = da.empty(shape_out, chunks=block_size)
-            else:
-                if broadcasting:
-                    chunks = (-1,) * (len(shape_out) - n_dim_reproject)
-                    chunks += ("auto",) * n_dim_reproject
-                    rechunk_kwargs = {"chunks": chunks}
-                else:
-                    rechunk_kwargs = {}
-                array_out_dask = da.empty(shape_out)
-                array_out_dask = array_out_dask.rechunk(
-                    block_size_limit=64 * 1024**2, **rechunk_kwargs
-                )
-
-            logger.info("Setting up output dask array with map_blocks")
+                # Wrap the dask array in _ArrayContainer so dask treats it as an
+                # opaque constant (rather than a collection to compute/align) when
+                # it is passed through to the block function.
+                array_in_or_path = _ArrayContainer(array_in)
+        else:
+            # Here we could set array_in_or_path to array_in_path if it has
+            # been set previously, but in synchronous and threaded mode it is
+            # better to simply pass a reference to the memmap array itself to
+            # avoid having to load the memmap inside each
+            # reproject_single_block call.
+            array_in_or_path = array_in
 
-            result = da.map_blocks(
-                reproject_single_block,
-                array_out_dask,
-                array_in_or_path,
-                dtype="<f8",
-                new_axis=0,
-                chunks=(2,) + array_out_dask.chunksize,
-            )
+        if block_size is not None and block_size != "auto":
+            array_out_dask = da.empty(shape_out, chunks=block_size)
+        else:
+            if broadcasting:
+                chunks = (-1,) * (len(shape_out) - n_dim_reproject)
+                chunks += ("auto",) * n_dim_reproject
+                rechunk_kwargs = {"chunks": chunks}
+            else:
+                rechunk_kwargs = {}
+            array_out_dask = da.empty(shape_out)
+            array_out_dask = array_out_dask.rechunk(block_size_limit=64 * 1024**2, **rechunk_kwargs)
+
+        logger.info("Setting up output dask array with map_blocks")
+
+        # Declare the exact (possibly ragged) chunks of the output template so
+        # that edge blocks are computed at their true size rather than being
+        # reprojected at the full block size and truncated afterwards.
+        result = da.map_blocks(
+            reproject_single_block,
+            array_out_dask,
+            array_in_or_path,
+            dtype="<f8",
+            new_axis=0,
+            chunks=((2,),) + array_out_dask.chunks,
+        )
 
         # Ensure that there are no more references to Numpy memmaps
         array_in = None
         array_in_or_path = None
 
-        # Truncate extra elements
-        result = result[tuple([slice(None)] + [slice(s) for s in shape_out])]
-
         if return_type == "dask":
             if return_footprint:
                 return result[0], result[1]
diff --git a/reproject/adaptive/_high_level.py b/reproject/adaptive/_high_level.py
index 6d9688fa7..961a001d2 100644
--- a/reproject/adaptive/_high_level.py
+++ b/reproject/adaptive/_high_level.py
@@ -213,9 +213,12 @@ def reproject_adaptive(
         even when the input and output WCS have the same number of dimensions as
         the data. The dimensions must be the leading ones, given as a tuple of
         sequential integers starting from zero (e.g. ``(0,)`` or ``(0, 1)``).
-        This currently requires passing a ``block_size`` whose entries along
-        the reprojected dimensions match ``shape_out`` (optionally combined
-        with ``parallel`` to compute the blocks concurrently).
+        This currently requires passing an explicit ``block_size``; its entries
+        along the reprojected dimensions may either match ``shape_out`` or be
+        smaller, in which case each non-reprojected slice is reprojected in
+        sub-tiles of that size to keep the coordinate-transform memory bounded.
+        This can optionally be combined with ``parallel`` to compute the blocks
+        concurrently.
     parallel : bool or int or str, optional
         If `True`, the reprojection is carried out in parallel, and if a
         positive integer, this specifies the number of threads to use.
diff --git a/reproject/interpolation/_high_level.py b/reproject/interpolation/_high_level.py
index 876b63605..4fbef5aaf 100644
--- a/reproject/interpolation/_high_level.py
+++ b/reproject/interpolation/_high_level.py
@@ -109,9 +109,12 @@ def reproject_interp(
         even when the input and output WCS have the same number of dimensions as
         the data. The dimensions must be the leading ones, given as a tuple of
         sequential integers starting from zero (e.g. ``(0,)`` or ``(0, 1)``).
-        This currently requires passing a ``block_size`` whose entries along
-        the reprojected dimensions match ``shape_out`` (optionally combined
-        with ``parallel`` to compute the blocks concurrently).
+        This currently requires passing an explicit ``block_size``; its entries
+        along the reprojected dimensions may either match ``shape_out`` or be
+        smaller, in which case each non-reprojected slice is reprojected in
+        sub-tiles of that size to keep the coordinate-transform memory bounded.
+        This can optionally be combined with ``parallel`` to compute the blocks
+        concurrently.
     parallel : bool or int or str, optional
         If `True`, the reprojection is carried out in parallel, and if a
         positive integer, this specifies the number of threads to use.
diff --git a/reproject/tests/test_non_reprojected_dims.py b/reproject/tests/test_non_reprojected_dims.py
index 81d17f15d..5b847227f 100644
--- a/reproject/tests/test_non_reprojected_dims.py
+++ b/reproject/tests/test_non_reprojected_dims.py
@@ -57,6 +57,184 @@ def test_non_reprojected_dims(reproject_function):
     assert_allclose(array_out, reference, equal_nan=True)
 
 
+@pytest.mark.parametrize("block_size", [(1, 7, 7), (7, 7), (1, 12, 20)])
+def test_non_reprojected_dims_subtiled(reproject_function, block_size):
+    # A block_size smaller than the output along the reprojected dimensions
+    # (the celestial ones here) should reproject each slice in sub-tiles and give
+    # exactly the same result as reprojecting each full slice in one go. This is
+    # what keeps the coordinate-transform memory bounded for large slices.
+
+    data = np.arange(4 * 20 * 20, dtype=float).reshape((4, 20, 20))
+    wcs_in = _spectral_cube_wcs(0.0, 1e9)
+    wcs_out = _spectral_cube_wcs(0.02, 1e9 + 2e6)
+    shape_out = (4, 20, 20)
+
+    array_full, footprint_full = reproject_function(
+        (data, wcs_in),
+        wcs_out,
+        shape_out=shape_out,
+        non_reprojected_dims=(0,),
+        parallel=True,
+        block_size=(20, 20),
+    )
+
+    array_sub, footprint_sub = reproject_function(
+        (data, wcs_in),
+        wcs_out,
+        shape_out=shape_out,
+        non_reprojected_dims=(0,),
+        parallel=True,
+        block_size=block_size,
+    )
+
+    assert_allclose(array_sub, array_full, equal_nan=True)
+    assert_allclose(footprint_sub, footprint_full, equal_nan=True)
+
+
+@pytest.mark.parametrize("chunks", [(1, 30, 30), (1, 15, 15)])
+@pytest.mark.parametrize("block_size", [(20, 20), (7, 7)])
+def test_non_reprojected_dims_dask_input(reproject_function, block_size, chunks):
+    # A dask-array input must match the identical numpy input, both for
+    # full-plane and sub-tiled blocks. With dask_method='none', an input chunked
+    # one slice at a time is materialized per slice (exactly once), while an
+    # input chunked below one slice is kept lazy so streaming cores never need a
+    # whole slice at once; both must give the same answer. The WCS drifts along
+    # the non-reprojected axis so each slice really is reprojected with its own
+    # WCS.
+    import dask.array as da
+
+    n_time = 5
+    shape_out = (n_time, 30, 30)
+    wcs_in = _drifting_cube_wcs(drift=0.6)
+    wcs_out = _drifting_cube_wcs(drift=0.0)
+
+    data = np.random.default_rng(0).random((n_time, 30, 30))
+
+    reference, _ = reproject_function(
+        (data, wcs_in),
+        wcs_out,
+        shape_out=shape_out,
+        non_reprojected_dims=(0,),
+        parallel=True,
+        block_size=(30, 30),
+    )
+
+    array_out, _ = reproject_function(
+        (da.from_array(data, chunks=chunks), wcs_in),
+        wcs_out,
+        shape_out=shape_out,
+        non_reprojected_dims=(0,),
+        parallel=True,
+        block_size=block_size,
+        dask_method="none",
+    )
+
+    assert_allclose(array_out, reference, equal_nan=True)
+
+
+def test_non_reprojected_dims_sliced_memmap(tmp_path, reproject_function):
+    # A sliced memmap view keeps the parent's unadjusted .offset, so it must not
+    # be reconstructed from filename and offset inside the block tasks (which
+    # would silently reproject the wrong planes); views are passed by reference
+    # instead. Slicing off the leading plane keeps the view c-contiguous, which
+    # is the case that used to take the reconstruction path.
+
+    data = np.arange(5 * 20 * 20, dtype=float).reshape((5, 20, 20))
+    mm = np.memmap(tmp_path / "cube.np", mode="w+", dtype=float, shape=(5, 20, 20))
+    mm[:] = data
+    mm.flush()
+
+    wcs_in = _spectral_cube_wcs(0.0, 1e9)
+    wcs_out = _spectral_cube_wcs(0.02, 1e9 + 2e6)
+    shape_out = (4, 20, 20)
+
+    reference, _ = reproject_function(
+        (data[1:], wcs_in),
+        wcs_out,
+        shape_out=shape_out,
+        non_reprojected_dims=(0,),
+        parallel=True,
+        block_size=(20, 20),
+    )
+
+    array_out, _ = reproject_function(
+        (mm[1:], wcs_in),
+        wcs_out,
+        shape_out=shape_out,
+        non_reprojected_dims=(0,),
+        parallel=True,
+        block_size=(20, 20),
+    )
+
+    assert_allclose(array_out, reference, equal_nan=True)
+
+
+def test_non_reprojected_dims_dask_input_streams_planes(reproject_function):
+    # The input is passed as a dask array with one chunk per non-reprojected
+    # slice, so each input plane must be computed exactly once, including when
+    # the output is sub-tiled (every tile of a plane shares that plane's chunk
+    # rather than recomputing it), and the whole input must never be
+    # materialized at once.
+    import dask.array as da
+
+    n_time = 5
+    shape_out = (n_time, 30, 30)
+    wcs_in = _drifting_cube_wcs(drift=0.6)
+    wcs_out = _drifting_cube_wcs(drift=0.0)
+
+    data = np.random.default_rng(0).random((n_time, 30, 30))
+
+    computed_planes = []
+
+    def record_plane(plane, block_info=None):
+        if block_info:
+            computed_planes.append(block_info[None]["chunk-location"][0])
+        return plane
+
+    lazy = da.from_array(data, chunks=(1, 30, 30)).map_blocks(record_plane)
+
+    array_out, _ = reproject_function(
+        (lazy, wcs_in),
+        wcs_out,
+        shape_out=shape_out,
+        non_reprojected_dims=(0,),
+        parallel=True,
+        block_size=(7, 7),
+        dask_method="none",
+    )
+
+    reference, _ = reproject_function(
+        (data, wcs_in),
+        wcs_out,
+        shape_out=shape_out,
+        non_reprojected_dims=(0,),
+        parallel=True,
+        block_size=(30, 30),
+    )
+
+    assert_allclose(array_out, reference, equal_nan=True)
+    assert sorted(computed_planes) == list(range(n_time))
+
+
+@pytest.mark.parametrize("block_size", [(2, 7, 7), (999, 7, 7), (2, 20, 20)])
+def test_non_reprojected_dims_invalid_leading_block_size(reproject_function, block_size):
+    # Each block covers a single non-reprojected slice, so block_size entries along
+    # the non-reprojected dimensions must be 1 or the full extent; other values
+    # would be silently reinterpreted as 1, so each one must raise (one case each).
+    data = np.ones((4, 20, 20))
+    wcs_in = _spectral_cube_wcs(0.0, 1e9)
+    wcs_out = _spectral_cube_wcs(0.02, 1e9 + 2e6)
+    with pytest.raises(ValueError, match="single non-reprojected slice"):
+        reproject_function(
+            (data, wcs_in),
+            wcs_out,
+            shape_out=(4, 20, 20),
+            non_reprojected_dims=(0,),
+            parallel=True,
+            block_size=block_size,
+        )
+
+
 def test_non_reprojected_dims_invalid_order(reproject_function):
     data = np.ones((4, 20, 20))
     wcs = _spectral_cube_wcs(0.0, 1e9)
@@ -82,12 +260,13 @@ def test_non_reprojected_dims_inconsistent_with_wcs(reproject_function):
 
 
 @pytest.mark.parametrize(
-    "kwargs", [{}, {"parallel": True}, {"parallel": True, "block_size": (4, 10, 10)}]
+    "kwargs", [{}, {"parallel": True}, {"parallel": True, "block_size": "auto"}]
 )
 def test_non_reprojected_dims_unsupported_mode(reproject_function, kwargs):
     # non_reprojected_dims with a full-dimensional WCS is only supported when
-    # parallelizing over the non-reprojected dimensions; other modes should
-    # raise rather than silently reprojecting the non-reprojected axis.
+    # parallelizing over the non-reprojected dimensions, which requires an
+    # explicit block_size; modes without one (including block_size='auto')
+    # should raise rather than silently reprojecting the non-reprojected axis.
     data = np.ones((4, 20, 20))
     wcs_in = _spectral_cube_wcs(0.0, 1e9)
     wcs_out = _spectral_cube_wcs(0.02, 1e9 + 2e6)