Source code for mumott.core.john_transform_sparse_cuda

from typing import Any

from math import floor
from numba import config, cuda, int32, float32, float64, void
from numba.types import Array
import numpy as np


def _is_cuda_array(item: Any):
    """ Internal method which assists in debugging. """
    if config.ENABLE_CUDASIM:
        return False
    else:
        return cuda.is_cuda_array(item)


[docs]def john_transform_sparse_cuda(field: np.ndarray[float], projections: np.ndarray[float], sparse_matrix: tuple,
                               unit_vector_p: np.ndarray[float], unit_vector_j: np.ndarray[float],
                               unit_vector_k: np.ndarray[float], offsets_j: np.ndarray[float],
                               offsets_k: np.ndarray[float]):
    """ Frontend for performing the John transform with parallel
    GPU computing, with a sparse matrix for projection-to-field mapping.

    Parameters
    ----------
    field
        The field to be projected, with 4 dimensions. The last index should
        have the same size as the number of tensor channels. Can be either
        a ``numpy.ndarray``, or a device array that implements the CUDA array
        interface. If a device array is given, no copying to device is needed.
    projections
        A 4-dimensional numpy array where the projections are stored. Should have the
        same shape as the input data matrix.
        The first index runs over the different projection directions. Can be either
        a ``numpy.ndarray``, or a device array that implements the CUDA array
        interface. If a device array is given, no copying or synchronization is needed.
    sparse_matrix
        Contains three ``np.ndarrays``, ``indices``, ``indptr``, and ``data``.
        ``indices[indptr[i, j]]`` to ``indices[indptr[i, j+1]]`` where ``i``
        is the projection channel index and ``j`` is the projection channel index
        contains the corresponding volume channel indices.
        ``data[indptr[i, j]]`` contains the weights for each index.
    unit_vector_p
        The direction of projection in Cartesian coordinates.
    unit_vector_j
        One of the directions for the pixels of :attr:`projections`.
    unit_vector_k
        The other direction for the pixels of :attr:`projections`.
    offsets_j
        Offsets which align projections in the direction of `j`
    offsets_k
        Offsets which align projections in the direction of `k`


    Notes
    -----
    For mathematical details, see :func:`~.john_transform.john_transform`.

    This transform also encodes the detector-to-tensor transform using a sparse matrix.
    """
    # Always false for now as we are sticking to ``float32``.
    if False:
        cuda_float = float64
        numpy_float = np.float64
        cuda_x4_type = cuda.float64x4
    else:
        cuda_float = float32
        numpy_float = np.float32
        cuda_x4_type = cuda.float32x4

    # Find zeroth, first and second directions for indexing. Zeroth is the maximal projection direction.

    # (x, y, z), (y, x, z), (z, x, y)
    direction_0_index = np.argmax(abs(unit_vector_p), axis=1).reshape(-1, 1).astype(np.int32)
    direction_1_index = 1 * (direction_0_index == 0).astype(np.int32).reshape(-1, 1).astype(np.int32)
    direction_2_index = (2 - (direction_0_index == 2)).astype(np.int32).reshape(-1, 1).astype(np.int32)

    # Step size for direction 1 and 2.
    step_sizes_1 = (np.take_along_axis(unit_vector_p, direction_1_index, 1) /
                    np.take_along_axis(unit_vector_p, direction_0_index, 1)).astype(numpy_float).ravel()
    step_sizes_2 = (np.take_along_axis(unit_vector_p, direction_2_index, 1) /
                    np.take_along_axis(unit_vector_p, direction_0_index, 1)).astype(numpy_float).ravel()

    # Shape in each of the three directions.
    dimensions_0 = np.array(field.shape, dtype=numpy_float)[direction_0_index.ravel()]
    dimensions_1 = np.array(field.shape, dtype=numpy_float)[direction_1_index.ravel()]
    dimensions_2 = np.array(field.shape, dtype=numpy_float)[direction_2_index.ravel()]

    # Correction factor for length of line when taking a one-slice step.
    distance_multipliers = np.sqrt(1.0 + step_sizes_1 ** 2 + step_sizes_2 ** 2).astype(numpy_float)

    max_index = projections.shape[0]
    max_j = projections.shape[1]
    max_k = projections.shape[2]

    # CUDA chunking and memory size constants.
    projection_channels = int(projections.shape[-1])
    threads_j = int(8)
    threads_k = int(8)
    threads_angle = int(4)
    # Indices to navigate each projection. s is the surface positioning.
    k_vectors = unit_vector_k.astype(numpy_float)
    j_vectors = unit_vector_j.astype(numpy_float)
    s_vectors = (unit_vector_k * (-0.5 * max_k + offsets_k.reshape(-1, 1)) +
                 unit_vector_j * (-0.5 * max_j + offsets_j.reshape(-1, 1))).astype(numpy_float)

    # (0, 1, 2) if x main, (1, 0, 2) if y main, (2, 0, 1) if z main.
    direction_indices = np.stack((direction_0_index.ravel(),
                                  direction_1_index.ravel(),
                                  direction_2_index.ravel()), axis=1)

    # Bilinear interpolation over each slice.

    sparse_pointers_all = sparse_matrix[0].astype(np.int32)
    sparse_indices_all = sparse_matrix[1].astype(np.int32)
    sparse_weights_all = sparse_matrix[2].astype(numpy_float)
    sparse_ranges_all = np.diff(sparse_pointers_all, axis=-1).astype(np.int32)

    @cuda.jit(void(Array(float32, 4, 'C', readonly=True), int32, cuda_float,
              cuda_float, cuda_float, int32[::1], cuda_float[::1],
              Array(int32, 1, 'C', readonly=True),
              Array(float32, 1, 'C', readonly=True),
              Array(int32, 1, 'C', readonly=True),
              Array(int32, 1, 'C', readonly=True)), device=True,
              lineinfo=True, fastmath=True, cache=True)
    def bilinear_interpolation(field: np.ndarray[float],
                               direction_0,
                               r0: float,
                               r1: float,
                               r2: float,
                               dimensions,
                               accumulator: np.ndarray[float],
                               sparse_pointers: np.ndarray[int],
                               sparse_weights: np.ndarray[float],
                               sparse_indices: np.ndarray[int],
                               sparse_ranges: np.ndarray[int]):
        """ Kernel for bilinear interpolation. Replaces texture interpolation."""
        if not (-1 <= r1 < dimensions[1] and -1 <= r2 < dimensions[2]):
            return
        # At edges, interpolate between value and 0.
        if (0 <= r1 < dimensions[1] - 1):
            r1_weight = int32(1)
        else:
            r1_weight = int32(0)

        if (0 <= r2 < dimensions[2] - 1):
            r2_weight = int32(1)
        else:
            r2_weight = int32(0)

        if int32(r1) == -1:
            r1_edge_weight = int32(1)
        else:
            r1_edge_weight = int32(0)

        if int32(r2) == -1:
            r2_edge_weight = int32(1)
        else:
            r2_edge_weight = int32(0)

        weight_1 = cuda_float((r1 - floor(r1)) * r1_weight) * (1 - r1_edge_weight)
        weight_2 = cuda_float((r2 - floor(r2)) * r2_weight) * (1 - r2_edge_weight)
        t = cuda_x4_type((1 - weight_1) * (1 - weight_2),
                         (weight_1 * weight_2),
                         ((1 - weight_1) * weight_2),
                         ((1 - weight_2) * weight_1))
        # Branch should be abstracted away by compiler, but could be done with pointer arithmetic.
        if (direction_0 == 0):
            x = int32(floor(r0))
            y = int32(floor(r1) + r1_edge_weight)
            y2 = y + r1_weight
            z = int32(floor(r2) + r2_edge_weight)
            z2 = z + r2_weight
            for i in range(accumulator.size):
                pointer = sparse_pointers[i]
                for j in range(sparse_ranges[i]):
                    index = sparse_indices[pointer + j]
                    weight = sparse_weights[pointer + j]
                    accumulator[i] += field[x, y, z, index] * t.x * weight
                    accumulator[i] += field[x, y2, z, index] * t.w * weight
                    accumulator[i] += field[x, y, z2, index] * t.z * weight
                    accumulator[i] += field[x, y2, z2, index] * t.y * weight

        elif (direction_0 == 1):
            x = int32(floor(r1) + r1_edge_weight)
            x2 = x + r1_weight
            y = int32(floor(r0))
            z = int32(floor(r2) + r2_edge_weight)
            z2 = z + r2_weight
            for i in range(accumulator.size):
                pointer = sparse_pointers[i]
                for j in range(sparse_ranges[i]):
                    index = sparse_indices[pointer + j]
                    weight = sparse_weights[pointer + j]
                    accumulator[i] += field[x, y, z, index] * t.x * weight
                    accumulator[i] += field[x2, y, z, index] * t.w * weight
                    accumulator[i] += field[x, y, z2, index] * t.z * weight
                    accumulator[i] += field[x2, y, z2, index] * t.y * weight

        elif (direction_0 == 2):
            x = int32(floor(r1) + r1_edge_weight)
            x2 = x + r1_weight
            y = int32(floor(r2) + r2_edge_weight)
            y2 = y + r2_weight
            z = int32(floor(r0))
            for i in range(accumulator.size):
                pointer = sparse_pointers[i]
                for j in range(sparse_ranges[i]):
                    index = sparse_indices[pointer + j]
                    weight = sparse_weights[pointer + j]
                    accumulator[i] += field[x, y, z, index] * t.x * weight
                    accumulator[i] += field[x2, y, z, index] * t.w * weight
                    accumulator[i] += field[x, y2, z, index] * t.z * weight
                    accumulator[i] += field[x2, y2, z, index] * t.y * weight

    @cuda.jit(void(Array(float32, 4, 'C', readonly=True), cuda_float[:, :, :, ::1],
                   Array(int32, 2, 'C', readonly=True),
                   Array(float32, 2, 'C', readonly=True),
                   Array(int32, 2, 'C', readonly=True),
                   Array(int32, 2, 'C', readonly=True)),
              lineinfo=True, fastmath=True, cache=True)
    def john_transform_inner(field: np.ndarray[float], projection: np.ndarray[float],
                             sparse_pointers, sparse_weights, sparse_indices, sparse_ranges):
        """ Performs the John transform of a field. Relies on a large number
        of pre-defined constants outside the kernel body. """
        index = cuda.blockIdx.y * threads_angle + cuda.threadIdx.y
        j = cuda.threadIdx.x + threads_j * (
                cuda.blockIdx.x % (
                    (max_j + threads_j - 1) // threads_j))

        if (j >= max_j) or (index >= max_index):
            return

        start_k = threads_k * (cuda.blockIdx.x // (
            (max_j + threads_j - 1) // threads_j))
        end_k = start_k + threads_k
        if end_k > max_k:
            end_k = max_k

        if start_k >= end_k:
            return

        # Define compile-time constants.
        step_size_1 = step_sizes_1[index]
        step_size_2 = step_sizes_2[index]
        k_vectors_c = k_vectors[index]
        j_vectors_c = j_vectors[index]
        s_vectors_c = s_vectors[index]
        dimensions_0_c = dimensions_0[index]
        dimensions_1_c = dimensions_1[index]
        dimensions_2_c = dimensions_2[index]
        dimensions = cuda.local.array(3, int32)
        dimensions[0] = dimensions_0_c
        dimensions[1] = dimensions_1_c
        dimensions[2] = dimensions_2_c
        direction_indices_c = direction_indices[index]
        distance_multiplier = distance_multipliers[index]

        # Could be chunked for very asymmetric samples.
        start_slice = 0
        end_slice = start_slice + dimensions[0]
        accumulator = cuda.local.array(projection_channels, cuda_float)

        for k in range(start_k, end_k):
            for i in range(projection_channels):
                accumulator[i] = 0.

            fj = cuda_float(j) + 0.5
            fk = cuda_float(k) + 0.5

            # Initial coordinates of projection.
            start_position_0 = (s_vectors_c[direction_indices_c[0]] +
                                fj * j_vectors_c[direction_indices_c[0]] +
                                fk * k_vectors_c[direction_indices_c[0]])
            start_position_1 = (s_vectors_c[direction_indices_c[1]] +
                                fj * j_vectors_c[direction_indices_c[1]] +
                                fk * k_vectors_c[direction_indices_c[1]])
            start_position_2 = (s_vectors_c[direction_indices_c[2]] +
                                fj * j_vectors_c[direction_indices_c[2]] +
                                fk * k_vectors_c[direction_indices_c[2]])

            # Centering w.r.t volume.

            position_0 = cuda_float(start_slice) + cuda_float(0.5)
            position_1 = cuda_float(step_size_1 * (cuda_float(start_slice) -
                                    0.5 * dimensions[0] - start_position_0 + 0.5) +
                                    start_position_1 + 0.5 * dimensions[1] - 0.5)
            position_2 = cuda_float(step_size_2 * (cuda_float(start_slice) -
                                    0.5 * dimensions[0] -
                                    start_position_0 + 0.5) +
                                    start_position_2 + 0.5 * dimensions[2] - 0.5)

            for i in range(start_slice, end_slice):
                bilinear_interpolation(field, direction_indices_c[0],
                                       position_0, position_1, position_2,
                                       dimensions, accumulator,
                                       sparse_pointers[index], sparse_weights[index],
                                       sparse_indices[index], sparse_ranges[index])
                position_0 += cuda_float(1.0)
                position_1 += step_size_1
                position_2 += step_size_2

            for i in range(projection_channels):
                projection[index, j, k, i] = accumulator[i] * distance_multiplier

    # Launching of kernel.
    bpg = (((max_j + threads_j - 1) // threads_j) *
           ((max_k + threads_k - 1) // threads_k),
           ((projections.shape[0] + threads_angle - 1) // threads_angle))
    tpb = (threads_j, threads_angle)
    john_transform_grid = john_transform_inner[bpg, tpb]
    sparse_pointers = cuda.to_device(sparse_pointers_all)
    sparse_weights = cuda.to_device(sparse_weights_all)
    sparse_indices = cuda.to_device(sparse_indices_all)
    sparse_ranges = cuda.to_device(sparse_ranges_all)

    def transform_with_transfer(field: np.ndarray[float],
                                projections: np.ndarray[float]) -> np.ndarray[float]:
        if _is_cuda_array(field):
            assert field.dtype == 'float32'
            device_field = cuda.as_cuda_array(field)
            transfer_projections = False
        else:
            assert field.dtype == np.float32
            device_field = cuda.to_device(field)
            transfer_projections = True

        if _is_cuda_array(projections):
            assert projections.dtype == 'float32'
            device_projections = cuda.as_cuda_array(projections)
        else:
            assert projections.dtype == np.float32
            device_projections = cuda.to_device(projections)

        john_transform_grid(device_field, device_projections, sparse_pointers,
                            sparse_weights, sparse_indices, sparse_ranges)
        if transfer_projections:
            return device_projections.copy_to_host()
        return device_projections
    return transform_with_transfer


[docs]def john_transform_adjoint_sparse_cuda(field: np.ndarray[float], projections: np.ndarray[float],
                                       sparse_matrix: tuple, unit_vector_p: np.ndarray[float],
                                       unit_vector_j: np.ndarray[float], unit_vector_k: np.ndarray[float],
                                       offsets_j: np.ndarray[float], offsets_k: np.ndarray[float]):
    """ Frontend for performing the adjoint of the John transform with parallel
    GPU computing, with a sparse index for projection-to-field mapping.

    Parameters
    ----------
    field
        The field into which the adjoint is projected, with 4 dimensions. The last index should
        have the same size as the number of tensor channels. Can be either
        a ``numpy.ndarray``, or a device array that implements the CUDA array
        interface. If a device array is given, no copying to device is needed.
    projections
        The projections from which the adjoint is calculated. Should have the same shape
        as the input data matrix.
        The first index runs over the different projection directions. Can be either
        a ``numpy.ndarray``, or a device array that implements the CUDA array
        interface. If a device array is given, no copying or synchronization is needed.
    sparse_matrix
        Contains three ``np.ndarrays``, ``indices``, ``indptr``, and ``data``.
        ``indices[indptr[i, j]]`` to ``indices[indptr[i, j+1]]`` where ``i``
        is the projection channel index and ``j`` is the projection channel index
        contains the corresponding volume channel indices.
        ``data[indptr[i, j]]`` contains the weights for each index.
    unit_vector_p
        The direction of projection in Cartesian coordinates.
    unit_vector_j
        One of the directions for the pixels of :attr:`projections`.
    unit_vector_k
        The other direction for the pixels of :attr:`projections`.
    offsets_j
        Offsets which align projections in the direction of `j`
    offsets_k
        Offsets which align projections in the direction of `k`.

    Notes
    -----
    For mathematical details, see :func:`~.john_transform.john_transform_adjoint`.
    """
    # Always false as we are sticking to float32
    if False:
        cuda_float = float64
        numpy_float = np.float64
        cuda_x4_type = cuda.float64x4
    else:
        cuda_float = float32
        numpy_float = np.float32
        cuda_x4_type = cuda.float32x4

    max_p = projections.shape[0]
    max_j = projections.shape[1]
    max_k = projections.shape[2]
    max_x = field.shape[0]
    max_y = field.shape[1]
    max_z = field.shape[2]
    # CUDA chunking and memory size constants.
    channels = field.shape[-1]
    projection_channels = projections.shape[-1]
    threads_z = int(8)
    # threads_angle = int(512)  # Not used for now.
    threads_x = int(8)
    threads_y = int(4)

    # Projection vectors. s for positioning the projection.
    p_vectors = unit_vector_p.astype(numpy_float)
    k_vectors = unit_vector_k.astype(numpy_float)
    j_vectors = unit_vector_j.astype(numpy_float)
    s_vectors = (unit_vector_k * (-0.5 * max_k + offsets_k.reshape(-1, 1)) +
                 unit_vector_j * (-0.5 * max_j + offsets_j.reshape(-1, 1))).astype(numpy_float)

    # Translate volume steps to normalized projection steps. Can add support for non-square voxels.
    vector_norm = np.einsum('...i, ...i', p_vectors, np.cross(j_vectors, k_vectors))
    norm_j = -np.cross(p_vectors, k_vectors) / vector_norm[..., None]
    norm_k = np.cross(p_vectors, j_vectors) / vector_norm[..., None]
    norm_offset_j = -np.einsum('...i, ...i', p_vectors, np.cross(s_vectors, k_vectors)) / vector_norm
    norm_offset_k = np.einsum('...i, ...i', p_vectors, np.cross(s_vectors, j_vectors)) / vector_norm

    sparse_pointers_all = sparse_matrix[0].astype(np.int32)
    sparse_indices_all = sparse_matrix[1].astype(np.int32)
    sparse_weights_all = sparse_matrix[2].astype(numpy_float)
    sparse_ranges_all = np.diff(sparse_pointers_all, axis=-1).astype(np.int32)

    @cuda.jit(void(Array(float32, 4, 'C', readonly=True), cuda_float,
                   cuda_float, cuda_float, cuda_float[::1],
                   Array(int32, 1, 'C', readonly=True),
                   Array(float32, 1, 'C', readonly=True),
                   Array(int32, 1, 'C', readonly=True),
                   Array(int32, 1, 'C', readonly=True)), device=True,
              lineinfo=True, fastmath=True, cache=True)
    def bilinear_interpolation_projection(projection: np.ndarray[float],
                                          r0: float,
                                          r1: float,
                                          r2: float,
                                          accumulator: np.ndarray[float],
                                          sparse_pointers: np.ndarray[int],
                                          sparse_weights: np.ndarray[float],
                                          sparse_indices: np.ndarray[int],
                                          sparse_ranges: np.ndarray[int]):

        if not (-1 <= r1 < max_j and -1 <= r2 < max_k):
            return
        # At edges, use nearest-neighbor interpolation.
        if (0 <= r1 + 1 < max_j):
            r1_weight = int32(1)
        else:
            r1_weight = int32(0)

        if (0 <= r2 + 1 < max_k):
            r2_weight = int32(1)
        else:
            r2_weight = int32(0)

        if int32(r1) == -1:
            r1_edge_weight = 1
        else:
            r1_edge_weight = 0

        if int32(r2) == -1:
            r2_edge_weight = 1
        else:
            r2_edge_weight = 0

        y_weight = cuda_float((r1 - floor(r1)) * r1_weight) * (1 - r1_edge_weight)
        z_weight = cuda_float((r2 - floor(r2)) * r2_weight) * (1 - r2_edge_weight)
        x = int32(floor(r0))
        y = int32(floor(r1) + r1_edge_weight)
        y2 = y + r1_weight
        z = int32(floor(r2) + r2_edge_weight)
        z2 = z + r2_weight
        t = cuda_x4_type((1 - z_weight) * (1 - y_weight),
                         (z_weight * y_weight),
                         ((1 - z_weight) * y_weight),
                         ((1 - y_weight) * z_weight))
        for i in range(projection_channels):
            pointer = sparse_pointers[i]
            for j in range(sparse_ranges[i]):
                index = sparse_indices[pointer + j]
                weight = sparse_weights[pointer + j]
                accumulator[index] += projection[x, y, z, i] * t.x * weight
                accumulator[index] += projection[x, y2, z, i] * t.z * weight
                accumulator[index] += projection[x, y, z2, i] * t.w * weight
                accumulator[index] += projection[x, y2, z2, i] * t.y * weight

    @cuda.jit(void(cuda_float[:, :, :, ::1], Array(float32, 4, 'C', readonly=True),
                   Array(int32, 2, 'C', readonly=True),
                   Array(float32, 2, 'C', readonly=True),
                   Array(int32, 2, 'C', readonly=True),
                   Array(int32, 2, 'C', readonly=True)),
              lineinfo=True, fastmath=True, cache=True)
    def john_transform_adjoint_inner(field: np.ndarray[float], projection: np.ndarray[float],
                                     sparse_pointers, sparse_weights, sparse_indices,
                                     sparse_ranges):
        """ Performs the John transform of a field. Relies on a large number
        of pre-defined constants outside the kernel body. """
        # Indexing of volume coordinates.
        x = cuda.threadIdx.x + threads_x * (cuda.blockIdx.x % (
                       (max_x + threads_x - 1) // threads_x))
        y = cuda.threadIdx.y + threads_y * (cuda.blockIdx.x // (
                       (max_x + threads_x - 1) // threads_x))
        if (x >= max_x) or (y >= max_y):
            return

        # Stride in z.
        z = cuda.blockIdx.y * threads_z
        z_acc = cuda.local.array(channels, cuda_float)
        # Center of voxel and coordinate system.
        fx = x - 0.5 * max_x + 0.5
        fy = y - 0.5 * max_y + 0.5
        steps = min(max(max_z - z, 0), threads_z)

        # Compile time constants
        start_index = 0
        stop_index = max_p
        norm_j_c = norm_j[start_index:stop_index]
        norm_k_c = norm_k[start_index:stop_index]
        norm_offset_j_c = norm_offset_j[start_index:stop_index]
        norm_offset_k_c = norm_offset_k[start_index:stop_index]
        for ii in range(steps):
            fz = z - 0.5 * max_z + 0.5
            for j in range(channels):
                z_acc[j] = 0.0

            for a in range(start_index, stop_index):
                # Center with respect to projection.
                fj = (norm_offset_j_c[a] + fx * norm_j_c[a][0] +
                      fy * norm_j_c[a][1] + fz * norm_j_c[a][2] - 0.5)
                fk = (norm_offset_k_c[a] + fx * norm_k_c[a][0] +
                      fy * norm_k_c[a][1] + fz * norm_k_c[a][2] - 0.5)
                bilinear_interpolation_projection(projection,
                                                  a, fj, fk, z_acc,
                                                  sparse_pointers[a],
                                                  sparse_weights[a],
                                                  sparse_indices[a],
                                                  sparse_ranges[a])
            for i in range(channels):
                field[x, y, z, i] = z_acc[i]
            z += 1

    bpg = (((max_x + threads_x - 1) // threads_x) *
           ((max_y + threads_y - 1) // threads_y),
           ((max_z + threads_z - 1) // threads_z))
    tpb = (threads_x, threads_y)
    john_transform_adjoint_grid = john_transform_adjoint_inner[bpg, tpb]
    sparse_pointers = cuda.to_device(sparse_pointers_all)
    sparse_weights = cuda.to_device(sparse_weights_all)
    sparse_indices = cuda.to_device(sparse_indices_all)
    sparse_ranges = cuda.to_device(sparse_ranges_all)

    def transform_with_transfer(field: np.ndarray[float],
                                projections: np.ndarray[float]) -> np.ndarray[float]:
        if _is_cuda_array(field):
            assert field.dtype == 'float32'
            device_field = cuda.as_cuda_array(field)
        else:
            assert field.dtype == np.float32
            device_field = cuda.to_device(field)

        if _is_cuda_array(projections):
            assert projections.dtype == 'float32'
            device_projections = cuda.as_cuda_array(projections)
            transfer_field = False
        else:
            assert projections.dtype == np.float32
            device_projections = cuda.to_device(projections)
            transfer_field = True
        john_transform_adjoint_grid(device_field,
                                    device_projections,
                                    sparse_pointers,
                                    sparse_weights,
                                    sparse_indices,
                                    sparse_ranges)
        if transfer_field:
            return device_field.copy_to_host()
        return device_field

    return transform_with_transfer