Source code for mumott.core.john_transform_cuda

from typing import Any

from math import floor
from numba import config, cuda, int32, float32, float64, void, from_dtype
import numpy as np


def _is_cuda_array(item: Any):
    """ Internal method which assists in debugging. """
    if config.ENABLE_CUDASIM:
        return False
    else:
        return cuda.is_cuda_array(item)


[docs]def john_transform_cuda(field: np.ndarray[float], projections: np.ndarray[float],
                        unit_vector_p: np.ndarray[float], unit_vector_j: np.ndarray[float],
                        unit_vector_k: np.ndarray[float], offsets_j: np.ndarray[float],
                        offsets_k: np.ndarray[float]):
    """ Frontend for performing the John transform with parallel
    GPU computing.

    Parameters
    ----------
    field
        The field to be projected, with 4 dimensions. The last index should
        have the same size as the last index of ``projections``. Can be either
        a `numpy.ndarray`, or a device array that implements the CUDA array
        interface. If a device array is given, no copying to device is needed.
    projections
        A 4-dimensional numpy array where the projections are stored.
        The first index runs over the different projection directions. Can be either
        a `numpy.ndarray`, or a device array that implements the CUDA array
        interface. If a device array is given, no copying or synchronization is needed.
    unit_vector_p
        The direction of projection in Cartesian coordinates.
    unit_vector_j
        One of the directions for the pixels of ``projection``.
    unit_vector_k
        The other direction for the pixels of ``projection``.
    offsets_j
        Offsets which align projections in the direction of `j`
    offsets_k
        Offsets which align projections in the direction of `k`.

    Notes
    -----
    For mathematical details, see :func:`~.john_transform.john_transform`.
    """
    # Always false for now as we are sticking to ``float32``.
    if False:
        cuda_float = float64
        numpy_float = np.float64
        cuda_x4_type = cuda.float64x4
    else:
        cuda_float = float32
        numpy_float = np.float32
        cuda_x4_type = cuda.float32x4

    # Find zeroth, first and second directions for indexing. Zeroth is the maximal projection direction.

    # (x, y, z), (y, x, z), (z, x, y)
    direction_0_index = np.argmax(abs(unit_vector_p), axis=1).reshape(-1, 1).astype(np.int32)
    direction_1_index = 1 * (direction_0_index == 0).astype(np.int32).reshape(-1, 1).astype(np.int32)
    direction_2_index = (2 - (direction_0_index == 2)).astype(np.int32).reshape(-1, 1).astype(np.int32)

    # Step size for direction 1 and 2.
    step_sizes_1 = (np.take_along_axis(unit_vector_p, direction_1_index, 1) /
                    np.take_along_axis(unit_vector_p, direction_0_index, 1)).astype(numpy_float).ravel()
    step_sizes_2 = (np.take_along_axis(unit_vector_p, direction_2_index, 1) /
                    np.take_along_axis(unit_vector_p, direction_0_index, 1)).astype(numpy_float).ravel()

    # Shape in each of the three directions.
    dimensions_0 = np.array(field.shape, dtype=numpy_float)[direction_0_index.ravel()]
    dimensions_1 = np.array(field.shape, dtype=numpy_float)[direction_1_index.ravel()]
    dimensions_2 = np.array(field.shape, dtype=numpy_float)[direction_2_index.ravel()]

    # Correction factor for length of line when taking a one-slice step.
    distance_multipliers = np.sqrt(1.0 + step_sizes_1 ** 2 + step_sizes_2 ** 2).astype(numpy_float)

    # Not strictly necessary break these out, it just makes code easier to follow
    max_x, max_y, max_z, channels = field.shape
    number_of_projections, max_j, max_k = projections.shape[:3]

    # These constants control the number of threads/lanes in each kernel in order to
    # optimize memory access patterns. They are partly set from principles
    # such as "sequential threads should access sequential data", and partly set
    # through trial and error and profiling results. Ultimately the goal is
    # to minimize repetetive computations while also optimizing cache access.
    # Note that this differs from optimal access patterns for CPU threads,
    # because CUDA threads are analogous to SIMD lanes, not CPU threads.
    # For cases up to 8 channels, threads_j and threads_kc are used.
    # For cases with larger number of channels, accessing sequential channels
    # dominates performance, and so j and k are lumped together while the
    # channels are separated out.
    # TODO: Make this configurable by the user to allow for autotuning of
    # these parameters.

    # Few channels
    threads_j = int(32 // channels)
    threads_kc = int(8 * channels)
    # Many channels
    CH_CHUNKSIZE = int(min(max(2 ** floor(np.log2(channels)), 1), 8))
    threads_channel = int(min(max(2 ** floor(np.log2(channels / CH_CHUNKSIZE)), 1), 32))
    threads_jk = int(32 // threads_channel)
    JK_CHUNKSIZE = int(1)
    # Check the maximum amount of local memory needed by each thread and resize accordingly
    channel_blocks = (channels + threads_channel * CH_CHUNKSIZE - 1) // (CH_CHUNKSIZE * threads_channel)
    all_threads = np.arange(threads_channel).reshape(-1, 1)
    all_blocks = np.arange(channel_blocks).reshape(1, -1)
    min_channels = all_threads + all_blocks * threads_channel
    channel_strides = channel_blocks * threads_channel
    max_steps = (channels - min_channels + channel_strides - 1) // channel_strides
    # Some numba versions can be a bit dumb with respect to integer types,
    # but python int should always work for e.g. dimension arguments
    ch_max_memory = int(min(CH_CHUNKSIZE, max(max_steps.ravel())))

    # Indices to navigate each projection. s is the surface positioning.
    k_vectors = unit_vector_k.astype(numpy_float)
    j_vectors = unit_vector_j.astype(numpy_float)
    s_vectors = (unit_vector_k * (-0.5 * max_k + offsets_k.reshape(-1, 1)) +
                 unit_vector_j * (-0.5 * max_j + offsets_j.reshape(-1, 1))).astype(numpy_float)

    # (0, 1, 2) if x main, (1, 0, 2) if y main, (2, 0, 1) if z main.
    direction_indices = np.stack((direction_0_index.ravel(),
                                  direction_1_index.ravel(),
                                  direction_2_index.ravel()), axis=1)

    # This record array is functionally a dictionary of lists of parameters for each projection.
    # Note that the list of dictionaries approach did not seem to be supported in numba-CUDA.
    # It is possible to use constants defined outside the kernel instead of such a
    # record array, which can be more efficient, but the amount of
    # global constant storage on the chip is limited, and this leads to hard-to-read implementations.
    # Therefore global constants are only used for simple constants like the maximum dimensions of each
    # array, not for per-projection parameters.
    extra_parameters = np.recarray(
        1,
        dtype=[
            ('step_size', 'f4', (len(projections), 3,)),
            ('k_vector', 'f4', (len(projections), 3,)),
            ('j_vector', 'f4', (len(projections), 3,)),
            ('s_vector', 'f4', (len(projections), 3,)),
            ('dimensions', 'int32', (len(projections), 3,)),
            ('direction_indices', 'int32', (len(projections), 3,)),
            ('distance_multiplier', 'f4', (len(projections),)),
        ])

    extra_parameters[0].step_size[:, 0] = 1.
    extra_parameters[0].step_size[:, 1] = step_sizes_1
    extra_parameters[0].step_size[:, 2] = step_sizes_2
    extra_parameters[0].k_vector[:] = k_vectors
    extra_parameters[0].j_vector[:] = j_vectors
    extra_parameters[0].s_vector[:] = s_vectors
    extra_parameters[0].dimensions[:, 0] = dimensions_0
    extra_parameters[0].dimensions[:, 1] = dimensions_1
    extra_parameters[0].dimensions[:, 2] = dimensions_2
    extra_parameters[0].direction_indices[:] = direction_indices
    extra_parameters[0].distance_multiplier[:] = distance_multipliers
    # Fetch dtype of parameters to enable eager compilation.
    record_type = from_dtype(extra_parameters.dtype)
    extra_parameters = cuda.to_device(extra_parameters[0])

    @cuda.jit(device=True, lineinfo=True, fastmath=True, cache=True)
    def bilinear_interpolation(field: np.ndarray[float],
                               directions,
                               positions,
                               dimensions,
                               coordinate_directions,
                               accumulator: np.ndarray[float],
                               min_channel,
                               channel_count,
                               channel_stride):
        """ Kernel for bilinear interpolation. Replaces texture interpolation."""
        # Precompute booleans to ensure control over dtypes, sometimes CUDA makes
        # strange casting decisions unless one is very explicit.
        p1_max_index = dimensions[1] - 1
        p2_max_index = dimensions[2] - 1
        p1_ge_0 = positions[1] >= cuda_float(0)
        p2_ge_0 = positions[2] >= cuda_float(0)
        p1_lt_max_index = positions[1] < cuda_float(p1_max_index)
        p2_lt_max_index = positions[2] < cuda_float(p2_max_index)
        weight_1 = positions[1] - floor(positions[1])
        weight_2 = positions[2] - floor(positions[2])
        weight_1_comp = cuda_float(1) - weight_1
        weight_2_comp = cuda_float(1) - weight_2
        # Bilinear interpolation weights. Uses
        # texture-style edge handling where everything out of bounds is 0.
        t = cuda_x4_type(weight_1_comp * weight_2_comp * cuda_float(p1_ge_0 * p2_ge_0),
                         weight_2_comp * weight_1 * cuda_float(p1_lt_max_index * p2_ge_0),
                         weight_1_comp * weight_2 * cuda_float(p2_lt_max_index * p1_ge_0),
                         weight_1 * weight_2 * cuda_float(p1_lt_max_index * p2_lt_max_index))

        # x0 = x1 if and only if x is the main direction of interation.
        x0 = max(int32(floor(positions[coordinate_directions.x])), 0)
        x1 = x0 + (directions.x != 0) * (x0 < (max_x - 1))

        # y3 is different from y0 when y is not the main direction,
        # but y1 and y2 depend on whether the main direction is
        # x or z.
        y0 = max(int32(floor(positions[coordinate_directions.y])), 0)
        y0_lt_max_index = y0 < (max_y - 1)
        y1 = y0 + (directions.x == 0) * y0_lt_max_index
        y2 = y0 + (directions.x == 2) * y0_lt_max_index
        y3 = y0 + (directions.x != 1) * y0_lt_max_index

        # z0 and z1 work like x0. This is because in the direction tuples
        # (x, y, z), (y, x, z), (z, x, y), y can have any position,
        # but x and z can only be in the positions (0, 1) and (0, 2), respectively.
        z0 = max(int32(floor(positions[coordinate_directions.z])), 0)
        z1 = z0 + (directions.x != 2) * (z0 < (max_z - 1))

        # Bounds checking not required, already taken care of.
        # Explicit bounds checking is marginally more efficient for moderate
        # channel numbers, less efficient for large channel numbers.
        for i in range(channel_count):
            ci = min_channel + i * channel_stride
            accumulator[i] += field[x0, y0, z0, ci] * t.x
            accumulator[i] += field[x1, y1, z0, ci] * t.y
            accumulator[i] += field[x0, y2, z1, ci] * t.z
            accumulator[i] += field[x1, y3, z1, ci] * t.w

    @cuda.jit(void(cuda_float[:, :, :, ::1], cuda_float[:, :, :, ::1], record_type, int32[::1]),
              lineinfo=True, fastmath=True, cache=True)
    def john_transform_inner(field: np.ndarray[float],
                             projection: np.ndarray[float],
                             parameters: np.ndarray,
                             frames):
        """ Performs the John transform of a field. Relies on some
        of pre-defined constants outside the kernel body. """

        # The strided access to channels is complicated.
        # Each thread reads at most ch_max_memory channels, separated by
        # n_threads * n_blocks.
        # For example, for 150 channels, calculation rules would give
        # chunksize 8, 16 threads, and thus 2 blocks.
        # Thread 0 in block 0 then reads channels 0, 32, 64, 96, and 128. (channel_count 5)
        # Thread 1 in block 0 reads 1, 33, 65, 97, and 129. (channel_count 5)
        # Thread 0 in block 1 reads 16, 48, 80, 112, 144. (channel_count 5)
        # Thread 5 in block 1 reads 22, 54, 86, 118. (channel_count 4)
        # In the end we can optimize away 3 CH_CHUNKSIZE when storing memory.
        # 8 - 3 = 5 equals the smallest chunksize that would yield the same number of blocks,
        # this is ch_max_memory.
        min_channel = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
        channel_stride = cuda.gridDim.x * cuda.blockDim.x
        max_steps = (channels - min_channel + channel_stride - 1) // channel_stride
        channel_count = min(ch_max_memory, max_steps)
        if min_channel >= channels:
            return

        # We split the frames into groups with the same main direction, so we need an index array.
        frame_index = cuda.threadIdx.z + cuda.blockDim.z * cuda.blockIdx.z
        frame = frames[frame_index]

        # These are all constants but passed as arguments.
        step_size_1 = parameters['step_size'][frame, 1]
        step_size_2 = parameters['step_size'][frame, 2]
        dimensions = parameters['dimensions'][frame]
        direction_indices = cuda.uint32x3(
            parameters['direction_indices'][frame][0],
            parameters['direction_indices'][frame][1],
            parameters['direction_indices'][frame][2])
        distance_multiplier = parameters['distance_multiplier'][frame]
        coordinate_directions = cuda.uint32x3(
            direction_indices.x != 0,
            (direction_indices.x == 0) + int32(2) * (direction_indices.x == 2),
            int32(2) * (direction_indices.x != 2))
        positions = cuda.local.array(3, cuda_float)

        # It could be advantageous to chunk in the projection direction but
        # this complicates writing to the output by adding race conditions.
        start_slice = int32(0)
        end_slice = start_slice + dimensions[0]
        accumulator = cuda.local.array(ch_max_memory, cuda_float)

        # Channel dimension can be small, k may be near-contiguous, so aim for colaesced reading
        jk = cuda.threadIdx.y + cuda.blockDim.y * cuda.blockIdx.y
        jk_stride = cuda.blockDim.y * cuda.gridDim.y
        total_elements = max_j * max_k
        # JK_CHUNKSIZE = 1 so this is not actually iterated over currently.
        for group_id in range(JK_CHUNKSIZE):
            element_id = jk + group_id * jk_stride
            if element_id >= total_elements:
                break
            k = element_id % max_k
            j = element_id // max_k

            # No need to check k since a % max_k < max_k
            if j >= max_j:
                break

            for i in range(ch_max_memory):
                accumulator[i] = cuda_float(0.)

            # A lot of explicit casting is necessary to avoid float64 and similar
            fj = cuda_float(j) + cuda_float(0.5)
            fk = cuda_float(k) + cuda_float(0.5)

            # Initial coordinates using alignment etc.
            start_position_0 = cuda_float(parameters['s_vector'][frame, direction_indices.x] +
                                          fj * parameters['j_vector'][frame, direction_indices.x] +
                                          fk * parameters['k_vector'][frame, direction_indices.x])
            start_position_1 = cuda_float(parameters['s_vector'][frame, direction_indices.y] +
                                          fj * parameters['j_vector'][frame, direction_indices.y] +
                                          fk * parameters['k_vector'][frame, direction_indices.y])
            start_position_2 = cuda_float(parameters['s_vector'][frame, direction_indices.z] +
                                          fj * parameters['j_vector'][frame, direction_indices.z] +
                                          fk * parameters['k_vector'][frame, direction_indices.z])

            # Centering of coordinate system w.r.t volume and direction.
            # Again explicit casting is needed.
            positions[0] = cuda_float(start_slice) + cuda_float(0.5)
            positions[1] = cuda_float(step_size_1 * (cuda_float(start_slice) -
                                                     cuda_float(0.5) * cuda_float(dimensions[0]) -
                                                     start_position_0 + cuda_float(0.5)) +
                                      start_position_1 + cuda_float(0.5) * cuda_float(dimensions[1]) -
                                      cuda_float(0.5))
            positions[2] = cuda_float(step_size_2 * (cuda_float(start_slice) -
                                                     cuda_float(0.5) * cuda_float(dimensions[0]) -
                                                     start_position_0 + cuda_float(0.5)) +
                                      start_position_2 + cuda_float(0.5) * cuda_float(dimensions[2]) -
                                      cuda_float(0.5))

            for i in range(start_slice, end_slice):
                if ((cuda_float(-1) < positions[1] < cuda_float(dimensions[1])) and
                        (cuda_float(-1) < positions[2] < cuda_float(dimensions[2]))):
                    bilinear_interpolation(field, direction_indices,
                                           positions, dimensions,
                                           coordinate_directions,
                                           accumulator, min_channel,
                                           channel_count,
                                           channel_stride)
                positions[0] += cuda_float(1.0)
                positions[1] += step_size_1
                positions[2] += step_size_2

            for i in range(channel_count):
                ci = min_channel + i * channel_stride
                projection[frame, j, k, ci] = accumulator[i] * distance_multiplier

    @cuda.jit(device=True, lineinfo=True, fastmath=True, cache=True)
    def bilinear_interpolation_few_channels(field: np.ndarray[float],
                                            directions,
                                            positions,
                                            dimensions,
                                            coordinate_directions,
                                            channel_index):
        """ Kernel for bilinear interpolation. Replaces texture interpolation."""
        # See many-channels version for explanation.
        p1_max_index = dimensions[1] - 1
        p2_max_index = dimensions[2] - 1
        p1_ge_0 = positions[1] >= cuda_float(0)
        p2_ge_0 = positions[2] >= cuda_float(0)
        p1_lt_max_index = positions[1] < cuda_float(p1_max_index)
        p2_lt_max_index = positions[2] < cuda_float(p2_max_index)
        weight_1 = positions[1] - floor(positions[1])
        weight_2 = positions[2] - floor(positions[2])
        weight_1_comp = cuda_float(1) - weight_1
        weight_2_comp = cuda_float(1) - weight_2
        t = cuda_x4_type(weight_1_comp * weight_2_comp * cuda_float(p1_ge_0 * p2_ge_0),
                         weight_2_comp * weight_1 * cuda_float(p1_lt_max_index * p2_ge_0),
                         weight_1_comp * weight_2 * cuda_float(p2_lt_max_index * p1_ge_0),
                         weight_1 * weight_2 * cuda_float(p1_lt_max_index * p2_lt_max_index))

        x0 = max(int32(floor(positions[coordinate_directions.x])), 0)
        x1 = x0 + (directions.x != 0) * (x0 < (max_x - 1))

        y0 = max(int32(floor(positions[coordinate_directions.y])), 0)
        y0_lt_max_index = y0 < (max_y - 1)
        y1 = y0 + (directions.x == 0) * y0_lt_max_index
        y2 = y0 + (directions.x == 2) * y0_lt_max_index
        y3 = y0 + (directions.x != 1) * y0_lt_max_index

        z0 = max(int32(floor(positions[coordinate_directions.z])), 0)
        z1 = z0 + (directions.x != 2) * (z0 < (max_z - 1))
        # Bounds checking not required, already taken care of
        return (field[x0, y0, z0, channel_index] * t.x + field[x1, y1, z0, channel_index] * t.y +
                field[x0, y2, z1, channel_index] * t.z + field[x1, y3, z1, channel_index] * t.w)

    @cuda.jit(void(cuda_float[:, :, :, ::1], cuda_float[:, :, :, ::1], record_type, int32[::1]),
              lineinfo=True, fastmath=True, cache=True)
    def john_transform_inner_few_channels(field: np.ndarray[float],
                                          projection: np.ndarray[float],
                                          parameters: np.ndarray,
                                          frames):
        """ Performs the John transform of a field. Relies on a large number
        of pre-defined constants outside the kernel body. """
        # First block similar to multi-channel version.
        frame_index = cuda.threadIdx.z + cuda.blockDim.z * cuda.blockIdx.z
        frame = frames[frame_index]
        step_size_1 = parameters['step_size'][frame, 1]
        step_size_2 = parameters['step_size'][frame, 2]
        dimensions = parameters['dimensions'][frame]
        direction_indices = cuda.uint32x3(
            parameters['direction_indices'][frame][0],
            parameters['direction_indices'][frame][1],
            parameters['direction_indices'][frame][2])
        distance_multiplier = parameters['distance_multiplier'][frame]
        coordinate_directions = cuda.uint32x3(
            direction_indices.x != 0,
            (direction_indices.x == 0) + 2 * (direction_indices.x == 2),
            2 * (direction_indices.x != 2))
        positions = cuda.local.array(3, cuda_float)
        start_slice = 0
        end_slice = start_slice + dimensions[0]
        # Use a common index for channels and k to pack thread reads/writes densely.
        j = cuda.threadIdx.y + cuda.blockDim.y * cuda.blockIdx.y
        kc = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
        channel_index = kc % channels
        k = kc // channels
        if (j >= max_j) or (k >= max_k):
            return

        # See many-channels version
        fj = cuda_float(j) + cuda_float(0.5)
        fk = cuda_float(k) + cuda_float(0.5)

        # Initial coordinates of projection.
        start_position_0 = cuda_float(parameters['s_vector'][frame, direction_indices.x] +
                                      fj * parameters['j_vector'][frame, direction_indices.x] +
                                      fk * parameters['k_vector'][frame, direction_indices.x])
        start_position_1 = cuda_float(parameters['s_vector'][frame, direction_indices.y] +
                                      fj * parameters['j_vector'][frame, direction_indices.y] +
                                      fk * parameters['k_vector'][frame, direction_indices.y])
        start_position_2 = cuda_float(parameters['s_vector'][frame, direction_indices.z] +
                                      fj * parameters['j_vector'][frame, direction_indices.z] +
                                      fk * parameters['k_vector'][frame, direction_indices.z])

        # Centering w.r.t volume.
        positions[0] = cuda_float(start_slice) + cuda_float(0.5)
        positions[1] = cuda_float(step_size_1 * (cuda_float(start_slice) -
                                                 cuda_float(0.5) * cuda_float(dimensions[0]) -
                                                 start_position_0 + cuda_float(0.5)) +
                                  start_position_1 + cuda_float(0.5) * cuda_float(dimensions[1]) -
                                  cuda_float(0.5))
        positions[2] = cuda_float(step_size_2 * (cuda_float(start_slice) -
                                                 cuda_float(0.5) * cuda_float(dimensions[0]) -
                                                 start_position_0 + cuda_float(0.5)) +
                                  start_position_2 + cuda_float(0.5) * cuda_float(dimensions[2]) -
                                  cuda_float(0.5))
        accumulator = cuda_float(0)
        for i in range(start_slice, end_slice):
            if ((cuda_float(-1) < positions[1] < cuda_float(dimensions[1])) and
                    (cuda_float(-1) < positions[2] < cuda_float(dimensions[2]))):
                accumulator += bilinear_interpolation_few_channels(
                    field, direction_indices,
                    positions, dimensions,
                    coordinate_directions,
                    channel_index)
            positions[0] += cuda_float(1.0)
            positions[1] += step_size_1
            positions[2] += step_size_2

        projection[frame, j, k, channel_index] = accumulator * distance_multiplier

    # Divide all projections according to main direction
    frames = list(range(number_of_projections))
    frame_sets = [
        np.array(
            list(filter(lambda x: direction_0_index[x] == i, frames)), dtype=np.int32)
        for i in range(3)]

    john_transform_list = []
    for ii in range(3):
        if len(frame_sets[ii]) == 0:
            john_transform_list.append(None)
            continue
        # Calculate number of blocks based on number of frames
        if channels <= 8:
            tpb = (threads_kc, threads_j, 1)
            bpg = ((max_k * channels + threads_kc - 1) // (threads_kc),
                   (max_j + threads_j - 1) // threads_j,
                   len(frame_sets[ii]))
            john_transform_list.append(john_transform_inner_few_channels[bpg, tpb])
        else:
            tpb = (threads_channel, threads_jk, 1)
            bpg = (((channels + (CH_CHUNKSIZE * threads_channel) - 1) //
                   (threads_channel * CH_CHUNKSIZE)),
                   (max_j * max_k + threads_jk * JK_CHUNKSIZE - 1) // (JK_CHUNKSIZE * threads_jk),
                   len(frame_sets[ii]))
            john_transform_list.append(john_transform_inner[bpg, tpb])

    def transform_with_transfer(field: np.ndarray[float],
                                projections: np.ndarray[float]) -> np.ndarray[float]:
        if _is_cuda_array(field):
            assert field.dtype == 'float32'
            device_field = cuda.as_cuda_array(field)
            transfer_projections = False
        else:
            assert field.dtype == np.float32
            device_field = cuda.to_device(field)
            transfer_projections = True

        if _is_cuda_array(projections):
            assert projections.dtype == 'float32'
            device_projections = cuda.as_cuda_array(projections)
        else:
            assert projections.dtype == np.float32
            device_projections = cuda.to_device(projections)

        for ii in range(3):
            if john_transform_list[ii] is not None:
                john_transform_list[ii](device_field,
                                        device_projections,
                                        extra_parameters,
                                        frame_sets[ii])
        if transfer_projections:
            return device_projections.copy_to_host()
        return device_projections
    return transform_with_transfer


[docs]def john_transform_adjoint_cuda(field: np.ndarray[float], projections: np.ndarray[float],
                                unit_vector_p: np.ndarray[float], unit_vector_j: np.ndarray[float],
                                unit_vector_k: np.ndarray[float], offsets_j: np.ndarray[float],
                                offsets_k: np.ndarray[float]):
    """ Frontend for performing the adjoint of the John transform with parallel
    GPU computing.

    Parameters
    ----------
    field
        The field into which the adjoint is projected, with 4 dimensions. The last index should
        have the same size as the last index of ``projections``. Can be either
        a `numpy.ndarray`, or a device array that implements the CUDA array
        interface. If a device array is given, no copying to device is needed.
    projections
        The projections from which the adjoint is calculated.
        The first index runs over the different projection directions. Can be either
        a `numpy.ndarray`, or a device array that implements the CUDA array
        interface. If a device array is given, no copying or synchronization is needed.
    unit_vector_p
        The direction of projection in Cartesian coordinates.
    unit_vector_j
        One of the directions for the pixels of ``projection``.
    unit_vector_k
        The other direction for the pixels of ``projection``.
    offsets_j
        Offsets which align projections in the direction of `j`
    offsets_k
        Offsets which align projections in the direction of `k`.

    Notes
    -----
    For mathematical details, see :func:`~.john_transform.john_transform_adjoint`.
    """
    # Always false as we are sticking to float32
    if False:
        cuda_float = float64
        numpy_float = np.float64
        cuda_x4_type = cuda.float64x4
    else:
        cuda_float = float32
        numpy_float = np.float32
        cuda_x4_type = cuda.float32x4

    max_x, max_y, max_z, channels = field.shape
    number_of_projections, max_j, max_k = projections.shape[:3]

    # CUDA chunking and memory size constants. See non-adjoint for general explanation.
    # Here the single-channel version has a storage vector and strides in Z, whereas
    # the multi-channel version strides in channels.
    if channels == 1:
        threads_channel = int(1)
        threads_z = int(16)
        threads_xy = int(4)
        X_CHUNKSIZE = int(1)
        Y_CHUNKSIZE = int(1)
        Z_CHUNKSIZE = int(4)
        CH_CHUNKSIZE = int(1)
        ch_max_memory = int(1)
    else:
        CH_CHUNKSIZE = min(max(2 ** floor(np.log2(channels // 2)), 1), 8)
        threads_channel = int(min(max(2 ** floor(np.log2(channels / CH_CHUNKSIZE)), 1), 32))
        threads_z = int(max(32 // threads_channel, 4))
        threads_xy = int(1)
        X_CHUNKSIZE = int(1)
        Y_CHUNKSIZE = int(1)
        Z_CHUNKSIZE = int(2)
        channel_blocks = (channels + threads_channel * CH_CHUNKSIZE - 1) // (CH_CHUNKSIZE * threads_channel)
        all_threads = np.arange(threads_channel).reshape(-1, 1)
        all_blocks = np.arange(channel_blocks).reshape(1, -1)
        min_channels = all_threads + all_blocks * threads_channel
        channel_strides = channel_blocks * threads_channel
        max_steps = (channels - min_channels + channel_strides - 1) // channel_strides
        # Some numba versions can be a bit dumb with respect to integer types,
        # but python int should always work for e.g. dimension arguments
        ch_max_memory = int(min(CH_CHUNKSIZE, max(max_steps.ravel())))
    # Projection vectors. s for positioning the projection.
    p_vectors = unit_vector_p.astype(numpy_float)
    k_vectors = unit_vector_k.astype(numpy_float)
    j_vectors = unit_vector_j.astype(numpy_float)
    s_vectors = (unit_vector_k * (-0.5 * max_k + offsets_k.reshape(-1, 1)) +
                 unit_vector_j * (-0.5 * max_j + offsets_j.reshape(-1, 1))).astype(numpy_float)

    # Translate volume steps to normalized projection steps. Can add support for non-square voxels.
    vector_norm = np.einsum('...i, ...i', p_vectors, np.cross(j_vectors, k_vectors))
    norm_j = -np.cross(p_vectors, k_vectors) / vector_norm[..., None]
    norm_k = np.cross(p_vectors, j_vectors) / vector_norm[..., None]
    norm_offset_j = -np.einsum('...i, ...i', p_vectors, np.cross(s_vectors, k_vectors)) / vector_norm
    norm_offset_k = np.einsum('...i, ...i', p_vectors, np.cross(s_vectors, j_vectors)) / vector_norm

    # See forward version for explanation. Adjoint faces bigger issues with global constants since
    # each kernel can potentially end up copying all of the frame-wise values.
    extra_parameters = np.recarray(
        1,
        dtype=[
            ('norm_j', 'f4', (len(projections), 3,)),
            ('norm_k', 'f4', (len(projections), 3,)),
            ('norm_offset_j', 'f4', (len(projections),)),
            ('norm_offset_k', 'f4', (len(projections),)),])

    extra_parameters[0].norm_j = norm_j
    extra_parameters[0].norm_k = norm_k
    extra_parameters[0].norm_offset_j = norm_offset_j
    extra_parameters[0].norm_offset_k = norm_offset_k
    record_type = from_dtype(extra_parameters.dtype)
    extra_parameters = cuda.to_device(extra_parameters[0])

    @cuda.jit(device=True,
              lineinfo=True, fastmath=True, cache=True)
    def bilinear_interpolation_projection(projection: np.ndarray[float],
                                          index, fj, fk,
                                          accumulator: np.ndarray[float],
                                          min_channel, channel_stride, channel_count):
        """ Bilinearly interpolates between 4 neighbouring pixels on each projection.
        """
        # Functions similar to forward bilinear interpolation, but is simpler
        # since we do not need to consider directions of integration, as the
        # integral is always over all projections.
        j_max_ind = max_j - 1
        k_max_ind = max_k - 1
        fj_ge_0 = fj >= cuda_float(0)
        fk_ge_0 = fk >= cuda_float(0)
        fj_lt_max_ind = fj < cuda_float(j_max_ind)
        fk_lt_max_ind = fk < cuda_float(k_max_ind)
        y_weight = fj - floor(fj)
        z_weight = fk - floor(fk)
        y0 = max(int32(floor(fj)), 0)
        y1 = y0 + (y0 < j_max_ind)
        z0 = max(int32(floor(fk)), 0)
        z1 = z0 + (z0 < k_max_ind)
        y_weight_complement = cuda_float(1) - y_weight
        z_weight_complement = cuda_float(1) - z_weight
        t = cuda_x4_type(
            y_weight_complement * z_weight_complement * cuda_float(fj_ge_0 * fk_ge_0),
            z_weight_complement * y_weight * cuda_float(fj_lt_max_ind * fk_ge_0),
            y_weight_complement * z_weight * cuda_float(fk_lt_max_ind * fj_ge_0),
            z_weight * y_weight * cuda_float(fj_lt_max_ind * fk_lt_max_ind))
        for i in range(channel_count):
            ci = min_channel + i * channel_stride
            accumulator[i] += (projection[index, y0, z0, ci] * t.x +
                               projection[index, y1, z0, ci] * t.y +
                               projection[index, y0, z1, ci] * t.z +
                               projection[index, y1, z1, ci] * t.w)

    @cuda.jit(void(cuda_float[:, :, :, ::1], cuda_float[:, :, :, ::1], record_type),
              lineinfo=True, fastmath=True, cache=True)
    def john_transform_adjoint_inner(field: np.ndarray[float],
                                     projection: np.ndarray[float],
                                     parameters: np.ndarray):
        """ Performs the John transform of a field. Relies some pre-defined
        constants outside the kernel body. """
        # See forward version for explanation.
        min_channel = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
        channel_stride = cuda.gridDim.x * cuda.blockDim.x
        max_steps = (channels - min_channel + channel_stride - 1) // channel_stride
        channel_count = min(ch_max_memory, max_steps)
        if min_channel >= channels:
            return
        # x and y are not contiguous, no need to do fancy indexing
        xy = (cuda.blockIdx.z * cuda.blockDim.z + cuda.threadIdx.z)
        y_num_chunks = (max_y + Y_CHUNKSIZE - 1) // Y_CHUNKSIZE
        start_y = (xy % y_num_chunks) * Y_CHUNKSIZE
        end_y = min(start_y + Y_CHUNKSIZE, max_y)
        if start_y >= max_y:
            return
        start_x = (xy // y_num_chunks) * X_CHUNKSIZE
        end_x = min(start_x + X_CHUNKSIZE, max_x)
        if start_x >= max_x:
            return
        # There are tradeoffs between read-write access and amount of local memory needed
        # in structuring this array. Since moving between projections requires a
        # lot of computation it seems that accumulating values for several Z is helpful.
        # Could try out shared memory and 1 block per frame, but probably not better.
        z_acc = cuda.local.array((Z_CHUNKSIZE, ch_max_memory), cuda_float)
        z = cuda.threadIdx.y + cuda.blockDim.y * cuda.blockIdx.y
        z_stride = cuda.gridDim.y * cuda.blockDim.y
        if z >= max_z:
            return

        start_index = 0
        stop_index = number_of_projections
        # Coordinates in projection space relative to xyz-space.
        norm_j = parameters['norm_j'][start_index:stop_index]
        norm_k = parameters['norm_k'][start_index:stop_index]
        norm_offset_j = parameters['norm_offset_j'][start_index:stop_index]
        norm_offset_k = parameters['norm_offset_k'][start_index:stop_index]
        for x in range(start_x, end_x):
            for y in range(start_y, end_y):
                # Center volume.
                fx = cuda_float(x) - cuda_float(0.5) * cuda_float(max_x) + cuda_float(0.5)
                fy = cuda_float(y) - cuda_float(0.5) * cuda_float(max_y) + cuda_float(0.5)
                fz = cuda_float(z) - cuda_float(0.5) * cuda_float(max_z) + cuda_float(0.5)
                for j in range(Z_CHUNKSIZE):
                    for k in range(channel_count):
                        z_acc[j, k] = cuda_float(0.0)

                for a in range(start_index, stop_index):
                    # Center with respect to each projection direction and alignment.
                    fj = (norm_offset_j[a] + fx * norm_j[a][0] +
                          fy * norm_j[a][1] + fz * norm_j[a][2] - cuda_float(0.5))
                    fk = (norm_offset_k[a] + fx * norm_k[a][0] +
                          fy * norm_k[a][1] + fz * norm_k[a][2] - cuda_float(0.5))
                    for j in range(Z_CHUNKSIZE):
                        # Striding in z is cheaper than striding across projections
                        if (cuda_float(-1) < fj < cuda_float(max_j) and
                                cuda_float(-1) < fk < cuda_float(max_k)):
                            bilinear_interpolation_projection(projection,
                                                              a, fj, fk, z_acc[j],
                                                              min_channel, channel_stride, channel_count)
                        fj += norm_j[a][2] * cuda_float(z_stride)
                        fk += norm_k[a][2] * cuda_float(z_stride)
                for j in range(Z_CHUNKSIZE):
                    # Some threads might be at the very edge.
                    if z + j * z_stride >= max_z:
                        break
                    for k in range(channel_count):
                        field[x, y, z + j * z_stride, min_channel + k * channel_stride] = z_acc[j, k]

    @cuda.jit(device=True,
              lineinfo=True, fastmath=True, cache=True)
    def bilinear_interpolation_projection_one_channel(projection: np.ndarray[float],
                                                      index, fj, fk,
                                                      accumulator: np.ndarray[float],
                                                      i: int):
        # See many-channel and forward versions.
        j_max_ind = max_j - 1
        k_max_ind = max_k - 1
        fj_ge_0 = fj >= cuda_float(0)
        fk_ge_0 = fk >= cuda_float(0)
        fj_lt_max_ind = fj < cuda_float(j_max_ind)
        fk_lt_max_ind = fk < cuda_float(k_max_ind)
        y_weight = fj - floor(fj)
        z_weight = fk - floor(fk)
        y0 = max(int32(floor(fj)), 0)
        y1 = y0 + (y0 < j_max_ind)
        z0 = max(int32(floor(fk)), 0)
        z1 = z0 + (z0 < k_max_ind)
        y_weight_complement = cuda_float(1) - y_weight
        z_weight_complement = cuda_float(1) - z_weight
        t = cuda_x4_type(
            y_weight_complement * z_weight_complement * cuda_float(fj_ge_0 * fk_ge_0),
            z_weight_complement * y_weight * cuda_float(fj_lt_max_ind * fk_ge_0),
            y_weight_complement * z_weight * cuda_float(fk_lt_max_ind * fj_ge_0),
            z_weight * y_weight * cuda_float(fj_lt_max_ind * fk_lt_max_ind))
        accumulator[i] += (projection[index, y0, z0, 0] * t.x +
                           projection[index, y1, z0, 0] * t.y +
                           projection[index, y0, z1, 0] * t.z +
                           projection[index, y1, z1, 0] * t.w)

    @cuda.jit(void(cuda_float[:, :, :, ::1], cuda_float[:, :, :, ::1], record_type),
              lineinfo=True, fastmath=True, cache=True)
    def john_transform_adjoint_one_channel(field: np.ndarray[float],
                                           projection: np.ndarray[float],
                                           parameters: np.ndarray):
        """ Performs the John transform of a single-channel field."""

        # This entire function is very similar to the multi-channel one, except that
        # striding happens only in Z.
        z = cuda.threadIdx.y + cuda.blockDim.y * cuda.blockIdx.y
        z_stride = cuda.gridDim.y * cuda.blockDim.y
        xy = (cuda.blockIdx.z * cuda.blockDim.z + cuda.threadIdx.z)
        y_num_chunks = (max_y + Y_CHUNKSIZE - 1) // Y_CHUNKSIZE
        start_y = (xy % y_num_chunks) * Y_CHUNKSIZE
        end_y = min(start_y + Y_CHUNKSIZE, max_y)
        if start_y >= max_y:
            return
        start_x = (xy // y_num_chunks) * X_CHUNKSIZE
        end_x = min(start_x + X_CHUNKSIZE, max_x)
        if start_x >= max_x:
            return
        # Stride in z.
        z_acc = cuda.local.array(Z_CHUNKSIZE, cuda_float)
        start_index = 0
        stop_index = number_of_projections
        norm_j = parameters['norm_j'][start_index:stop_index]
        norm_k = parameters['norm_k'][start_index:stop_index]
        norm_offset_j = parameters['norm_offset_j'][start_index:stop_index]
        norm_offset_k = parameters['norm_offset_k'][start_index:stop_index]
        for x in range(start_x, end_x):
            for y in range(start_y, end_y):
                # Center of voxel and coordinate system.
                fx = cuda_float(x) - cuda_float(0.5) * cuda_float(max_x) + cuda_float(0.5)
                fy = cuda_float(y) - cuda_float(0.5) * cuda_float(max_y) + cuda_float(0.5)
                fz = cuda_float(z) - cuda_float(0.5) * cuda_float(max_z) + cuda_float(0.5)
                for j in range(Z_CHUNKSIZE):
                    z_acc[j] = 0.0
                for a in range(start_index, stop_index):
                    # Center with respect to projection.
                    fj = (norm_offset_j[a] + fx * norm_j[a][0] + fy *
                          norm_j[a][1] + fz * norm_j[a][2] - cuda_float(0.5))
                    fk = (norm_offset_k[a] + fx * norm_k[a][0] + fy *
                          norm_k[a][1] + fz * norm_k[a][2] - cuda_float(0.5))
                    for i in range(Z_CHUNKSIZE):
                        if (cuda_float(-1) < fj < cuda_float(max_j) and
                                cuda_float(-1) < fk < cuda_float(max_k)):
                            bilinear_interpolation_projection_one_channel(
                                projection,
                                a, fj, fk,
                                z_acc, i)
                        fj += norm_j[a][2] * cuda_float(z_stride)
                        fk += norm_k[a][2] * cuda_float(z_stride)
                for i in range(Z_CHUNKSIZE):
                    if z + i * z_stride < max_z:
                        field[x, y, z + i * z_stride, 0] = z_acc[i]
    num_chunks_x = (max_x + X_CHUNKSIZE - 1) // X_CHUNKSIZE
    num_chunks_y = (max_y + Y_CHUNKSIZE - 1) // Y_CHUNKSIZE
    xy_chunks = num_chunks_x * num_chunks_y
    bpg = ((channels + threads_channel * CH_CHUNKSIZE - 1) // (CH_CHUNKSIZE * threads_channel),
           (max_z + Z_CHUNKSIZE * threads_z - 1) // (threads_z * Z_CHUNKSIZE),
           (xy_chunks + threads_xy - 1) // threads_xy,
           )
    tpb = (threads_channel, threads_z, threads_xy)
    if channels > 1:
        john_transform_adjoint_grid = john_transform_adjoint_inner[bpg, tpb]
    else:
        john_transform_adjoint_grid = john_transform_adjoint_one_channel[bpg, tpb]

    def transform_with_transfer(field: np.ndarray[float],
                                projections: np.ndarray[float]) -> np.ndarray[float]:
        if _is_cuda_array(field):
            assert field.dtype == 'float32'
            device_field = cuda.as_cuda_array(field)
        else:
            assert field.dtype == np.float32
            device_field = cuda.to_device(field)

        if _is_cuda_array(projections):
            assert projections.dtype == 'float32'
            device_projections = cuda.as_cuda_array(projections)
            transfer_field = False
        else:
            assert projections.dtype == np.float32
            device_projections = cuda.to_device(projections)
            transfer_field = True
        john_transform_adjoint_grid(device_field,
                                    device_projections,
                                    extra_parameters)
        if transfer_field:
            return device_field.copy_to_host()
        return device_field

    return transform_with_transfer