Source code for mumott.core.john_transform_cuda
from typing import Any
from math import floor
from numba import config, cuda, int32, float32, float64, void, from_dtype
import numpy as np
def _is_cuda_array(item: Any):
""" Internal method which assists in debugging. """
if config.ENABLE_CUDASIM:
return False
else:
return cuda.is_cuda_array(item)
def john_transform_cuda(field: np.ndarray[float], projections: np.ndarray[float],
unit_vector_p: np.ndarray[float], unit_vector_j: np.ndarray[float],
unit_vector_k: np.ndarray[float], offsets_j: np.ndarray[float],
offsets_k: np.ndarray[float]):
""" Frontend for performing the John transform with parallel
GPU computing.
Parameters
----------
field
The field to be projected, with 4 dimensions. The last index should
have the same size as the last index of ``projections``. Can be either
a `numpy.ndarray`, or a device array that implements the CUDA array
interface. If a device array is given, no copying to device is needed.
projections
A 4-dimensional numpy array where the projections are stored.
The first index runs over the different projection directions. Can be either
a `numpy.ndarray`, or a device array that implements the CUDA array
interface. If a device array is given, no copying or synchronization is needed.
unit_vector_p
The direction of projection in Cartesian coordinates.
unit_vector_j
One of the directions for the pixels of ``projection``.
unit_vector_k
The other direction for the pixels of ``projection``.
offsets_j
Offsets which align projections in the direction of `j`
offsets_k
Offsets which align projections in the direction of `k`.
Notes
-----
For mathematical details, see :func:`~.john_transform.john_transform`.
"""
# Always false for now as we are sticking to ``float32``.
if False:
cuda_float = float64
numpy_float = np.float64
cuda_x4_type = cuda.float64x4
else:
cuda_float = float32
numpy_float = np.float32
cuda_x4_type = cuda.float32x4
# Find zeroth, first and second directions for indexing. Zeroth is the maximal projection direction.
# (x, y, z), (y, x, z), (z, x, y)
direction_0_index = np.argmax(abs(unit_vector_p), axis=1).reshape(-1, 1).astype(np.int32)
direction_1_index = 1 * (direction_0_index == 0).astype(np.int32).reshape(-1, 1).astype(np.int32)
direction_2_index = (2 - (direction_0_index == 2)).astype(np.int32).reshape(-1, 1).astype(np.int32)
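    # Worked example (illustration only): unit_vector_p = [[0.1, 0.9, 0.2]]
    # gives direction_0_index = [[1]] (y is the main direction), and thus
    # direction_1_index = [[0]] and direction_2_index = [[2]], i.e. the
    # (y, x, z) ordering from the comment above.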
# Step size for direction 1 and 2.
step_sizes_1 = (np.take_along_axis(unit_vector_p, direction_1_index, 1) /
np.take_along_axis(unit_vector_p, direction_0_index, 1)).astype(numpy_float).ravel()
step_sizes_2 = (np.take_along_axis(unit_vector_p, direction_2_index, 1) /
np.take_along_axis(unit_vector_p, direction_0_index, 1)).astype(numpy_float).ravel()
# Shape in each of the three directions.
dimensions_0 = np.array(field.shape, dtype=numpy_float)[direction_0_index.ravel()]
dimensions_1 = np.array(field.shape, dtype=numpy_float)[direction_1_index.ravel()]
dimensions_2 = np.array(field.shape, dtype=numpy_float)[direction_2_index.ravel()]
# Correction factor for length of line when taking a one-slice step.
distance_multipliers = np.sqrt(1.0 + step_sizes_1 ** 2 + step_sizes_2 ** 2).astype(numpy_float)
    # It is not strictly necessary to break these out; it just makes the code easier to follow.
max_x, max_y, max_z, channels = field.shape
number_of_projections, max_j, max_k = projections.shape[:3]
# These constants control the number of threads/lanes in each kernel in order to
# optimize memory access patterns. They are partly set from principles
# such as "sequential threads should access sequential data", and partly set
# through trial and error and profiling results. Ultimately the goal is
    # to minimize repetitive computations while also optimizing cache access.
# Note that this differs from optimal access patterns for CPU threads,
# because CUDA threads are analogous to SIMD lanes, not CPU threads.
# For cases up to 8 channels, threads_j and threads_kc are used.
# For cases with larger number of channels, accessing sequential channels
# dominates performance, and so j and k are lumped together while the
# channels are separated out.
# TODO: Make this configurable by the user to allow for autotuning of
# these parameters.
# Few channels
threads_j = int(32 // channels)
threads_kc = int(8 * channels)
# Many channels
CH_CHUNKSIZE = int(min(max(2 ** floor(np.log2(channels)), 1), 8))
threads_channel = int(min(max(2 ** floor(np.log2(channels / CH_CHUNKSIZE)), 1), 32))
threads_jk = int(32 // threads_channel)
JK_CHUNKSIZE = int(1)
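    # Worked example (illustration only): with channels = 4, the few-channel
    # path gives threads_j = 8 and threads_kc = 32, i.e. 256 threads per block;
    # with channels = 150, the many-channel path gives CH_CHUNKSIZE = 8,
    # threads_channel = 16 and threads_jk = 2.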
# Check the maximum amount of local memory needed by each thread and resize accordingly
channel_blocks = (channels + threads_channel * CH_CHUNKSIZE - 1) // (CH_CHUNKSIZE * threads_channel)
all_threads = np.arange(threads_channel).reshape(-1, 1)
all_blocks = np.arange(channel_blocks).reshape(1, -1)
min_channels = all_threads + all_blocks * threads_channel
channel_strides = channel_blocks * threads_channel
max_steps = (channels - min_channels + channel_strides - 1) // channel_strides
# Some numba versions can be a bit dumb with respect to integer types,
# but python int should always work for e.g. dimension arguments
ch_max_memory = int(min(CH_CHUNKSIZE, max(max_steps.ravel())))
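    # Continuing the channels = 150 example used below: channel_blocks = 2,
    # channel_strides = 32 and max(max_steps) = 5, so
    # ch_max_memory = min(8, 5) = 5.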
# Indices to navigate each projection. s is the surface positioning.
k_vectors = unit_vector_k.astype(numpy_float)
j_vectors = unit_vector_j.astype(numpy_float)
s_vectors = (unit_vector_k * (-0.5 * max_k + offsets_k.reshape(-1, 1)) +
unit_vector_j * (-0.5 * max_j + offsets_j.reshape(-1, 1))).astype(numpy_float)
# (0, 1, 2) if x main, (1, 0, 2) if y main, (2, 0, 1) if z main.
direction_indices = np.stack((direction_0_index.ravel(),
direction_1_index.ravel(),
direction_2_index.ravel()), axis=1)
# This record array is functionally a dictionary of lists of parameters for each projection.
# Note that the list of dictionaries approach did not seem to be supported in numba-CUDA.
# It is possible to use constants defined outside the kernel instead of such a
# record array, which can be more efficient, but the amount of
# global constant storage on the chip is limited, and this leads to hard-to-read implementations.
# Therefore global constants are only used for simple constants like the maximum dimensions of each
# array, not for per-projection parameters.
extra_parameters = np.recarray(
1,
dtype=[
('step_size', 'f4', (len(projections), 3,)),
('k_vector', 'f4', (len(projections), 3,)),
('j_vector', 'f4', (len(projections), 3,)),
('s_vector', 'f4', (len(projections), 3,)),
('dimensions', 'int32', (len(projections), 3,)),
('direction_indices', 'int32', (len(projections), 3,)),
('distance_multiplier', 'f4', (len(projections),)),
])
extra_parameters[0].step_size[:, 0] = 1.
extra_parameters[0].step_size[:, 1] = step_sizes_1
extra_parameters[0].step_size[:, 2] = step_sizes_2
extra_parameters[0].k_vector[:] = k_vectors
extra_parameters[0].j_vector[:] = j_vectors
extra_parameters[0].s_vector[:] = s_vectors
extra_parameters[0].dimensions[:, 0] = dimensions_0
extra_parameters[0].dimensions[:, 1] = dimensions_1
extra_parameters[0].dimensions[:, 2] = dimensions_2
extra_parameters[0].direction_indices[:] = direction_indices
extra_parameters[0].distance_multiplier[:] = distance_multipliers
# Fetch dtype of parameters to enable eager compilation.
record_type = from_dtype(extra_parameters.dtype)
extra_parameters = cuda.to_device(extra_parameters[0])
@cuda.jit(device=True, lineinfo=True, fastmath=True, cache=True)
def bilinear_interpolation(field: np.ndarray[float],
directions,
positions,
dimensions,
coordinate_directions,
accumulator: np.ndarray[float],
min_channel,
channel_count,
channel_stride):
""" Kernel for bilinear interpolation. Replaces texture interpolation."""
# Precompute booleans to ensure control over dtypes, sometimes CUDA makes
# strange casting decisions unless one is very explicit.
p1_max_index = dimensions[1] - 1
p2_max_index = dimensions[2] - 1
p1_ge_0 = positions[1] >= cuda_float(0)
p2_ge_0 = positions[2] >= cuda_float(0)
p1_lt_max_index = positions[1] < cuda_float(p1_max_index)
p2_lt_max_index = positions[2] < cuda_float(p2_max_index)
weight_1 = positions[1] - floor(positions[1])
weight_2 = positions[2] - floor(positions[2])
weight_1_comp = cuda_float(1) - weight_1
weight_2_comp = cuda_float(1) - weight_2
# Bilinear interpolation weights. Uses
# texture-style edge handling where everything out of bounds is 0.
t = cuda_x4_type(weight_1_comp * weight_2_comp * cuda_float(p1_ge_0 * p2_ge_0),
weight_2_comp * weight_1 * cuda_float(p1_lt_max_index * p2_ge_0),
weight_1_comp * weight_2 * cuda_float(p2_lt_max_index * p1_ge_0),
weight_1 * weight_2 * cuda_float(p1_lt_max_index * p2_lt_max_index))
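        # Sanity check (illustration only): for an in-bounds point with
        # positions[1] = 2.25 and positions[2] = 3.5, the weights are
        # weight_1 = 0.25 and weight_2 = 0.5, giving
        # t = (0.375, 0.125, 0.375, 0.125), which sums to 1.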
        # x0 = x1 if and only if x is the main direction of iteration.
x0 = max(int32(floor(positions[coordinate_directions.x])), 0)
x1 = x0 + (directions.x != 0) * (x0 < (max_x - 1))
# y3 is different from y0 when y is not the main direction,
# but y1 and y2 depend on whether the main direction is
# x or z.
y0 = max(int32(floor(positions[coordinate_directions.y])), 0)
y0_lt_max_index = y0 < (max_y - 1)
y1 = y0 + (directions.x == 0) * y0_lt_max_index
y2 = y0 + (directions.x == 2) * y0_lt_max_index
y3 = y0 + (directions.x != 1) * y0_lt_max_index
# z0 and z1 work like x0. This is because in the direction tuples
# (x, y, z), (y, x, z), (z, x, y), y can have any position,
# but x and z can only be in the positions (0, 1) and (0, 2), respectively.
z0 = max(int32(floor(positions[coordinate_directions.z])), 0)
z1 = z0 + (directions.x != 2) * (z0 < (max_z - 1))
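        # For example (illustration only): if the main direction is y
        # (directions.x == 1), then y0 == y1 == y2 == y3, while (x0, x1)
        # and (z0, z1) differ, so interpolation runs across x and z only.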
# Bounds checking not required, already taken care of.
# Explicit bounds checking is marginally more efficient for moderate
# channel numbers, less efficient for large channel numbers.
for i in range(channel_count):
ci = min_channel + i * channel_stride
accumulator[i] += field[x0, y0, z0, ci] * t.x
accumulator[i] += field[x1, y1, z0, ci] * t.y
accumulator[i] += field[x0, y2, z1, ci] * t.z
accumulator[i] += field[x1, y3, z1, ci] * t.w
@cuda.jit(void(cuda_float[:, :, :, ::1], cuda_float[:, :, :, ::1], record_type, int32[::1]),
lineinfo=True, fastmath=True, cache=True)
def john_transform_inner(field: np.ndarray[float],
projection: np.ndarray[float],
parameters: np.ndarray,
frames):
""" Performs the John transform of a field. Relies on some
of pre-defined constants outside the kernel body. """
# The strided access to channels is complicated.
# Each thread reads at most ch_max_memory channels, separated by
# n_threads * n_blocks.
# For example, for 150 channels, calculation rules would give
# chunksize 8, 16 threads, and thus 2 blocks.
# Thread 0 in block 0 then reads channels 0, 32, 64, 96, and 128. (channel_count 5)
# Thread 1 in block 0 reads 1, 33, 65, 97, and 129. (channel_count 5)
# Thread 0 in block 1 reads 16, 48, 80, 112, 144. (channel_count 5)
        # Thread 6 in block 1 reads 22, 54, 86, and 118. (channel_count 4)
        # In the end we can shave 3 off CH_CHUNKSIZE when allocating local memory:
        # 8 - 3 = 5 is the smallest chunk size that would yield the same number
        # of blocks, and therefore the most channels any one thread reads;
        # this is ch_max_memory.
min_channel = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
channel_stride = cuda.gridDim.x * cuda.blockDim.x
max_steps = (channels - min_channel + channel_stride - 1) // channel_stride
channel_count = min(ch_max_memory, max_steps)
if min_channel >= channels:
return
# We split the frames into groups with the same main direction, so we need an index array.
frame_index = cuda.threadIdx.z + cuda.blockDim.z * cuda.blockIdx.z
frame = frames[frame_index]
        # These are all constants, but they are passed in as arguments.
step_size_1 = parameters['step_size'][frame, 1]
step_size_2 = parameters['step_size'][frame, 2]
dimensions = parameters['dimensions'][frame]
direction_indices = cuda.uint32x3(
parameters['direction_indices'][frame][0],
parameters['direction_indices'][frame][1],
parameters['direction_indices'][frame][2])
distance_multiplier = parameters['distance_multiplier'][frame]
coordinate_directions = cuda.uint32x3(
direction_indices.x != 0,
(direction_indices.x == 0) + int32(2) * (direction_indices.x == 2),
int32(2) * (direction_indices.x != 2))
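        # Note: coordinate_directions encodes, for each Cartesian axis
        # (x, y, z), its position within ``positions``; e.g. for main
        # direction z the ordering is (z, x, y), so it evaluates to (1, 2, 0).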
positions = cuda.local.array(3, cuda_float)
# It could be advantageous to chunk in the projection direction but
# this complicates writing to the output by adding race conditions.
start_slice = int32(0)
end_slice = start_slice + dimensions[0]
accumulator = cuda.local.array(ch_max_memory, cuda_float)
        # The channel dimension can be small and k may be near-contiguous, so aim for coalesced reading.
jk = cuda.threadIdx.y + cuda.blockDim.y * cuda.blockIdx.y
jk_stride = cuda.blockDim.y * cuda.gridDim.y
total_elements = max_j * max_k
# JK_CHUNKSIZE = 1 so this is not actually iterated over currently.
for group_id in range(JK_CHUNKSIZE):
element_id = jk + group_id * jk_stride
if element_id >= total_elements:
break
k = element_id % max_k
j = element_id // max_k
# No need to check k since a % max_k < max_k
if j >= max_j:
break
for i in range(ch_max_memory):
accumulator[i] = cuda_float(0.)
            # A lot of explicit casting is necessary to avoid promotion to float64 and similar.
fj = cuda_float(j) + cuda_float(0.5)
fk = cuda_float(k) + cuda_float(0.5)
# Initial coordinates using alignment etc.
start_position_0 = cuda_float(parameters['s_vector'][frame, direction_indices.x] +
fj * parameters['j_vector'][frame, direction_indices.x] +
fk * parameters['k_vector'][frame, direction_indices.x])
start_position_1 = cuda_float(parameters['s_vector'][frame, direction_indices.y] +
fj * parameters['j_vector'][frame, direction_indices.y] +
fk * parameters['k_vector'][frame, direction_indices.y])
start_position_2 = cuda_float(parameters['s_vector'][frame, direction_indices.z] +
fj * parameters['j_vector'][frame, direction_indices.z] +
fk * parameters['k_vector'][frame, direction_indices.z])
# Centering of coordinate system w.r.t volume and direction.
# Again explicit casting is needed.
positions[0] = cuda_float(start_slice) + cuda_float(0.5)
positions[1] = cuda_float(step_size_1 * (cuda_float(start_slice) -
cuda_float(0.5) * cuda_float(dimensions[0]) -
start_position_0 + cuda_float(0.5)) +
start_position_1 + cuda_float(0.5) * cuda_float(dimensions[1]) -
cuda_float(0.5))
positions[2] = cuda_float(step_size_2 * (cuda_float(start_slice) -
cuda_float(0.5) * cuda_float(dimensions[0]) -
start_position_0 + cuda_float(0.5)) +
start_position_2 + cuda_float(0.5) * cuda_float(dimensions[2]) -
cuda_float(0.5))
for i in range(start_slice, end_slice):
if ((cuda_float(-1) < positions[1] < cuda_float(dimensions[1])) and
(cuda_float(-1) < positions[2] < cuda_float(dimensions[2]))):
bilinear_interpolation(field, direction_indices,
positions, dimensions,
coordinate_directions,
accumulator, min_channel,
channel_count,
channel_stride)
positions[0] += cuda_float(1.0)
positions[1] += step_size_1
positions[2] += step_size_2
for i in range(channel_count):
ci = min_channel + i * channel_stride
projection[frame, j, k, ci] = accumulator[i] * distance_multiplier
@cuda.jit(device=True, lineinfo=True, fastmath=True, cache=True)
def bilinear_interpolation_few_channels(field: np.ndarray[float],
directions,
positions,
dimensions,
coordinate_directions,
channel_index):
""" Kernel for bilinear interpolation. Replaces texture interpolation."""
# See many-channels version for explanation.
p1_max_index = dimensions[1] - 1
p2_max_index = dimensions[2] - 1
p1_ge_0 = positions[1] >= cuda_float(0)
p2_ge_0 = positions[2] >= cuda_float(0)
p1_lt_max_index = positions[1] < cuda_float(p1_max_index)
p2_lt_max_index = positions[2] < cuda_float(p2_max_index)
weight_1 = positions[1] - floor(positions[1])
weight_2 = positions[2] - floor(positions[2])
weight_1_comp = cuda_float(1) - weight_1
weight_2_comp = cuda_float(1) - weight_2
t = cuda_x4_type(weight_1_comp * weight_2_comp * cuda_float(p1_ge_0 * p2_ge_0),
weight_2_comp * weight_1 * cuda_float(p1_lt_max_index * p2_ge_0),
weight_1_comp * weight_2 * cuda_float(p2_lt_max_index * p1_ge_0),
weight_1 * weight_2 * cuda_float(p1_lt_max_index * p2_lt_max_index))
x0 = max(int32(floor(positions[coordinate_directions.x])), 0)
x1 = x0 + (directions.x != 0) * (x0 < (max_x - 1))
y0 = max(int32(floor(positions[coordinate_directions.y])), 0)
y0_lt_max_index = y0 < (max_y - 1)
y1 = y0 + (directions.x == 0) * y0_lt_max_index
y2 = y0 + (directions.x == 2) * y0_lt_max_index
y3 = y0 + (directions.x != 1) * y0_lt_max_index
z0 = max(int32(floor(positions[coordinate_directions.z])), 0)
z1 = z0 + (directions.x != 2) * (z0 < (max_z - 1))
# Bounds checking not required, already taken care of
return (field[x0, y0, z0, channel_index] * t.x + field[x1, y1, z0, channel_index] * t.y +
field[x0, y2, z1, channel_index] * t.z + field[x1, y3, z1, channel_index] * t.w)
@cuda.jit(void(cuda_float[:, :, :, ::1], cuda_float[:, :, :, ::1], record_type, int32[::1]),
lineinfo=True, fastmath=True, cache=True)
def john_transform_inner_few_channels(field: np.ndarray[float],
projection: np.ndarray[float],
parameters: np.ndarray,
frames):
""" Performs the John transform of a field. Relies on a large number
of pre-defined constants outside the kernel body. """
# First block similar to multi-channel version.
frame_index = cuda.threadIdx.z + cuda.blockDim.z * cuda.blockIdx.z
frame = frames[frame_index]
step_size_1 = parameters['step_size'][frame, 1]
step_size_2 = parameters['step_size'][frame, 2]
dimensions = parameters['dimensions'][frame]
direction_indices = cuda.uint32x3(
parameters['direction_indices'][frame][0],
parameters['direction_indices'][frame][1],
parameters['direction_indices'][frame][2])
distance_multiplier = parameters['distance_multiplier'][frame]
coordinate_directions = cuda.uint32x3(
direction_indices.x != 0,
(direction_indices.x == 0) + 2 * (direction_indices.x == 2),
2 * (direction_indices.x != 2))
positions = cuda.local.array(3, cuda_float)
start_slice = 0
end_slice = start_slice + dimensions[0]
# Use a common index for channels and k to pack thread reads/writes densely.
j = cuda.threadIdx.y + cuda.blockDim.y * cuda.blockIdx.y
kc = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
channel_index = kc % channels
k = kc // channels
if (j >= max_j) or (k >= max_k):
return
# See many-channels version
fj = cuda_float(j) + cuda_float(0.5)
fk = cuda_float(k) + cuda_float(0.5)
# Initial coordinates of projection.
start_position_0 = cuda_float(parameters['s_vector'][frame, direction_indices.x] +
fj * parameters['j_vector'][frame, direction_indices.x] +
fk * parameters['k_vector'][frame, direction_indices.x])
start_position_1 = cuda_float(parameters['s_vector'][frame, direction_indices.y] +
fj * parameters['j_vector'][frame, direction_indices.y] +
fk * parameters['k_vector'][frame, direction_indices.y])
start_position_2 = cuda_float(parameters['s_vector'][frame, direction_indices.z] +
fj * parameters['j_vector'][frame, direction_indices.z] +
fk * parameters['k_vector'][frame, direction_indices.z])
# Centering w.r.t volume.
positions[0] = cuda_float(start_slice) + cuda_float(0.5)
positions[1] = cuda_float(step_size_1 * (cuda_float(start_slice) -
cuda_float(0.5) * cuda_float(dimensions[0]) -
start_position_0 + cuda_float(0.5)) +
start_position_1 + cuda_float(0.5) * cuda_float(dimensions[1]) -
cuda_float(0.5))
positions[2] = cuda_float(step_size_2 * (cuda_float(start_slice) -
cuda_float(0.5) * cuda_float(dimensions[0]) -
start_position_0 + cuda_float(0.5)) +
start_position_2 + cuda_float(0.5) * cuda_float(dimensions[2]) -
cuda_float(0.5))
accumulator = cuda_float(0)
for i in range(start_slice, end_slice):
if ((cuda_float(-1) < positions[1] < cuda_float(dimensions[1])) and
(cuda_float(-1) < positions[2] < cuda_float(dimensions[2]))):
accumulator += bilinear_interpolation_few_channels(
field, direction_indices,
positions, dimensions,
coordinate_directions,
channel_index)
positions[0] += cuda_float(1.0)
positions[1] += step_size_1
positions[2] += step_size_2
projection[frame, j, k, channel_index] = accumulator * distance_multiplier
    # Divide the projections into groups according to their main direction.
frames = list(range(number_of_projections))
frame_sets = [
np.array(
list(filter(lambda x: direction_0_index[x] == i, frames)), dtype=np.int32)
for i in range(3)]
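    # For example (illustration only): direction_0_index = [[0], [2], [0]]
    # yields frame_sets = [array([0, 2]), array([]), array([1])], so the
    # x-main kernel handles frames 0 and 2 and the z-main kernel handles
    # frame 1.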
john_transform_list = []
for ii in range(3):
if len(frame_sets[ii]) == 0:
john_transform_list.append(None)
continue
# Calculate number of blocks based on number of frames
if channels <= 8:
tpb = (threads_kc, threads_j, 1)
bpg = ((max_k * channels + threads_kc - 1) // (threads_kc),
(max_j + threads_j - 1) // threads_j,
len(frame_sets[ii]))
john_transform_list.append(john_transform_inner_few_channels[bpg, tpb])
else:
tpb = (threads_channel, threads_jk, 1)
bpg = (((channels + (CH_CHUNKSIZE * threads_channel) - 1) //
(threads_channel * CH_CHUNKSIZE)),
(max_j * max_k + threads_jk * JK_CHUNKSIZE - 1) // (JK_CHUNKSIZE * threads_jk),
len(frame_sets[ii]))
john_transform_list.append(john_transform_inner[bpg, tpb])
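    # Illustration (not computed here): with channels = 4 and max_j = max_k = 64,
    # the few-channel path above gives tpb = (32, 8, 1) and
    # bpg = (8, 8, len(frame_sets[ii])) for each non-empty frame set.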
def transform_with_transfer(field: np.ndarray[float],
projections: np.ndarray[float]) -> np.ndarray[float]:
if _is_cuda_array(field):
assert field.dtype == 'float32'
device_field = cuda.as_cuda_array(field)
transfer_projections = False
else:
assert field.dtype == np.float32
device_field = cuda.to_device(field)
transfer_projections = True
if _is_cuda_array(projections):
assert projections.dtype == 'float32'
device_projections = cuda.as_cuda_array(projections)
else:
assert projections.dtype == np.float32
device_projections = cuda.to_device(projections)
for ii in range(3):
if john_transform_list[ii] is not None:
john_transform_list[ii](device_field,
device_projections,
extra_parameters,
frame_sets[ii])
if transfer_projections:
return device_projections.copy_to_host()
return device_projections
return transform_with_transfer
def john_transform_adjoint_cuda(field: np.ndarray[float], projections: np.ndarray[float],
unit_vector_p: np.ndarray[float], unit_vector_j: np.ndarray[float],
unit_vector_k: np.ndarray[float], offsets_j: np.ndarray[float],
offsets_k: np.ndarray[float]):
""" Frontend for performing the adjoint of the John transform with parallel
GPU computing.
Parameters
----------
field
The field into which the adjoint is projected, with 4 dimensions. The last index should
have the same size as the last index of ``projections``. Can be either
a `numpy.ndarray`, or a device array that implements the CUDA array
interface. If a device array is given, no copying to device is needed.
projections
The projections from which the adjoint is calculated.
The first index runs over the different projection directions. Can be either
a `numpy.ndarray`, or a device array that implements the CUDA array
interface. If a device array is given, no copying or synchronization is needed.
unit_vector_p
The direction of projection in Cartesian coordinates.
unit_vector_j
One of the directions for the pixels of ``projection``.
unit_vector_k
The other direction for the pixels of ``projection``.
offsets_j
Offsets which align projections in the direction of `j`
offsets_k
Offsets which align projections in the direction of `k`.
Notes
-----
For mathematical details, see :func:`~.john_transform.john_transform_adjoint`.
"""
# Always false as we are sticking to float32
if False:
cuda_float = float64
numpy_float = np.float64
cuda_x4_type = cuda.float64x4
else:
cuda_float = float32
numpy_float = np.float32
cuda_x4_type = cuda.float32x4
max_x, max_y, max_z, channels = field.shape
number_of_projections, max_j, max_k = projections.shape[:3]
# CUDA chunking and memory size constants. See non-adjoint for general explanation.
# Here the single-channel version has a storage vector and strides in Z, whereas
# the multi-channel version strides in channels.
if channels == 1:
threads_channel = int(1)
threads_z = int(16)
threads_xy = int(4)
X_CHUNKSIZE = int(1)
Y_CHUNKSIZE = int(1)
Z_CHUNKSIZE = int(4)
CH_CHUNKSIZE = int(1)
ch_max_memory = int(1)
else:
CH_CHUNKSIZE = min(max(2 ** floor(np.log2(channels // 2)), 1), 8)
threads_channel = int(min(max(2 ** floor(np.log2(channels / CH_CHUNKSIZE)), 1), 32))
threads_z = int(max(32 // threads_channel, 4))
threads_xy = int(1)
X_CHUNKSIZE = int(1)
Y_CHUNKSIZE = int(1)
Z_CHUNKSIZE = int(2)
channel_blocks = (channels + threads_channel * CH_CHUNKSIZE - 1) // (CH_CHUNKSIZE * threads_channel)
all_threads = np.arange(threads_channel).reshape(-1, 1)
all_blocks = np.arange(channel_blocks).reshape(1, -1)
min_channels = all_threads + all_blocks * threads_channel
channel_strides = channel_blocks * threads_channel
max_steps = (channels - min_channels + channel_strides - 1) // channel_strides
# Some numba versions can be a bit dumb with respect to integer types,
# but python int should always work for e.g. dimension arguments
ch_max_memory = int(min(CH_CHUNKSIZE, max(max_steps.ravel())))
# Projection vectors. s for positioning the projection.
p_vectors = unit_vector_p.astype(numpy_float)
k_vectors = unit_vector_k.astype(numpy_float)
j_vectors = unit_vector_j.astype(numpy_float)
s_vectors = (unit_vector_k * (-0.5 * max_k + offsets_k.reshape(-1, 1)) +
unit_vector_j * (-0.5 * max_j + offsets_j.reshape(-1, 1))).astype(numpy_float)
# Translate volume steps to normalized projection steps. Can add support for non-square voxels.
vector_norm = np.einsum('...i, ...i', p_vectors, np.cross(j_vectors, k_vectors))
norm_j = -np.cross(p_vectors, k_vectors) / vector_norm[..., None]
norm_k = np.cross(p_vectors, j_vectors) / vector_norm[..., None]
norm_offset_j = -np.einsum('...i, ...i', p_vectors, np.cross(s_vectors, k_vectors)) / vector_norm
norm_offset_k = np.einsum('...i, ...i', p_vectors, np.cross(s_vectors, j_vectors)) / vector_norm
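    # Sanity check (illustration only): for p = (1, 0, 0), j = (0, 1, 0) and
    # k = (0, 0, 1) the triple product p . (j x k) is 1, and norm_j and norm_k
    # reduce to (0, 1, 0) and (0, 0, 1), i.e. projection steps map directly
    # onto volume steps.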
    # See the forward version for an explanation. The adjoint faces bigger issues
    # with global constants, since each kernel can potentially end up copying all
    # of the frame-wise values.
extra_parameters = np.recarray(
1,
dtype=[
('norm_j', 'f4', (len(projections), 3,)),
('norm_k', 'f4', (len(projections), 3,)),
('norm_offset_j', 'f4', (len(projections),)),
('norm_offset_k', 'f4', (len(projections),)),])
extra_parameters[0].norm_j = norm_j
extra_parameters[0].norm_k = norm_k
extra_parameters[0].norm_offset_j = norm_offset_j
extra_parameters[0].norm_offset_k = norm_offset_k
record_type = from_dtype(extra_parameters.dtype)
extra_parameters = cuda.to_device(extra_parameters[0])
@cuda.jit(device=True,
lineinfo=True, fastmath=True, cache=True)
def bilinear_interpolation_projection(projection: np.ndarray[float],
index, fj, fk,
accumulator: np.ndarray[float],
min_channel, channel_stride, channel_count):
""" Bilinearly interpolates between 4 neighbouring pixels on each projection.
"""
        # Functions similarly to the forward bilinear interpolation, but is
        # simpler, since we do not need to consider directions of integration;
        # the adjoint always sums over all projections.
j_max_ind = max_j - 1
k_max_ind = max_k - 1
fj_ge_0 = fj >= cuda_float(0)
fk_ge_0 = fk >= cuda_float(0)
fj_lt_max_ind = fj < cuda_float(j_max_ind)
fk_lt_max_ind = fk < cuda_float(k_max_ind)
y_weight = fj - floor(fj)
z_weight = fk - floor(fk)
y0 = max(int32(floor(fj)), 0)
y1 = y0 + (y0 < j_max_ind)
z0 = max(int32(floor(fk)), 0)
z1 = z0 + (z0 < k_max_ind)
y_weight_complement = cuda_float(1) - y_weight
z_weight_complement = cuda_float(1) - z_weight
t = cuda_x4_type(
y_weight_complement * z_weight_complement * cuda_float(fj_ge_0 * fk_ge_0),
z_weight_complement * y_weight * cuda_float(fj_lt_max_ind * fk_ge_0),
y_weight_complement * z_weight * cuda_float(fk_lt_max_ind * fj_ge_0),
z_weight * y_weight * cuda_float(fj_lt_max_ind * fk_lt_max_ind))
for i in range(channel_count):
ci = min_channel + i * channel_stride
accumulator[i] += (projection[index, y0, z0, ci] * t.x +
projection[index, y1, z0, ci] * t.y +
projection[index, y0, z1, ci] * t.z +
projection[index, y1, z1, ci] * t.w)
@cuda.jit(void(cuda_float[:, :, :, ::1], cuda_float[:, :, :, ::1], record_type),
lineinfo=True, fastmath=True, cache=True)
def john_transform_adjoint_inner(field: np.ndarray[float],
projection: np.ndarray[float],
parameters: np.ndarray):
""" Performs the John transform of a field. Relies some pre-defined
constants outside the kernel body. """
# See forward version for explanation.
min_channel = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
channel_stride = cuda.gridDim.x * cuda.blockDim.x
max_steps = (channels - min_channel + channel_stride - 1) // channel_stride
channel_count = min(ch_max_memory, max_steps)
if min_channel >= channels:
return
# x and y are not contiguous, no need to do fancy indexing
xy = (cuda.blockIdx.z * cuda.blockDim.z + cuda.threadIdx.z)
y_num_chunks = (max_y + Y_CHUNKSIZE - 1) // Y_CHUNKSIZE
start_y = (xy % y_num_chunks) * Y_CHUNKSIZE
end_y = min(start_y + Y_CHUNKSIZE, max_y)
if start_y >= max_y:
return
start_x = (xy // y_num_chunks) * X_CHUNKSIZE
end_x = min(start_x + X_CHUNKSIZE, max_x)
if start_x >= max_x:
return
# There are tradeoffs between read-write access and amount of local memory needed
# in structuring this array. Since moving between projections requires a
# lot of computation it seems that accumulating values for several Z is helpful.
# Could try out shared memory and 1 block per frame, but probably not better.
z_acc = cuda.local.array((Z_CHUNKSIZE, ch_max_memory), cuda_float)
z = cuda.threadIdx.y + cuda.blockDim.y * cuda.blockIdx.y
z_stride = cuda.gridDim.y * cuda.blockDim.y
if z >= max_z:
return
start_index = 0
stop_index = number_of_projections
# Coordinates in projection space relative to xyz-space.
norm_j = parameters['norm_j'][start_index:stop_index]
norm_k = parameters['norm_k'][start_index:stop_index]
norm_offset_j = parameters['norm_offset_j'][start_index:stop_index]
norm_offset_k = parameters['norm_offset_k'][start_index:stop_index]
for x in range(start_x, end_x):
for y in range(start_y, end_y):
# Center volume.
fx = cuda_float(x) - cuda_float(0.5) * cuda_float(max_x) + cuda_float(0.5)
fy = cuda_float(y) - cuda_float(0.5) * cuda_float(max_y) + cuda_float(0.5)
fz = cuda_float(z) - cuda_float(0.5) * cuda_float(max_z) + cuda_float(0.5)
for j in range(Z_CHUNKSIZE):
for k in range(channel_count):
z_acc[j, k] = cuda_float(0.0)
for a in range(start_index, stop_index):
# Center with respect to each projection direction and alignment.
fj = (norm_offset_j[a] + fx * norm_j[a][0] +
fy * norm_j[a][1] + fz * norm_j[a][2] - cuda_float(0.5))
fk = (norm_offset_k[a] + fx * norm_k[a][0] +
fy * norm_k[a][1] + fz * norm_k[a][2] - cuda_float(0.5))
for j in range(Z_CHUNKSIZE):
# Striding in z is cheaper than striding across projections
if (cuda_float(-1) < fj < cuda_float(max_j) and
cuda_float(-1) < fk < cuda_float(max_k)):
bilinear_interpolation_projection(projection,
a, fj, fk, z_acc[j],
min_channel, channel_stride, channel_count)
fj += norm_j[a][2] * cuda_float(z_stride)
fk += norm_k[a][2] * cuda_float(z_stride)
for j in range(Z_CHUNKSIZE):
# Some threads might be at the very edge.
if z + j * z_stride >= max_z:
break
for k in range(channel_count):
field[x, y, z + j * z_stride, min_channel + k * channel_stride] = z_acc[j, k]
@cuda.jit(device=True,
lineinfo=True, fastmath=True, cache=True)
def bilinear_interpolation_projection_one_channel(projection: np.ndarray[float],
index, fj, fk,
accumulator: np.ndarray[float],
i: int):
# See many-channel and forward versions.
j_max_ind = max_j - 1
k_max_ind = max_k - 1
fj_ge_0 = fj >= cuda_float(0)
fk_ge_0 = fk >= cuda_float(0)
fj_lt_max_ind = fj < cuda_float(j_max_ind)
fk_lt_max_ind = fk < cuda_float(k_max_ind)
y_weight = fj - floor(fj)
z_weight = fk - floor(fk)
y0 = max(int32(floor(fj)), 0)
y1 = y0 + (y0 < j_max_ind)
z0 = max(int32(floor(fk)), 0)
z1 = z0 + (z0 < k_max_ind)
y_weight_complement = cuda_float(1) - y_weight
z_weight_complement = cuda_float(1) - z_weight
t = cuda_x4_type(
y_weight_complement * z_weight_complement * cuda_float(fj_ge_0 * fk_ge_0),
z_weight_complement * y_weight * cuda_float(fj_lt_max_ind * fk_ge_0),
y_weight_complement * z_weight * cuda_float(fk_lt_max_ind * fj_ge_0),
z_weight * y_weight * cuda_float(fj_lt_max_ind * fk_lt_max_ind))
accumulator[i] += (projection[index, y0, z0, 0] * t.x +
projection[index, y1, z0, 0] * t.y +
projection[index, y0, z1, 0] * t.z +
projection[index, y1, z1, 0] * t.w)
@cuda.jit(void(cuda_float[:, :, :, ::1], cuda_float[:, :, :, ::1], record_type),
lineinfo=True, fastmath=True, cache=True)
def john_transform_adjoint_one_channel(field: np.ndarray[float],
projection: np.ndarray[float],
parameters: np.ndarray):
""" Performs the John transform of a single-channel field."""
# This entire function is very similar to the multi-channel one, except that
# striding happens only in Z.
z = cuda.threadIdx.y + cuda.blockDim.y * cuda.blockIdx.y
z_stride = cuda.gridDim.y * cuda.blockDim.y
xy = (cuda.blockIdx.z * cuda.blockDim.z + cuda.threadIdx.z)
y_num_chunks = (max_y + Y_CHUNKSIZE - 1) // Y_CHUNKSIZE
start_y = (xy % y_num_chunks) * Y_CHUNKSIZE
end_y = min(start_y + Y_CHUNKSIZE, max_y)
if start_y >= max_y:
return
start_x = (xy // y_num_chunks) * X_CHUNKSIZE
end_x = min(start_x + X_CHUNKSIZE, max_x)
if start_x >= max_x:
return
# Stride in z.
z_acc = cuda.local.array(Z_CHUNKSIZE, cuda_float)
start_index = 0
stop_index = number_of_projections
norm_j = parameters['norm_j'][start_index:stop_index]
norm_k = parameters['norm_k'][start_index:stop_index]
norm_offset_j = parameters['norm_offset_j'][start_index:stop_index]
norm_offset_k = parameters['norm_offset_k'][start_index:stop_index]
for x in range(start_x, end_x):
for y in range(start_y, end_y):
# Center of voxel and coordinate system.
fx = cuda_float(x) - cuda_float(0.5) * cuda_float(max_x) + cuda_float(0.5)
fy = cuda_float(y) - cuda_float(0.5) * cuda_float(max_y) + cuda_float(0.5)
fz = cuda_float(z) - cuda_float(0.5) * cuda_float(max_z) + cuda_float(0.5)
for j in range(Z_CHUNKSIZE):
z_acc[j] = 0.0
for a in range(start_index, stop_index):
# Center with respect to projection.
fj = (norm_offset_j[a] + fx * norm_j[a][0] + fy *
norm_j[a][1] + fz * norm_j[a][2] - cuda_float(0.5))
fk = (norm_offset_k[a] + fx * norm_k[a][0] + fy *
norm_k[a][1] + fz * norm_k[a][2] - cuda_float(0.5))
for i in range(Z_CHUNKSIZE):
if (cuda_float(-1) < fj < cuda_float(max_j) and
cuda_float(-1) < fk < cuda_float(max_k)):
bilinear_interpolation_projection_one_channel(
projection,
a, fj, fk,
z_acc, i)
fj += norm_j[a][2] * cuda_float(z_stride)
fk += norm_k[a][2] * cuda_float(z_stride)
for i in range(Z_CHUNKSIZE):
if z + i * z_stride < max_z:
field[x, y, z + i * z_stride, 0] = z_acc[i]
num_chunks_x = (max_x + X_CHUNKSIZE - 1) // X_CHUNKSIZE
num_chunks_y = (max_y + Y_CHUNKSIZE - 1) // Y_CHUNKSIZE
xy_chunks = num_chunks_x * num_chunks_y
bpg = ((channels + threads_channel * CH_CHUNKSIZE - 1) // (CH_CHUNKSIZE * threads_channel),
(max_z + Z_CHUNKSIZE * threads_z - 1) // (threads_z * Z_CHUNKSIZE),
(xy_chunks + threads_xy - 1) // threads_xy,
)
tpb = (threads_channel, threads_z, threads_xy)
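    # Illustration (not computed here): for a (64, 64, 64, 1) field this gives
    # tpb = (1, 16, 4) and bpg = (1, 1, 1024), with each thread accumulating
    # Z_CHUNKSIZE = 4 z-slices in local memory.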
if channels > 1:
john_transform_adjoint_grid = john_transform_adjoint_inner[bpg, tpb]
else:
john_transform_adjoint_grid = john_transform_adjoint_one_channel[bpg, tpb]
def transform_with_transfer(field: np.ndarray[float],
projections: np.ndarray[float]) -> np.ndarray[float]:
if _is_cuda_array(field):
assert field.dtype == 'float32'
device_field = cuda.as_cuda_array(field)
else:
assert field.dtype == np.float32
device_field = cuda.to_device(field)
if _is_cuda_array(projections):
assert projections.dtype == 'float32'
device_projections = cuda.as_cuda_array(projections)
transfer_field = False
else:
assert projections.dtype == np.float32
device_projections = cuda.to_device(projections)
transfer_field = True
john_transform_adjoint_grid(device_field,
device_projections,
extra_parameters)
if transfer_field:
return device_field.copy_to_host()
return device_field
return transform_with_transfer