"""Source code for ``mumott.core.john_transform_sparse_cuda``."""
from typing import Any
from math import floor
from numba import config, cuda, int32, float32, float64, void
from numba.types import Array
import numpy as np
def _is_cuda_array(item: Any):
    """ Internal method which assists in debugging. """
    # The CUDA simulator has no notion of device arrays, so report False there.
    return False if config.ENABLE_CUDASIM else cuda.is_cuda_array(item)
def john_transform_sparse_cuda(field: np.ndarray[float], projections: np.ndarray[float], sparse_matrix: tuple,
                               unit_vector_p: np.ndarray[float], unit_vector_j: np.ndarray[float],
                               unit_vector_k: np.ndarray[float], offsets_j: np.ndarray[float],
                               offsets_k: np.ndarray[float]):
    """ Frontend for performing the John transform with parallel
    GPU computing, with a sparse matrix for projection-to-field mapping.

    Parameters
    ----------
    field
        The field to be projected, with 4 dimensions. The last index should
        have the same size as the number of tensor channels. Can be either
        a ``numpy.ndarray``, or a device array that implements the CUDA array
        interface. If a device array is given, no copying to device is needed.
    projections
        A 4-dimensional numpy array where the projections are stored. Should have the
        same shape as the input data matrix.
        The first index runs over the different projection directions. Can be either
        a ``numpy.ndarray``, or a device array that implements the CUDA array
        interface. If a device array is given, no copying or synchronization is needed.
    sparse_matrix
        Contains three ``np.ndarrays``, ``indices``, ``indptr``, and ``data``.
        ``indices[indptr[i, j]]`` to ``indices[indptr[i, j+1]]`` where ``i``
        is the projection channel index and ``j`` is the projection channel index
        contains the corresponding volume channel indices.
        ``data[indptr[i, j]]`` contains the weights for each index.
    unit_vector_p
        The direction of projection in Cartesian coordinates.
    unit_vector_j
        One of the directions for the pixels of :attr:`projections`.
    unit_vector_k
        The other direction for the pixels of :attr:`projections`.
    offsets_j
        Offsets which align projections in the direction of `j`
    offsets_k
        Offsets which align projections in the direction of `k`

    Returns
    -------
        A callable ``transform_with_transfer(field, projections)`` that launches
        the compiled kernel, copying host arrays to the device (and the result
        back to the host) only when needed.

    Notes
    -----
    For mathematical details, see :func:`~.john_transform.john_transform`.
    This transform also encodes the detector-to-tensor transform using a sparse matrix.
    """
    # Always false for now as we are sticking to ``float32``.
    if False:
        cuda_float = float64
        numpy_float = np.float64
        cuda_x4_type = cuda.float64x4
    else:
        cuda_float = float32
        numpy_float = np.float32
        cuda_x4_type = cuda.float32x4

    # Find zeroth, first and second directions for indexing. Zeroth is the maximal projection direction.
    # (x, y, z), (y, x, z), (z, x, y)
    direction_0_index = np.argmax(abs(unit_vector_p), axis=1).reshape(-1, 1).astype(np.int32)
    direction_1_index = 1 * (direction_0_index == 0).astype(np.int32).reshape(-1, 1).astype(np.int32)
    direction_2_index = (2 - (direction_0_index == 2)).astype(np.int32).reshape(-1, 1).astype(np.int32)

    # Step size for direction 1 and 2 per unit step along direction 0.
    step_sizes_1 = (np.take_along_axis(unit_vector_p, direction_1_index, 1) /
                    np.take_along_axis(unit_vector_p, direction_0_index, 1)).astype(numpy_float).ravel()
    step_sizes_2 = (np.take_along_axis(unit_vector_p, direction_2_index, 1) /
                    np.take_along_axis(unit_vector_p, direction_0_index, 1)).astype(numpy_float).ravel()

    # Shape in each of the three directions (per projection, after permutation).
    dimensions_0 = np.array(field.shape, dtype=numpy_float)[direction_0_index.ravel()]
    dimensions_1 = np.array(field.shape, dtype=numpy_float)[direction_1_index.ravel()]
    dimensions_2 = np.array(field.shape, dtype=numpy_float)[direction_2_index.ravel()]

    # Correction factor for length of line when taking a one-slice step.
    distance_multipliers = np.sqrt(1.0 + step_sizes_1 ** 2 + step_sizes_2 ** 2).astype(numpy_float)
    max_index = projections.shape[0]
    max_j = projections.shape[1]
    max_k = projections.shape[2]

    # CUDA chunking and memory size constants.
    projection_channels = int(projections.shape[-1])
    threads_j = int(8)
    threads_k = int(8)
    threads_angle = int(4)

    # Indices to navigate each projection. s is the surface positioning.
    k_vectors = unit_vector_k.astype(numpy_float)
    j_vectors = unit_vector_j.astype(numpy_float)
    s_vectors = (unit_vector_k * (-0.5 * max_k + offsets_k.reshape(-1, 1)) +
                 unit_vector_j * (-0.5 * max_j + offsets_j.reshape(-1, 1))).astype(numpy_float)

    # (0, 1, 2) if x main, (1, 0, 2) if y main, (2, 0, 1) if z main.
    direction_indices = np.stack((direction_0_index.ravel(),
                                  direction_1_index.ravel(),
                                  direction_2_index.ravel()), axis=1)

    # Bilinear interpolation over each slice. CSR-style components of the sparse
    # detector-to-tensor matrix, cast to the types the kernel signatures expect.
    sparse_pointers_all = sparse_matrix[0].astype(np.int32)
    sparse_indices_all = sparse_matrix[1].astype(np.int32)
    sparse_weights_all = sparse_matrix[2].astype(numpy_float)
    # Number of nonzero entries per detector channel (difference of consecutive pointers).
    sparse_ranges_all = np.diff(sparse_pointers_all, axis=-1).astype(np.int32)

    @cuda.jit(void(Array(float32, 4, 'C', readonly=True), int32, cuda_float,
                   cuda_float, cuda_float, int32[::1], cuda_float[::1],
                   Array(int32, 1, 'C', readonly=True),
                   Array(float32, 1, 'C', readonly=True),
                   Array(int32, 1, 'C', readonly=True),
                   Array(int32, 1, 'C', readonly=True)), device=True,
              lineinfo=True, fastmath=True, cache=True)
    def bilinear_interpolation(field: np.ndarray[float],
                               direction_0,
                               r0: float,
                               r1: float,
                               r2: float,
                               dimensions,
                               accumulator: np.ndarray[float],
                               sparse_pointers: np.ndarray[int],
                               sparse_weights: np.ndarray[float],
                               sparse_indices: np.ndarray[int],
                               sparse_ranges: np.ndarray[int]):
        """ Kernel for bilinear interpolation. Replaces texture interpolation."""
        # Outside the slice entirely: contributes nothing.
        if not (-1 <= r1 < dimensions[1] and -1 <= r2 < dimensions[2]):
            return
        # At edges, interpolate between value and 0.
        if (0 <= r1 < dimensions[1] - 1):
            r1_weight = int32(1)
        else:
            r1_weight = int32(0)
        if (0 <= r2 < dimensions[2] - 1):
            r2_weight = int32(1)
        else:
            r2_weight = int32(0)
        # Edge weights flag positions in the half-voxel band just below index 0.
        if int32(r1) == -1:
            r1_edge_weight = int32(1)
        else:
            r1_edge_weight = int32(0)
        if int32(r2) == -1:
            r2_edge_weight = int32(1)
        else:
            r2_edge_weight = int32(0)
        weight_1 = cuda_float((r1 - floor(r1)) * r1_weight) * (1 - r1_edge_weight)
        weight_2 = cuda_float((r2 - floor(r2)) * r2_weight) * (1 - r2_edge_weight)
        # Four bilinear corner weights packed into one float4:
        # t.x -> (low, low), t.w -> (high, low), t.z -> (low, high), t.y -> (high, high).
        t = cuda_x4_type((1 - weight_1) * (1 - weight_2),
                         (weight_1 * weight_2),
                         ((1 - weight_1) * weight_2),
                         ((1 - weight_2) * weight_1))
        # Branch should be abstracted away by compiler, but could be done with pointer arithmetic.
        if (direction_0 == 0):
            x = int32(floor(r0))
            y = int32(floor(r1) + r1_edge_weight)
            y2 = y + r1_weight
            z = int32(floor(r2) + r2_edge_weight)
            z2 = z + r2_weight
            # Gather the four corners for every volume channel coupled to each
            # detector channel by the sparse matrix.
            for i in range(accumulator.size):
                pointer = sparse_pointers[i]
                for j in range(sparse_ranges[i]):
                    index = sparse_indices[pointer + j]
                    weight = sparse_weights[pointer + j]
                    accumulator[i] += field[x, y, z, index] * t.x * weight
                    accumulator[i] += field[x, y2, z, index] * t.w * weight
                    accumulator[i] += field[x, y, z2, index] * t.z * weight
                    accumulator[i] += field[x, y2, z2, index] * t.y * weight
        elif (direction_0 == 1):
            x = int32(floor(r1) + r1_edge_weight)
            x2 = x + r1_weight
            y = int32(floor(r0))
            z = int32(floor(r2) + r2_edge_weight)
            z2 = z + r2_weight
            for i in range(accumulator.size):
                pointer = sparse_pointers[i]
                for j in range(sparse_ranges[i]):
                    index = sparse_indices[pointer + j]
                    weight = sparse_weights[pointer + j]
                    accumulator[i] += field[x, y, z, index] * t.x * weight
                    accumulator[i] += field[x2, y, z, index] * t.w * weight
                    accumulator[i] += field[x, y, z2, index] * t.z * weight
                    accumulator[i] += field[x2, y, z2, index] * t.y * weight
        elif (direction_0 == 2):
            x = int32(floor(r1) + r1_edge_weight)
            x2 = x + r1_weight
            y = int32(floor(r2) + r2_edge_weight)
            y2 = y + r2_weight
            z = int32(floor(r0))
            for i in range(accumulator.size):
                pointer = sparse_pointers[i]
                for j in range(sparse_ranges[i]):
                    index = sparse_indices[pointer + j]
                    weight = sparse_weights[pointer + j]
                    accumulator[i] += field[x, y, z, index] * t.x * weight
                    accumulator[i] += field[x2, y, z, index] * t.w * weight
                    accumulator[i] += field[x, y2, z, index] * t.z * weight
                    accumulator[i] += field[x2, y2, z, index] * t.y * weight

    @cuda.jit(void(Array(float32, 4, 'C', readonly=True), cuda_float[:, :, :, ::1],
                   Array(int32, 2, 'C', readonly=True),
                   Array(float32, 2, 'C', readonly=True),
                   Array(int32, 2, 'C', readonly=True),
                   Array(int32, 2, 'C', readonly=True)),
              lineinfo=True, fastmath=True, cache=True)
    def john_transform_inner(field: np.ndarray[float], projection: np.ndarray[float],
                             sparse_pointers, sparse_weights, sparse_indices, sparse_ranges):
        """ Performs the John transform of a field. Relies on a large number
        of pre-defined constants outside the kernel body. """
        # Projection (angle) index comes from the y grid dimension; the x grid
        # dimension is shared between j-tiles and k-tiles (see bpg below).
        index = cuda.blockIdx.y * threads_angle + cuda.threadIdx.y
        j = cuda.threadIdx.x + threads_j * (
            cuda.blockIdx.x % (
                (max_j + threads_j - 1) // threads_j))
        if (j >= max_j) or (index >= max_index):
            return
        # Each thread sweeps a strip of threads_k pixels in k.
        start_k = threads_k * (cuda.blockIdx.x // (
            (max_j + threads_j - 1) // threads_j))
        end_k = start_k + threads_k
        if end_k > max_k:
            end_k = max_k
        if start_k >= end_k:
            return
        # Define compile-time constants.
        step_size_1 = step_sizes_1[index]
        step_size_2 = step_sizes_2[index]
        k_vectors_c = k_vectors[index]
        j_vectors_c = j_vectors[index]
        s_vectors_c = s_vectors[index]
        dimensions_0_c = dimensions_0[index]
        dimensions_1_c = dimensions_1[index]
        dimensions_2_c = dimensions_2[index]
        dimensions = cuda.local.array(3, int32)
        dimensions[0] = dimensions_0_c
        dimensions[1] = dimensions_1_c
        dimensions[2] = dimensions_2_c
        direction_indices_c = direction_indices[index]
        distance_multiplier = distance_multipliers[index]
        # Could be chunked for very asymmetric samples.
        start_slice = 0
        end_slice = start_slice + dimensions[0]
        accumulator = cuda.local.array(projection_channels, cuda_float)
        for k in range(start_k, end_k):
            for i in range(projection_channels):
                accumulator[i] = 0.
            # Pixel-center coordinates.
            fj = cuda_float(j) + 0.5
            fk = cuda_float(k) + 0.5
            # Initial coordinates of projection.
            start_position_0 = (s_vectors_c[direction_indices_c[0]] +
                                fj * j_vectors_c[direction_indices_c[0]] +
                                fk * k_vectors_c[direction_indices_c[0]])
            start_position_1 = (s_vectors_c[direction_indices_c[1]] +
                                fj * j_vectors_c[direction_indices_c[1]] +
                                fk * k_vectors_c[direction_indices_c[1]])
            start_position_2 = (s_vectors_c[direction_indices_c[2]] +
                                fj * j_vectors_c[direction_indices_c[2]] +
                                fk * k_vectors_c[direction_indices_c[2]])
            # Centering w.r.t volume.
            position_0 = cuda_float(start_slice) + cuda_float(0.5)
            position_1 = cuda_float(step_size_1 * (cuda_float(start_slice) -
                                    0.5 * dimensions[0] - start_position_0 + 0.5) +
                                    start_position_1 + 0.5 * dimensions[1] - 0.5)
            position_2 = cuda_float(step_size_2 * (cuda_float(start_slice) -
                                    0.5 * dimensions[0] -
                                    start_position_0 + 0.5) +
                                    start_position_2 + 0.5 * dimensions[2] - 0.5)
            # March one slice at a time along the dominant axis, accumulating
            # the interpolated contributions.
            for i in range(start_slice, end_slice):
                bilinear_interpolation(field, direction_indices_c[0],
                                       position_0, position_1, position_2,
                                       dimensions, accumulator,
                                       sparse_pointers[index], sparse_weights[index],
                                       sparse_indices[index], sparse_ranges[index])
                position_0 += cuda_float(1.0)
                position_1 += step_size_1
                position_2 += step_size_2
            for i in range(projection_channels):
                projection[index, j, k, i] = accumulator[i] * distance_multiplier

    # Launching of kernel: the x grid dimension packs (j-tiles x k-tiles),
    # the y grid dimension covers projection angles.
    bpg = (((max_j + threads_j - 1) // threads_j) *
           ((max_k + threads_k - 1) // threads_k),
           ((projections.shape[0] + threads_angle - 1) // threads_angle))
    tpb = (threads_j, threads_angle)
    john_transform_grid = john_transform_inner[bpg, tpb]
    # The sparse matrix is uploaded once and captured by the returned closure.
    sparse_pointers = cuda.to_device(sparse_pointers_all)
    sparse_weights = cuda.to_device(sparse_weights_all)
    sparse_indices = cuda.to_device(sparse_indices_all)
    sparse_ranges = cuda.to_device(sparse_ranges_all)

    def transform_with_transfer(field: np.ndarray[float],
                                projections: np.ndarray[float]) -> np.ndarray[float]:
        """ Launches the projection kernel, transferring ``field`` and
        ``projections`` to the device only if they are host arrays. The
        result is copied back to the host only when ``field`` came from
        the host. """
        if _is_cuda_array(field):
            assert field.dtype == 'float32'
            device_field = cuda.as_cuda_array(field)
            transfer_projections = False
        else:
            assert field.dtype == np.float32
            device_field = cuda.to_device(field)
            transfer_projections = True
        if _is_cuda_array(projections):
            assert projections.dtype == 'float32'
            device_projections = cuda.as_cuda_array(projections)
        else:
            assert projections.dtype == np.float32
            device_projections = cuda.to_device(projections)
        john_transform_grid(device_field, device_projections, sparse_pointers,
                            sparse_weights, sparse_indices, sparse_ranges)
        if transfer_projections:
            return device_projections.copy_to_host()
        return device_projections

    return transform_with_transfer
def john_transform_adjoint_sparse_cuda(field: np.ndarray[float], projections: np.ndarray[float],
                                       sparse_matrix: tuple, unit_vector_p: np.ndarray[float],
                                       unit_vector_j: np.ndarray[float], unit_vector_k: np.ndarray[float],
                                       offsets_j: np.ndarray[float], offsets_k: np.ndarray[float]):
    """ Frontend for performing the adjoint of the John transform with parallel
    GPU computing, with a sparse index for projection-to-field mapping.

    Parameters
    ----------
    field
        The field into which the adjoint is projected, with 4 dimensions. The last index should
        have the same size as the number of tensor channels. Can be either
        a ``numpy.ndarray``, or a device array that implements the CUDA array
        interface. If a device array is given, no copying to device is needed.
    projections
        The projections from which the adjoint is calculated. Should have the same shape
        as the input data matrix.
        The first index runs over the different projection directions. Can be either
        a ``numpy.ndarray``, or a device array that implements the CUDA array
        interface. If a device array is given, no copying or synchronization is needed.
    sparse_matrix
        Contains three ``np.ndarrays``, ``indices``, ``indptr``, and ``data``.
        ``indices[indptr[i, j]]`` to ``indices[indptr[i, j+1]]`` where ``i``
        is the projection channel index and ``j`` is the projection channel index
        contains the corresponding volume channel indices.
        ``data[indptr[i, j]]`` contains the weights for each index.
    unit_vector_p
        The direction of projection in Cartesian coordinates.
    unit_vector_j
        One of the directions for the pixels of :attr:`projections`.
    unit_vector_k
        The other direction for the pixels of :attr:`projections`.
    offsets_j
        Offsets which align projections in the direction of `j`
    offsets_k
        Offsets which align projections in the direction of `k`.

    Returns
    -------
        A callable ``transform_with_transfer(field, projections)`` that launches
        the compiled adjoint kernel, copying host arrays to the device (and the
        result back to the host) only when needed.

    Notes
    -----
    For mathematical details, see :func:`~.john_transform.john_transform_adjoint`.
    """
    # Always false as we are sticking to float32
    if False:
        cuda_float = float64
        numpy_float = np.float64
        cuda_x4_type = cuda.float64x4
    else:
        cuda_float = float32
        numpy_float = np.float32
        cuda_x4_type = cuda.float32x4
    max_p = projections.shape[0]
    max_j = projections.shape[1]
    max_k = projections.shape[2]
    max_x = field.shape[0]
    max_y = field.shape[1]
    max_z = field.shape[2]

    # CUDA chunking and memory size constants.
    channels = field.shape[-1]
    projection_channels = projections.shape[-1]
    threads_z = int(8)
    # threads_angle = int(512)  # Not used for now.
    threads_x = int(8)
    threads_y = int(4)

    # Projection vectors. s for positioning the projection.
    p_vectors = unit_vector_p.astype(numpy_float)
    k_vectors = unit_vector_k.astype(numpy_float)
    j_vectors = unit_vector_j.astype(numpy_float)
    s_vectors = (unit_vector_k * (-0.5 * max_k + offsets_k.reshape(-1, 1)) +
                 unit_vector_j * (-0.5 * max_j + offsets_j.reshape(-1, 1))).astype(numpy_float)

    # Translate volume steps to normalized projection steps. Can add support for non-square voxels.
    vector_norm = np.einsum('...i, ...i', p_vectors, np.cross(j_vectors, k_vectors))
    norm_j = -np.cross(p_vectors, k_vectors) / vector_norm[..., None]
    norm_k = np.cross(p_vectors, j_vectors) / vector_norm[..., None]
    norm_offset_j = -np.einsum('...i, ...i', p_vectors, np.cross(s_vectors, k_vectors)) / vector_norm
    norm_offset_k = np.einsum('...i, ...i', p_vectors, np.cross(s_vectors, j_vectors)) / vector_norm

    # CSR-style components of the sparse detector-to-tensor matrix.
    sparse_pointers_all = sparse_matrix[0].astype(np.int32)
    sparse_indices_all = sparse_matrix[1].astype(np.int32)
    sparse_weights_all = sparse_matrix[2].astype(numpy_float)
    # Number of nonzero entries per detector channel.
    sparse_ranges_all = np.diff(sparse_pointers_all, axis=-1).astype(np.int32)

    @cuda.jit(void(Array(float32, 4, 'C', readonly=True), cuda_float,
                   cuda_float, cuda_float, cuda_float[::1],
                   Array(int32, 1, 'C', readonly=True),
                   Array(float32, 1, 'C', readonly=True),
                   Array(int32, 1, 'C', readonly=True),
                   Array(int32, 1, 'C', readonly=True)), device=True,
              lineinfo=True, fastmath=True, cache=True)
    def bilinear_interpolation_projection(projection: np.ndarray[float],
                                          r0: float,
                                          r1: float,
                                          r2: float,
                                          accumulator: np.ndarray[float],
                                          sparse_pointers: np.ndarray[int],
                                          sparse_weights: np.ndarray[float],
                                          sparse_indices: np.ndarray[int],
                                          sparse_ranges: np.ndarray[int]):
        """ Bilinearly interpolates in projection pixel coordinates (r1, r2) of
        projection ``r0`` and scatters the result into the volume-channel
        accumulator through the sparse matrix. """
        # Outside the projection entirely: contributes nothing.
        if not (-1 <= r1 < max_j and -1 <= r2 < max_k):
            return
        # At edges, use nearest-neighbor interpolation.
        if (0 <= r1 + 1 < max_j):
            r1_weight = int32(1)
        else:
            r1_weight = int32(0)
        if (0 <= r2 + 1 < max_k):
            r2_weight = int32(1)
        else:
            r2_weight = int32(0)
        # Edge weights flag positions in the half-pixel band just below index 0.
        if int32(r1) == -1:
            r1_edge_weight = 1
        else:
            r1_edge_weight = 0
        if int32(r2) == -1:
            r2_edge_weight = 1
        else:
            r2_edge_weight = 0
        y_weight = cuda_float((r1 - floor(r1)) * r1_weight) * (1 - r1_edge_weight)
        z_weight = cuda_float((r2 - floor(r2)) * r2_weight) * (1 - r2_edge_weight)
        x = int32(floor(r0))
        y = int32(floor(r1) + r1_edge_weight)
        y2 = y + r1_weight
        z = int32(floor(r2) + r2_edge_weight)
        z2 = z + r2_weight
        # Four bilinear corner weights packed into one float4:
        # t.x -> (y, z), t.z -> (y2, z), t.w -> (y, z2), t.y -> (y2, z2).
        t = cuda_x4_type((1 - z_weight) * (1 - y_weight),
                         (z_weight * y_weight),
                         ((1 - z_weight) * y_weight),
                         ((1 - y_weight) * z_weight))
        # Scatter each detector channel's interpolated value onto the volume
        # channels coupled to it by the sparse matrix.
        for i in range(projection_channels):
            pointer = sparse_pointers[i]
            for j in range(sparse_ranges[i]):
                index = sparse_indices[pointer + j]
                weight = sparse_weights[pointer + j]
                accumulator[index] += projection[x, y, z, i] * t.x * weight
                accumulator[index] += projection[x, y2, z, i] * t.z * weight
                accumulator[index] += projection[x, y, z2, i] * t.w * weight
                accumulator[index] += projection[x, y2, z2, i] * t.y * weight

    @cuda.jit(void(cuda_float[:, :, :, ::1], Array(float32, 4, 'C', readonly=True),
                   Array(int32, 2, 'C', readonly=True),
                   Array(float32, 2, 'C', readonly=True),
                   Array(int32, 2, 'C', readonly=True),
                   Array(int32, 2, 'C', readonly=True)),
              lineinfo=True, fastmath=True, cache=True)
    def john_transform_adjoint_inner(field: np.ndarray[float], projection: np.ndarray[float],
                                     sparse_pointers, sparse_weights, sparse_indices,
                                     sparse_ranges):
        """ Performs the John transform of a field. Relies on a large number
        of pre-defined constants outside the kernel body. """
        # Indexing of volume coordinates. The x grid dimension packs
        # (x-tiles x y-tiles); z is strided over by blockIdx.y.
        x = cuda.threadIdx.x + threads_x * (cuda.blockIdx.x % (
            (max_x + threads_x - 1) // threads_x))
        y = cuda.threadIdx.y + threads_y * (cuda.blockIdx.x // (
            (max_x + threads_x - 1) // threads_x))
        if (x >= max_x) or (y >= max_y):
            return
        # Stride in z: each thread handles up to threads_z consecutive slices.
        z = cuda.blockIdx.y * threads_z
        z_acc = cuda.local.array(channels, cuda_float)
        # Center of voxel and coordinate system.
        fx = x - 0.5 * max_x + 0.5
        fy = y - 0.5 * max_y + 0.5
        steps = min(max(max_z - z, 0), threads_z)
        # Compile time constants
        start_index = 0
        stop_index = max_p
        norm_j_c = norm_j[start_index:stop_index]
        norm_k_c = norm_k[start_index:stop_index]
        norm_offset_j_c = norm_offset_j[start_index:stop_index]
        norm_offset_k_c = norm_offset_k[start_index:stop_index]
        for ii in range(steps):
            fz = z - 0.5 * max_z + 0.5
            for j in range(channels):
                z_acc[j] = 0.0
            # Accumulate contributions from every projection direction.
            for a in range(start_index, stop_index):
                # Center with respect to projection.
                fj = (norm_offset_j_c[a] + fx * norm_j_c[a][0] +
                      fy * norm_j_c[a][1] + fz * norm_j_c[a][2] - 0.5)
                fk = (norm_offset_k_c[a] + fx * norm_k_c[a][0] +
                      fy * norm_k_c[a][1] + fz * norm_k_c[a][2] - 0.5)
                bilinear_interpolation_projection(projection,
                                                  a, fj, fk, z_acc,
                                                  sparse_pointers[a],
                                                  sparse_weights[a],
                                                  sparse_indices[a],
                                                  sparse_ranges[a])
            for i in range(channels):
                field[x, y, z, i] = z_acc[i]
            z += 1

    # Grid: x dimension packs (x-tiles x y-tiles), y dimension covers z strides.
    bpg = (((max_x + threads_x - 1) // threads_x) *
           ((max_y + threads_y - 1) // threads_y),
           ((max_z + threads_z - 1) // threads_z))
    tpb = (threads_x, threads_y)
    john_transform_adjoint_grid = john_transform_adjoint_inner[bpg, tpb]
    # The sparse matrix is uploaded once and captured by the returned closure.
    sparse_pointers = cuda.to_device(sparse_pointers_all)
    sparse_weights = cuda.to_device(sparse_weights_all)
    sparse_indices = cuda.to_device(sparse_indices_all)
    sparse_ranges = cuda.to_device(sparse_ranges_all)

    def transform_with_transfer(field: np.ndarray[float],
                                projections: np.ndarray[float]) -> np.ndarray[float]:
        """ Launches the adjoint kernel, transferring ``field`` and
        ``projections`` to the device only if they are host arrays. The
        resulting field is copied back to the host only when ``projections``
        came from the host. """
        if _is_cuda_array(field):
            assert field.dtype == 'float32'
            device_field = cuda.as_cuda_array(field)
        else:
            assert field.dtype == np.float32
            device_field = cuda.to_device(field)
        if _is_cuda_array(projections):
            assert projections.dtype == 'float32'
            device_projections = cuda.as_cuda_array(projections)
            transfer_field = False
        else:
            assert projections.dtype == np.float32
            device_projections = cuda.to_device(projections)
            transfer_field = True
        john_transform_adjoint_grid(device_field,
                                    device_projections,
                                    sparse_pointers,
                                    sparse_weights,
                                    sparse_indices,
                                    sparse_ranges)
        if transfer_field:
            return device_field.copy_to_host()
        return device_field

    return transform_with_transfer