Source code for mumott.core.cuda_kernels

import numba
from numba import cuda


def cuda_weighted_difference(shape: tuple[int]):
    """ Compiles a CUDA kernel for a 'weighted difference', i.e., ``a = (a - b) * c``.
    For example, ``a`` could be an approximation of ``b``, and ``c`` could be the
    weight to assign to the residual of ``a`` and ``b``.

    Parameters
    ----------
    shape
        The shape of ``a``, ``b``, and ``c``.

    Returns
    -------
        A CUDA callable that takes 3 inputs, ``data``, ``value``, and ``weights``,
        and stores the output in ``value``. The difference is computed as
        ``(value * weights) - (data * weights)``.
    """
    tpb = (4, 4, 4)
    bpg = (shape[0] // tpb[0] + 1,
           shape[1] // tpb[1] + 1,
           shape[2] // tpb[2] + 1)

    @cuda.jit(numba.void(numba.types.Array(numba.float32, 4, 'C', readonly=True),
                         numba.float32[:, :, :, ::1],
                         numba.types.Array(numba.float32, 4, 'C', readonly=True)))
    def weighted_difference(data, value, weights):
        i, j, k = cuda.grid(3)
        if (i < shape[0]) and (j < shape[1]) and (k < shape[2]):
            for h in range(shape[3]):
                # Use intrinsic fused multiply-add.
                d = -data[i, j, k, h] * weights[i, j, k, h]
                value[i, j, k, h] = cuda.fma(value[i, j, k, h], weights[i, j, k, h], d)

    return weighted_difference[bpg, tpb]
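# Usage sketch (illustrative, not part of the original module): a kernel
# compiled by cuda_weighted_difference is preconfigured with its launch
# parameters and can be called directly on device arrays.
#
#   import numpy as np
#   shape = (16, 16, 16, 8)
#   kernel = cuda_weighted_difference(shape)
#   data = cuda.to_device(np.ones(shape, dtype=np.float32))
#   value = cuda.to_device(np.full(shape, 3., dtype=np.float32))
#   weights = cuda.to_device(np.full(shape, 0.5, dtype=np.float32))
#   kernel(data, value, weights)
#   # value.copy_to_host() now holds (3. - 1.) * 0.5 = 1. everywhere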
def cuda_weighted_sign(shape: tuple[int], delta: float = 0.):
    """ Compiles a CUDA kernel for a 'weighted sign', i.e., ``a = sgn(a - b) * c``.
    For example, ``a`` could be an approximation of ``b``, and ``c`` could be the
    weight to assign to the residual of ``a`` and ``b``. If ``delta`` is greater
    than 0, the kernel instead computes ``(a - b) * c / (2 * delta)`` wherever
    ``abs(a - b) < delta``.

    Parameters
    ----------
    shape
        The shape of ``a``, ``b``, and ``c``.
    delta
        Threshold at which to switch from the sign to the actual difference.

    Returns
    -------
        A CUDA callable that takes 3 inputs, ``data``, ``value``, and ``weights``,
        and stores the output in ``value``.
    """
    tpb = (4, 4, 4)
    bpg = (shape[0] // tpb[0] + 1,
           shape[1] // tpb[1] + 1,
           shape[2] // tpb[2] + 1)

    @cuda.jit(numba.void(numba.types.Array(numba.float32, 4, 'C', readonly=True),
                         numba.float32[:, :, :, ::1],
                         numba.types.Array(numba.float32, 4, 'C', readonly=True)))
    def weighted_sign(data, value, weights):
        i, j, k = cuda.grid(3)
        if (i < shape[0]) and (j < shape[1]) and (k < shape[2]):
            for h in range(shape[3]):
                d = value[i, j, k, h] - data[i, j, k, h]
                ad = abs(d)
                if (ad < delta) and (delta > 0):
                    scale = weights[i, j, k, h] * ad / (2 * delta)
                else:
                    scale = weights[i, j, k, h]
                # copysignf transfers the sign of the difference onto the magnitude.
                value[i, j, k, h] = cuda.libdevice.copysignf(scale, d)

    return weighted_sign[bpg, tpb]
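# Note (illustrative): with delta > 0 this behaves like the gradient of a
# weighted Huber-type loss -- a linear ramp for small residuals and a pure
# weighted sign for large ones.
#
#   kernel = cuda_weighted_sign((16, 16, 16, 8), delta=0.5)
#   kernel(data, value, weights)  # same call signature as the weighted difference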
def cuda_scaled_difference(shape: tuple[int]):
    """ Compiles a CUDA kernel for a 'scaled difference', i.e., ``a -= b * c``,
    for three 4-dimensional arrays, e.g., a data, gradient, and preconditioner array.

    Parameters
    ----------
    shape
        The shape of ``a`` and ``b`` as a 4-tuple.

    Returns
    -------
        A CUDA callable which takes 3 inputs, ``gradient``, ``value``, and
        ``scaling``. The output is stored in ``value``. All inputs must be
        4D arrays with shape ``shape``.
    """
    tpb = (4, 4, 4)
    bpg = (shape[1] // tpb[0] + 1,
           shape[2] // tpb[1] + 1,
           shape[3] // tpb[2] + 1)

    @cuda.jit(numba.void(numba.types.Array(numba.float32, 4, 'C', readonly=True),
                         numba.float32[:, :, :, ::1],
                         numba.types.Array(numba.float32, 4, 'C', readonly=True)))
    def scaled_difference(gradient, value, scaling):
        i, j, k = cuda.grid(3)
        if (i < shape[1]) and (j < shape[2]) and (k < shape[3]):
            for h in range(shape[0]):
                value[h, i, j, k] -= gradient[h, i, j, k] * scaling[h, i, j, k]

    return scaled_difference[bpg, tpb]
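# Usage sketch (illustrative, hypothetical array names): a preconditioned
# gradient step, with `value` holding the current coefficients and `scaling`
# a preconditioner.
#
#   step = cuda_scaled_difference(shape)
#   step(gradient, value, scaling)  # value -= gradient * scaling, in place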
def cuda_sum(shape: tuple[int]):
    """ Compiles a CUDA kernel for the summation of two 4D arrays, e.g., two gradients.

    Parameters
    ----------
    shape
        A 4-tuple giving the shape of the two arrays.

    Returns
    -------
        A CUDA callable which takes an ``old_gradient`` input/output and a
        ``new_gradient`` input. The sum is stored in-place in ``old_gradient``.
    """
    tpb = (4, 4, 4)
    bpg = (shape[1] // tpb[0] + 1,
           shape[2] // tpb[1] + 1,
           shape[3] // tpb[2] + 1)

    @cuda.jit(numba.void(numba.types.Array(numba.float32, 4, 'C', readonly=False),
                         numba.types.Array(numba.float32, 4, 'C', readonly=True)))
    def cuda_sum(old_gradient, new_gradient):
        i, j, k = cuda.grid(3)
        if (i < shape[1]) and (j < shape[2]) and (k < shape[3]):
            for h in range(shape[0]):
                old_gradient[h, i, j, k] += new_gradient[h, i, j, k]

    return cuda_sum[bpg, tpb]
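# Usage sketch (illustrative, hypothetical array names): accumulating
# per-frame gradients in place.
#
#   accumulate = cuda_sum(shape)
#   accumulate(total_gradient, partial_gradient)  # total_gradient += partial_gradient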
def cuda_difference(shape: tuple[int]):
    """ Compiles a CUDA kernel for the difference of two 4D arrays, e.g.,
    a value and a gradient.

    Parameters
    ----------
    shape
        A 4-tuple giving the shape of the two arrays.

    Returns
    -------
        A CUDA callable which takes a ``value`` input/output and a ``gradient``
        input. The difference is stored in-place in ``value``.
    """
    tpb = (4, 4, 4)
    bpg = (shape[1] // tpb[0] + 1,
           shape[2] // tpb[1] + 1,
           shape[3] // tpb[2] + 1)

    @cuda.jit(numba.void(numba.types.Array(numba.float32, 4, 'C', readonly=False),
                         numba.types.Array(numba.float32, 4, 'C', readonly=True)))
    def cuda_difference(value, gradient):
        i, j, k = cuda.grid(3)
        if (i < shape[1]) and (j < shape[2]) and (k < shape[3]):
            for h in range(shape[0]):
                value[h, i, j, k] -= gradient[h, i, j, k]

    return cuda_difference[bpg, tpb]
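# Usage sketch (illustrative, hypothetical array names): a plain,
# unpreconditioned update step.
#
#   subtract = cuda_difference(shape)
#   subtract(value, gradient)  # value -= gradient, in place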
def cuda_framewise_contraction(shape: tuple[int], rows: int, columns: int):
    """ Compiles a CUDA kernel for the framewise contraction of a tensor field
    and a matrix stack:
    ``out[i, j, k, g] = sum_h(field[i, j, k, h] * matrix[i, g, h])``.
    In ``numpy.einsum`` notation, this would be ``'ijkh, igh -> ijkg'``.

    Parameters
    ----------
    shape
        A 3-tuple giving the shape of the first three dimensions of the field
        (the ``(i, j, k)`` dimensions).
    rows
        The number of rows in the matrix/the output vector length
        (the ``g`` dimension).
    columns
        The number of columns in the matrix/the input vector length
        (the ``h`` dimension).

    Returns
    -------
        A CUDA callable which takes ``field`` and ``matrix`` inputs and an
        ``out`` output.
    """
    tpb = (8, 8)
    bpg = (shape[0] // tpb[0] + 1,
           shape[1] // tpb[1] + 1)

    @cuda.jit(numba.void(numba.types.Array(numba.float32, 4, 'C', readonly=True),
                         numba.types.Array(numba.float32, 3, 'C', readonly=True),
                         numba.types.Array(numba.float32, 4, 'C', readonly=False)))
    def cuda_framewise_contraction(field, matrix, out):
        i, j = cuda.grid(2)
        if (i < shape[0]) and (j < shape[1]):
            temp = cuda.local.array(rows, numba.float32)
            for k in range(shape[2]):
                for g in range(rows):
                    temp[g] = 0.
                for h in range(columns):
                    tf = field[i, j, k, h]
                    for g in range(rows):
                        temp[g] += matrix[i, g, h] * tf
                for g in range(rows):
                    out[i, j, k, g] = temp[g]

    return cuda_framewise_contraction[bpg, tpb]
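# Correctness sketch (illustrative, hypothetical shapes): the kernel should
# agree with the equivalent numpy.einsum contraction on the host.
#
#   import numpy as np
#   shape, rows, columns = (8, 8, 8), 6, 4
#   field = np.random.rand(*shape, columns).astype(np.float32)
#   matrix = np.random.rand(shape[0], rows, columns).astype(np.float32)
#   out = cuda.to_device(np.zeros((*shape, rows), dtype=np.float32))
#   cuda_framewise_contraction(shape, rows, columns)(
#       cuda.to_device(field), cuda.to_device(matrix), out)
#   assert np.allclose(out.copy_to_host(),
#                      np.einsum('ijkh, igh -> ijkg', field, matrix), atol=1e-4)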
def cuda_framewise_contraction_adjoint(shape: tuple[int], rows: int, columns: int):
    """ Compiles a CUDA kernel for the adjoint of the framewise contraction of
    a tensor field and a matrix stack:
    ``out[i, j, k, h] = sum_g(field[i, j, k, g] * matrix[i, g, h])``.
    In ``numpy.einsum`` notation, this would be ``'ijkg, igh -> ijkh'``.

    Parameters
    ----------
    shape
        A 3-tuple giving the shape of the first three dimensions of the field
        (the ``(i, j, k)`` dimensions).
    rows
        The number of rows in the matrix/the input vector length
        (the ``g`` dimension).
    columns
        The number of columns in the matrix/the output vector length
        (the ``h`` dimension).

    Returns
    -------
        A CUDA callable which takes ``field`` and ``matrix`` inputs and an
        ``out`` output.
    """
    tpb = (8, 8)
    bpg = (shape[0] // tpb[0] + 1,
           shape[1] // tpb[1] + 1)

    @cuda.jit(numba.void(numba.types.Array(numba.float32, 4, 'C', readonly=True),
                         numba.types.Array(numba.float32, 3, 'C', readonly=True),
                         numba.types.Array(numba.float32, 4, 'C', readonly=False)))
    def cuda_framewise_contraction_adjoint(field, matrix, out):
        i, j = cuda.grid(2)
        if (i < shape[0]) and (j < shape[1]):
            temp = cuda.local.array(columns, numba.float32)
            for k in range(shape[2]):
                for h in range(columns):
                    temp[h] = 0.
                for g in range(rows):
                    tg = field[i, j, k, g]
                    for h in range(columns):
                        temp[h] += matrix[i, g, h] * tg
                for h in range(columns):
                    out[i, j, k, h] = temp[h]

    return cuda_framewise_contraction_adjoint[bpg, tpb]
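# Correctness sketch (illustrative): the adjoint can be validated with a
# dot-product test against the forward kernel, <A x, y> == <x, A^T y>, or
# against the numpy reference
#
#   np.einsum('ijkg, igh -> ijkh', field, matrix)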
def cuda_rescale_array(shape: tuple[int]):
    """ Compiles a CUDA kernel for the rescaling of a gradient with a momentum
    term, or a similar rescaling of a 4-dimensional array by another
    4-dimensional array.

    Parameters
    ----------
    shape
        The shape of the coefficients to which the gradient will ultimately be applied.

    Returns
    -------
        A compiled CUDA callable which takes two arrays, an input/output
        ``gradient`` and an input ``scaling``.
    """
    tpb = (4, 4, 4)
    bpg = (shape[1] // tpb[0] + 1,
           shape[2] // tpb[1] + 1,
           shape[3] // tpb[2] + 1)

    @cuda.jit(numba.void(numba.types.Array(numba.float32, 4, 'C', readonly=False),
                         numba.types.Array(numba.float32, 4, 'C', readonly=True)))
    def scale(gradient, scaling):
        i, j, k = cuda.grid(3)
        if (i < shape[1]) and (j < shape[2]) and (k < shape[3]):
            for h in range(shape[0]):
                gradient[h, i, j, k] *= scaling[h, i, j, k]

    return scale[bpg, tpb]
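# Usage sketch (illustrative, hypothetical array names): elementwise rescaling
# of a gradient by, e.g., a spatially varying preconditioner.
#
#   rescale = cuda_rescale_array(shape)
#   rescale(gradient, scaling)  # gradient *= scaling, in place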
def cuda_lower_bound(shape: tuple[int], lower_bound: float = 0.):
    """ Compiles a CUDA kernel that enforces a lower bound on a 4-dimensional
    field. The computation is
    ``field[i, j, k, h] = max(field[i, j, k, h], lower_bound)``.

    Parameters
    ----------
    shape
        The shape of the coefficients to threshold with the lower bound.
    lower_bound
        The lower bound to enforce. Default is ``0.``.

    Returns
    -------
        A compiled CUDA callable which takes one array, an input/output ``field``.
    """
    tpb = (4, 4, 4)
    bpg = (shape[0] // tpb[0] + 1,
           shape[1] // tpb[1] + 1,
           shape[2] // tpb[2] + 1)

    @cuda.jit(numba.void(numba.types.Array(numba.float32, 4, 'C', readonly=False)))
    def threshold(field):
        i, j, k = cuda.grid(3)
        lb = numba.float32(lower_bound)
        if (i < shape[0]) and (j < shape[1]) and (k < shape[2]):
            for h in range(shape[3]):
                field[i, j, k, h] = cuda.libdevice.fmax(field[i, j, k, h], lb)

    return threshold[bpg, tpb]
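# Usage sketch (illustrative, hypothetical array name): enforcing
# non-negativity of a field after an update step.
#
#   clamp = cuda_lower_bound(shape, lower_bound=0.)
#   clamp(field)  # field[...] = max(field[...], 0.), in place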
def cuda_rescale(shape: tuple[int], momentum: float = 0.9):
    """ Compiles a CUDA kernel for the rescaling of a gradient with a momentum
    term, or a similar rescaling of a 4-dimensional array by a scalar.

    Parameters
    ----------
    shape
        The shape of the coefficients to which the gradient will ultimately be applied.
    momentum
        The momentum weight, from 0 to 1. Default is ``0.9``.

    Returns
    -------
        A compiled CUDA callable which takes one array, an input/output ``gradient``.
    """
    tpb = (4, 4, 4)
    bpg = (shape[1] // tpb[0] + 1,
           shape[2] // tpb[1] + 1,
           shape[3] // tpb[2] + 1)

    @cuda.jit(numba.void(numba.types.Array(numba.float32, 4, 'C', readonly=False)))
    def scale(gradient):
        i, j, k = cuda.grid(3)
        scaling = numba.float32(momentum)
        if (i < shape[1]) and (j < shape[2]) and (k < shape[3]):
            for h in range(shape[0]):
                gradient[h, i, j, k] *= scaling

    return scale[bpg, tpb]
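# Usage sketch (illustrative, hypothetical array names): a momentum-style
# gradient update built from the kernels above.
#
#   decay = cuda_rescale(shape, momentum=0.9)
#   decay(gradient)                          # gradient *= 0.9
#   cuda_sum(shape)(gradient, new_gradient)  # gradient += new_gradient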
def cuda_l1_gradient(shape: tuple[int], weight: float = 1e-4):
    """ Compiles a CUDA kernel for the gradient of an L1 regularizer.

    Parameters
    ----------
    shape
        The shape of the coefficients to which the gradient will ultimately be applied.
    weight
        The weight of the L1 gradient.

    Returns
    -------
        A compiled CUDA callable which takes two arrays, an input ``coefficients``
        array and an output ``gradient`` array, both of shape ``shape`` and
        dtype ``float32``.
    """
    tpb = (4, 4, 4)
    bpg = (shape[1] // tpb[0] + 1,
           shape[2] // tpb[1] + 1,
           shape[3] // tpb[2] + 1)

    @cuda.jit(numba.void(numba.types.Array(numba.float32, 4, 'C', readonly=True),
                         numba.types.Array(numba.float32, 4, 'C', readonly=False)))
    def l1_gradient(coefficients, gradient):
        i, j, k = cuda.grid(3)
        scale = numba.float32(weight)
        if (i < shape[1]) and (j < shape[2]) and (k < shape[3]):
            for h in range(shape[0]):
                # Use the CUDA intrinsic for a scaled sign function.
                gradient[h, i, j, k] += cuda.libdevice.copysignf(scale, coefficients[h, i, j, k])

    return l1_gradient[bpg, tpb]
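# Correctness sketch (illustrative): for nonzero coefficients the kernel adds
# the same quantity as the numpy expression
#
#   gradient += weight * np.sign(coefficients)
#
# (copysignf(scale, 0.) returns +scale, however, whereas np.sign(0.) is 0).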
def cuda_tv_gradient(shape: tuple[int], weight: float = 1e-4):
    """ Compiles a CUDA kernel for the gradient of a Total Variation regularizer.
    Gradient values at the edges are set to 0.

    Parameters
    ----------
    shape
        The shape of the coefficients to which the gradient will ultimately be applied.
    weight
        The weight of the TV gradient.

    Returns
    -------
        A compiled CUDA callable which takes two arrays, an input ``coefficients``
        array and an output ``gradient`` array, both of shape ``shape`` and dtype
        ``float32``. The value of the TV gradient is added to the ``gradient`` array.
    """
    tpb = (4, 4, 4)
    bpg = (shape[0] // tpb[0] + 1,
           shape[1] // tpb[1] + 1,
           shape[2] // tpb[2] + 1)

    @cuda.jit(numba.void(numba.types.Array(numba.float32, 4, 'C', readonly=True),
                         numba.types.Array(numba.float32, 4, 'C', readonly=False)))
    def tv_gradient(coefficients, gradient):
        i, j, k = cuda.grid(3)
        scale = numba.float32(weight)
        if (i < shape[0]) and (j < shape[1]) and (k < shape[2]):
            # Zero the edges to simplify handling while maintaining edge conditions.
            if ((i == 0) or (i == shape[0] - 1) or
                    (j == 0) or (j == shape[1] - 1) or
                    (k == 0) or (k == shape[2] - 1)):
                for h in range(shape[3]):
                    gradient[i, j, k, h] = 0.
                return
            else:
                for h in range(shape[3]):
                    numerator = 6 * coefficients[i, j, k, h]
                    denominator = 0.
                    numerator -= coefficients[i - 1, j, k, h]
                    numerator -= coefficients[i + 1, j, k, h]
                    numerator -= coefficients[i, j - 1, k, h]
                    numerator -= coefficients[i, j + 1, k, h]
                    numerator -= coefficients[i, j, k - 1, h]
                    numerator -= coefficients[i, j, k + 1, h]
                    denominator += (coefficients[i, j, k, h] - coefficients[i - 1, j, k, h]) ** 2
                    denominator += (coefficients[i, j, k, h] - coefficients[i + 1, j, k, h]) ** 2
                    denominator += (coefficients[i, j, k, h] - coefficients[i, j - 1, k, h]) ** 2
                    denominator += (coefficients[i, j, k, h] - coefficients[i, j + 1, k, h]) ** 2
                    denominator += (coefficients[i, j, k, h] - coefficients[i, j, k - 1, h]) ** 2
                    denominator += (coefficients[i, j, k, h] - coefficients[i, j, k + 1, h]) ** 2
                    if denominator > 0.:
                        # Use the intrinsic square root.
                        gradient[i, j, k, h] += \
                            scale * numerator / cuda.libdevice.fsqrt_rn(denominator)

    return tv_gradient[bpg, tpb]
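# Correctness sketch (illustrative): at an interior voxel the kernel adds
#
#   weight * (6 * c - sum(neighbours)) / sqrt(sum((c - neighbour) ** 2))
#
# where both sums run over the six face neighbours of c = coefficients[i, j, k, h];
# this is the gradient of an isotropic total-variation term, with edge voxels zeroed.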