# Source code for mumott.core.hashing

import logging
import hashlib
from _hashlib import HASH

import numpy as np

from numpy.typing import NDArray

logger = logging.getLogger(__name__)


def _cf_hasher(hashing_function: HASH, item) -> None:
    """ Internal method for hashing floats and complex numbers,
        or arrays of them.

    Every value is decomposed with ``np.frexp`` into a mantissa and an
    integer exponent. The mantissas are rounded to five decimals, and the
    bytes of the rounded mantissas followed by the bytes of the exponents
    are fed to :attr:`hashing_function`.

    Parameters
    ----------
    hashing_function
        Hash object, updated in place.
    item
        A float or complex scalar, or an array(-like) of such values.
        Arrays are flattened; for complex input all real parts are hashed
        before all imaginary parts.
    """
    if not isinstance(item, np.ndarray):
        item = np.array(item)
    item = item.ravel()
    if item.dtype.kind == 'c':
        # frexp only works on reals
        item = np.concatenate((item.real, item.imag))
    mantissa, exponent = np.frexp(item)
    # Round first, so that the normalization below also sees mantissas that
    # were only pushed up to 1.0 by the rounding itself (e.g. 0.999996).
    mantissa = mantissa.round(5)
    # frexp mantissas have magnitude in [0.5, 1), so a rounded magnitude of
    # 1.0 must be re-expressed as 0.5 * 2**(exponent + 1); otherwise values
    # just below a power of two hash differently from that power of two.
    # isclose (rather than >=) keeps inf and nan out of this branch.
    rounded_to_one = np.isclose(np.abs(mantissa), 1.0, atol=1e-6, rtol=1e-6)
    mantissa[rounded_to_one] = np.sign(mantissa[rounded_to_one]) * 0.5
    exponent[rounded_to_one] += 1
    # -0.0 == 0.0 but their byte patterns differ; collapse to +0.0.
    mantissa[mantissa == 0] = 0.0
    hashing_function.update(mantissa)
    hashing_function.update(exponent)


def _array_hasher(hashing_function: HASH, item: NDArray) -> None:
    """ Internal method for hashing arrays, lists and tuples.

    Parameters
    ----------
    hashing_function
        Hash object, updated in place.
    item
        A ``np.ndarray``, or a ``list``/``tuple`` which is first converted
        with ``np.array``.

    Raises
    ------
    TypeError
        If the ``dtype`` of the array is not one of the kinds handled
        below (e.g. ``object``), or is a structured ``dtype`` containing
        ``object`` fields.
    """
    if type(item) in (list, tuple):
        item = np.array(item)
    kind = item.dtype.kind
    # kind is void (raw bytes or structured); numpy's kind code is upper-case 'V'
    if kind == 'V':
        if item.dtype.hasobject:
            # Object fields store references, not values
            raise TypeError(f'Hash of dtype `object` is not deterministic, cannot hash {item}')
        # Raw bytes in C order; tobytes also copes with non-contiguous input.
        # NOTE(review): padding bytes of aligned structured dtypes are hashed
        # as-is -- confirm that such arrays are not expected here.
        hashing_function.update(item.tobytes())
    # kind is bytes, int, uint, string, unicode
    elif kind in ('b', 'i', 'u', 'S', 'U'):
        hashing_function.update(np.char.encode(item.astype(str), 'utf-8'))
    # kind is float or complex
    elif kind in ('f', 'c'):
        _cf_hasher(hashing_function, item)
    # unknown data, possibly ragged array etc
    else:
        raise TypeError(f'Hash of dtype `object` is not deterministic, cannot hash {item}')


def _item_hasher(hashing_function: HASH, item) -> None:
    """ Internal method for hashing floats, integers and strings.

    Parameters
    ----------
    hashing_function
        Hash object, updated in place.
    item
        A single entry. ``None`` is ignored, i.e. nothing is added to the hash.

    Raises
    ------
    TypeError
        If :attr:`item` is of a type that cannot be hashed deterministically.
    """
    if item is None:
        return
    # Convert once; the resulting dtype kind decides how the item is hashed.
    as_array = np.array(item)
    kind = as_array.dtype.kind
    # kind is void; numpy's kind code is upper-case 'V'
    if kind == 'V':
        if as_array.dtype.hasobject:
            # Object fields store references, not values
            raise TypeError(f'Cannot hash unknown object: {item}')
        hashing_function.update(as_array.tobytes())
    elif kind in ('b', 'i', 'u', 'S', 'U'):
        # Cast all ints, strings, etc to string and encode
        hashing_function.update(str(item).encode('utf-8'))
    elif kind in ('f', 'c'):
        _cf_hasher(hashing_function, item)
    else:
        # Includes dtype `object`; raising rather than skipping avoids
        # distinct inputs silently yielding the same hash.
        raise TypeError(f'Cannot hash unknown object: {item}')


def _dict_hasher(hashing_function: HASH, item) -> None:
    """ Internal method for hashing dictionaries.

    Entries are visited in the iteration order of the dictionary. For each
    entry the ``'utf-8'``-encoded key is added to the hash, followed by the
    value, which goes through the array hasher if it is a ``np.ndarray``,
    ``list`` or ``tuple`` and through the item hasher otherwise.
    """
    for name, entry in item.items():
        hashing_function.update(name.encode('utf-8'))
        is_array_like = type(entry) in (list, tuple) or isinstance(entry, np.ndarray)
        hash_entry = _array_hasher if is_array_like else _item_hasher
        hash_entry(hashing_function, entry)


def list_to_hash(list_to_hash: list, hashing_algorithm: str = 'blake2b') -> str:
    """
    Function which takes a list containing a set of objects and automatically
    generates a deterministic hash for them.

    Parameters
    ----------
    list_to_hash
        List of a set of objects of various types, see `notes` for a complete list.
    hashing_algorithm
        The hashing algorithm to use. Can be any algorithm name in
        ``hashlib.algorithms_available``. Default is ``'blake2b'``.

    Returns
    -------
        The hexadecimal digest of the hash of all entries, taken in order.
        For an empty list this is the digest of empty input.

    Example
    -------
    The following code snippets illustrate hashing lists that will work, and ones
    that will not work.

    Works: A list of an integer, an array, a dictionary with valid types, and a None.

    >>> import numpy as np
    >>> from mumott.core.hashing import list_to_hash
    >>> print(list_to_hash([1, np.array((1, 3, 5)), dict(val=1, string='abc'), None]))
    2a949c...

    Does not work: an array containing a ``None``, due to the ``dtype`` being ``object``.

    >>> print(list_to_hash([np.array([None])]))
    Traceback (most recent call last):
    ...
    TypeError: Hash of dtype `object` is not deterministic, cannot hash [None]

    Does not work: a generator expression, which is an unknown object.

    >>> print(list_to_hash([(a for a in [1, 2, 3])]))
    Traceback (most recent call last):
    ...
    TypeError: Cannot hash unknown object: <generator object...

    Notes
    -----
    ``float``-type objects are rounded to five significant digits in the mantissa before hashing.
    This is necessary to obtain semi-deterministic hashes that obey a subset of fuzzy equality
    for float comparison. There are edge cases where equality can fail due to
    rounding errors, but these should be extremely rare.

    Supported entry types in :attr:`list_to_hash`:
        ``int``
            Cast to string.
            Works along with similar ``numpy`` types.
        ``float``
            Mantissa rounded to five significant digits and concatenated with exponent.
            Works along with similar ``numpy`` types.
        ``complex``
            Real and imaginary parts concatenated and treated like ``float``.
            Works along with similar ``numpy`` types.
        ``str``
            Automatically given ``'utf-8'`` encoding.
        ``bytes``
            Cast to string.
        ``None``
            Ignored.
        ``np.ndarray``
            Provided ``dtype`` is not ``object``, hence arrays of ``None`` are not allowed.
        ``list``, ``tuple``
            Provided they can be cast to allowed, i.e. non-ragged ``np.ndarray``
        ``dict``
            Assuming entries are allowed types. Keys and entries are concatenated.
            If an entry is ``None``, the key is added to the hash while the entry is ignored.
    """
    hashing_function = hashlib.new(hashing_algorithm)
    for item in list_to_hash:
        # Dispatch on the container type; everything else is treated as a single item
        if isinstance(item, np.ndarray) or type(item) in (list, tuple):
            _array_hasher(hashing_function, item)
        elif type(item) is dict:
            _dict_hasher(hashing_function, item)
        else:
            _item_hasher(hashing_function, item)
    return hashing_function.hexdigest()