# Source code for mumott.core.hashing

import logging
import hashlib
from _hashlib import HASH

import numpy as np

from numpy.typing import NDArray

logger = logging.getLogger(__name__)


def _cf_hasher(hashing_function, item) -> None:
    """ Internal method for hashing floats and complex number,
        or arrays of them. """
    if not isinstance(item, np.ndarray):
        item = np.array(item)
    item = item.ravel()
    if item.dtype.kind == 'c':
        # frexp only works on reals
        item = np.concatenate((item.real, item.imag))
    mantissa, exponent = np.frexp(item)
    edge_cases = np.isclose(abs(mantissa), 1.0, atol=1e-6, rtol=1e-6)
    mantissa[edge_cases] = np.sign(mantissa[edge_cases]) * 0.5
    exponent[edge_cases] = exponent[edge_cases] + 1
    # Round mantissa for consistency
    hashing_function.update(mantissa.round(5))
    hashing_function.update(exponent)


def _array_hasher(hashing_function: HASH, item: NDArray) -> None:
    """ Internal method for hashing arrays, lists and tuples. """
    if type(item) in (list, tuple):
        item = np.array(item)
    if item.dtype.kind in ('v'):
        hashing_function.update(item)
    # kind is bytes, int, uint, string, unicode
    if item.dtype.kind in ('biuSU'):
        hashing_function.update(np.char.encode(item.astype(str), 'utf-8'))
    # kind is float or complex
    elif item.dtype.kind in ('fc'):
        _cf_hasher(hashing_function, item)
    # unknown data, possibly ragged array etc
    else:
        raise TypeError(f'Hash of dtype `object` is not deterministic, cannot hash {item}')


def _item_hasher(hashing_function: HASH, item) -> None:
    """ Internal method for hashing floats, integers and strings. """
    if item is None:
        return
    if np.array(item).dtype.kind == 'v':
        hashing_function.update(item)
    if np.array(item).dtype.kind in ('biuSU'):
        # Cast all ints, strings, etc to string and encode
        hashing_function.update(str(item).encode('utf-8'))
    elif np.array(item).dtype.kind in ('fc'):
        _cf_hasher(hashing_function, item)
    elif np.array(item).dtype.kind == 'O':
        raise TypeError(f'Cannot hash unknown object: {item}')


def _dict_hasher(hashing_function: HASH, item) -> None:
    """ Internal method for hashing dictionaries. """
    for key, value in item.items():
        hashing_function.update(key.encode('utf-8'))
        if isinstance(value, np.ndarray) or type(value) in (list, tuple):
            _array_hasher(hashing_function, value)
        else:
            _item_hasher(hashing_function, value)


def list_to_hash(list_to_hash: list, hashing_algorithm: str = 'blake2b') -> str:
    """
    Function which takes a list containing a set of objects and automatically
    generates a deterministic hash for them.

    Parameters
    ----------
    list_to_hash
        List of a set of objects of various types, see `notes` for a complete list.
    hashing_algorithm
        The hashing algorithm to use. Can be any algorithm name in
        ``hashlib.algorithms_available``. Default is ``'blake2b'``.

    Returns
    -------
        The hexadecimal digest of the hash of all entries, taken in order.

    Example
    -------
    The following code snippets illustrate hashing lists that will work, and ones
    that will not work.

    Works: A list of an integer, an array, a dictionary with valid types, and a None.

    >>> import numpy as np
    >>> from mumott.core.hashing import list_to_hash
    >>> print(list_to_hash([1, np.array((1, 3, 5)), dict(val=1, string='abc'), None]))
    2a949c...

    Does not work: an array containing a ``None``, due to the ``dtype`` being ``object``.

    >>> print(list_to_hash([np.array([None])]))
    Traceback (most recent call last):
    ...
    TypeError: Hash of dtype `object` is not deterministic, cannot hash [None]

    Does not work: a generator expression, which is an unknown object.

    >>> print(list_to_hash([(a for a in [1, 2, 3])]))
    Traceback (most recent call last):
    ...
    TypeError: Cannot hash unknown object: <generator object...

    Notes
    -----
    ``float``-type objects are rounded to five significant digits in the mantissa before
    hashing. This is necessary to obtain semi-deterministic hashes that obey a subset of
    fuzzy equality for float comparison. There are edge cases where equality can fail due
    to rounding errors, but these should be extremely rare.

    Supported entry types in :attr:`list_to_hash`:

        ``int``
            Cast to string. Works along with similar ``numpy`` types.
        ``float``
            Mantissa rounded to five significant digits and concatenated with exponent.
            Works along with similar ``numpy`` types.
        ``complex``
            Real and imaginary parts concatenated and treated like ``float``.
            Works along with similar ``numpy`` types.
        ``str``
            Automatically given ``'utf-8'`` encoding.
        ``bytes``
            Cast to string.
        ``None``
            Ignored.
        ``np.ndarray``
            Provided ``dtype`` is not ``object``, hence arrays of ``None`` are not allowed.
        ``list``, ``tuple``
            Provided they can be cast to allowed, i.e. non-ragged ``np.ndarray``
        ``dict``
            Assuming entries are allowed types. Keys and entries are concatenated.
            If an entry is ``None``, the key is added to the hash while the entry is ignored.
    """
    hashing_function = hashlib.new(hashing_algorithm)
    for item in list_to_hash:
        # Dispatch on the exact container type of each entry
        if isinstance(item, np.ndarray) or type(item) in (list, tuple):
            _array_hasher(hashing_function, item)
        elif type(item) is dict:
            _dict_hasher(hashing_function, item)
        else:
            _item_hasher(hashing_function, item)
    return hashing_function.hexdigest()