Coverage for local_installation_linux/mumott/core/hashing.py: 94%
54 statements
« prev ^ index » next coverage.py v7.3.2, created at 2025-05-05 21:21 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2025-05-05 21:21 +0000
1import logging
2import hashlib
3from _hashlib import HASH
5import numpy as np
7from numpy.typing import NDArray
9logger = logging.getLogger(__name__)
def _cf_hasher(hashing_function, item) -> None:
    """ Feed a fuzzy, rounded representation of real or complex numbers
    to a hashing object.

    Each value is decomposed with ``np.frexp`` into a mantissa and an integer
    exponent. The mantissa is rounded to five decimals so that values which
    differ only by small floating point noise contribute identical bytes.

    Parameters
    ----------
    hashing_function
        Object exposing an ``update`` method that accepts buffers,
        e.g., one created by ``hashlib.new``.
    item
        A float, a complex number, or an array(-like) of them. Anything
        that is not already an ``np.ndarray`` is converted with ``np.array``.
    """
    values = item if isinstance(item, np.ndarray) else np.array(item)
    values = values.ravel()
    if values.dtype.kind == 'c':
        # frexp is only defined for reals: hash all real parts
        # followed by all imaginary parts
        values = np.concatenate((values.real, values.imag))
    significand, power = np.frexp(values)
    # A mantissa just below +-1 would round up to +-1, whereas frexp
    # represents the neighbouring power of two as +-0.5 with the next
    # exponent. Normalize such values to the latter form.
    near_one = np.isclose(abs(significand), 1.0, atol=1e-6, rtol=1e-6)
    significand[near_one] = 0.5 * np.sign(significand[near_one])
    power[near_one] += 1
    # Rounded mantissas first, then the exponents
    hashing_function.update(np.round(significand, 5))
    hashing_function.update(power)
def _array_hasher(hashing_function: HASH, item: NDArray) -> None:
    """ Internal method for hashing arrays, lists and tuples.

    Parameters
    ----------
    hashing_function
        Hashing object whose ``update`` method receives the data.
    item
        An ``np.ndarray``, or a ``list``/``tuple`` which is first
        converted with ``np.array``.

    Raises
    ------
    TypeError
        If the ``dtype`` of the array is not boolean, integer, string,
        float, complex or void; e.g., for ``object`` arrays.
    """
    if type(item) in (list, tuple):
        item = np.array(item)
    kind = item.dtype.kind
    if kind == 'V':
        # Void (e.g. structured) data: hash the raw bytes. NumPy's kind code
        # for void is the upper-case 'V'. ``tobytes`` always yields a
        # C-ordered copy, so non-contiguous arrays are handled as well.
        hashing_function.update(item.tobytes())
    elif kind in ('b', 'i', 'u', 'S', 'U'):
        # kind is boolean, int, uint, byte string, unicode:
        # cast to string and encode
        hashing_function.update(np.char.encode(item.astype(str), 'utf-8'))
    elif kind in ('f', 'c'):
        # kind is float or complex
        _cf_hasher(hashing_function, item)
    else:
        # unknown data, possibly ragged array etc
        raise TypeError(f'Hash of dtype `object` is not deterministic, cannot hash {item}')
def _item_hasher(hashing_function: HASH, item) -> None:
    """ Internal method for hashing floats, integers and strings.

    Parameters
    ----------
    hashing_function
        Hashing object whose ``update`` method receives the data.
    item
        A single entry. ``None`` is ignored, i.e., nothing is
        added to the hash.

    Raises
    ------
    TypeError
        If ``item`` is of a kind that cannot be hashed deterministically,
        e.g., an arbitrary Python object.
    """
    if item is None:
        return
    # Let numpy classify the item once, rather than once per branch
    as_array = np.array(item)
    kind = as_array.dtype.kind
    if kind == 'V':
        # Void scalars are hashed by their raw bytes. NumPy's kind code
        # for void is the upper-case 'V'.
        hashing_function.update(as_array.tobytes())
    elif kind in ('b', 'i', 'u', 'S', 'U'):
        # Cast all ints, strings, etc to string and encode
        hashing_function.update(str(item).encode('utf-8'))
    elif kind in ('f', 'c'):
        _cf_hasher(hashing_function, item)
    else:
        # Raise for every unsupported kind, not just ``object``: silently
        # skipping an entry would let different inputs share a hash.
        raise TypeError(f'Cannot hash unknown object: {item}')
def _dict_hasher(hashing_function: HASH, item) -> None:
    """ Internal method for hashing dictionaries.

    Entries are processed in the iteration order of the dictionary. For
    each entry, the UTF-8 encoded key is added to the hash first, followed
    by the value, which is handled as an array if it is an ``np.ndarray``,
    ``list`` or ``tuple``, and as a single item otherwise.

    Parameters
    ----------
    hashing_function
        Hashing object whose ``update`` method receives the data.
    item
        Dictionary to hash. Keys must provide an ``encode`` method,
        i.e., be strings.
    """
    sequence_types = (list, tuple)
    for name, entry in item.items():
        hashing_function.update(name.encode('utf-8'))
        array_like = isinstance(entry, np.ndarray) or type(entry) in sequence_types
        if not array_like:
            _item_hasher(hashing_function, entry)
            continue
        _array_hasher(hashing_function, entry)
def list_to_hash(list_to_hash: list, hashing_algorithm: str = 'blake2b') -> str:
    """
    Computes a deterministic hash from the entries of a list.

    The entries are fed to the hashing algorithm one after the other, in the
    order in which they appear in the list, using a type-dependent
    representation; see `notes` for the supported types.

    Parameters
    ----------
    list_to_hash
        List of the objects to hash.
    hashing_algorithm
        Name of the hashing algorithm, which is passed on to ``hashlib.new``.
        Default is ``'blake2b'``. The digest is retrieved by calling
        ``hexdigest()`` without a length argument.

    Returns
    -------
        The hexadecimal digest as a string.

    Example
    -------
    The following code snippets illustrate hashing lists that will work, and ones
    that will not work.

    Works: A list of an integer, an array, a dictionary with valid types, and a None.

    >>> from mumott.core.hashing import list_to_hash
    >>> print(list_to_hash([1, np.array((1, 3, 5)), dict(val=1, string='abc'), None]))
    2a949c...

    Does not work: an array containing a ``None``, due to the ``dtype`` being ``object``.

    >>> print(list_to_hash([np.array([None])]))
    Traceback (most recent call last):
    ...
    TypeError: Hash of dtype `object` is not deterministic, cannot hash [None]

    Does not work: a generator expression, which is an unknown object.

    >>> print(list_to_hash([(a for a in [1, 2, 3])]))
    Traceback (most recent call last):
    ...
    TypeError: Cannot hash unknown object: <generator object...

    Notes
    -----
    The mantissa of ``float``-type objects is rounded to five digits before hashing,
    such that numbers which are equal up to small floating point errors obtain
    the same hash. In rare edge cases, rounding errors can nevertheless cause
    nearly equal numbers to yield different hashes.

    Supported entry types in :attr:`list_to_hash`:
        ``int``
            Converted to a string. Also applies to the corresponding ``numpy`` types.
        ``float``
            Split into mantissa and exponent, with the mantissa rounded to five digits.
            Also applies to the corresponding ``numpy`` types.
        ``complex``
            Real and imaginary parts are concatenated and then treated like ``float``.
            Also applies to the corresponding ``numpy`` types.
        ``str``
            Encoded as ``'utf-8'``.
        ``bytes``
            Converted to a string.
        ``None``
            Skipped; nothing is added to the hash.
        ``np.ndarray``
            Allowed unless the ``dtype`` is ``object``; arrays of ``None``
            are therefore not supported.
        ``list``, ``tuple``
            Allowed if they can be converted into a supported, i.e. non-ragged, ``np.ndarray``.
        ``dict``
            Allowed if the values are of supported types. Each key is hashed, followed by
            its value. For a value of ``None``, only the key is added to the hash.
    """
    digest = hashlib.new(hashing_algorithm)
    for entry in list_to_hash:
        if type(entry) is dict:
            _dict_hasher(digest, entry)
            continue
        array_like = type(entry) in (list, tuple) or isinstance(entry, np.ndarray)
        if array_like:
            _array_hasher(digest, entry)
        else:
            _item_hasher(digest, entry)
    return digest.hexdigest()