Coverage for local_installation_linux/mumott/core/hashing.py: 94%

54 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-08-11 23:08 +0000

1import logging 

2import hashlib 

3from _hashlib import HASH 

4 

5import numpy as np 

6 

7from numpy.typing import NDArray 

8 

9logger = logging.getLogger(__name__) 

10 

11 

def _cf_hasher(hashing_function, item) -> None:
    """ Internal method for hashing floats and complex numbers,
    or arrays of them.

    Every value is split with ``np.frexp`` into a mantissa and a binary
    exponent. The mantissa is rounded to five decimals so that values which
    differ only by floating point noise feed identical bytes to the hash.

    Parameters
    ----------
    hashing_function
        Hash object exposing ``update`` (for example created by
        ``hashlib.new``). It is updated in place, first with all rounded
        mantissas and then with all exponents.
    item
        A real or complex scalar, or an array (or array-like) of them.
        Arrays are flattened; complex input is hashed as all real parts
        followed by all imaginary parts.
    """
    if not isinstance(item, np.ndarray):
        item = np.array(item)
    item = item.ravel()
    if item.dtype.kind == 'c':
        # frexp only works on reals
        item = np.concatenate((item.real, item.imag))
    mantissa, exponent = np.frexp(item)
    # Round before normalizing: frexp yields |mantissa| in [0.5, 1), but
    # rounding can push a mantissa up to exactly 1.0. Such a value must be
    # re-expressed as 0.5 * 2**(exponent + 1), otherwise two numbers on either
    # side of a power of two that agree to five digits hash differently.
    mantissa = mantissa.round(5)
    # Exact comparison is intentional; it also leaves inf and nan untouched.
    rounded_up = np.abs(mantissa) == 1.0
    mantissa[rounded_up] = np.sign(mantissa[rounded_up]) * 0.5
    exponent[rounded_up] = exponent[rounded_up] + 1
    # -0.0 compares equal to 0.0 but has a different byte pattern.
    mantissa[mantissa == 0] = 0.0
    hashing_function.update(mantissa)
    hashing_function.update(exponent)

28 

29 

def _array_hasher(hashing_function: HASH, item: NDArray) -> None:
    """ Internal method for hashing arrays, lists and tuples.

    Parameters
    ----------
    hashing_function
        Hash object that is updated in place.
    item
        A ``numpy`` array, or a ``list``/``tuple`` which is first converted
        with ``np.array``.

    Raises
    ------
    TypeError
        If the resulting ``dtype`` is not one of the supported kinds
        (for example ``object``), since such data cannot be hashed
        deterministically.
    """
    if type(item) in (list, tuple):
        item = np.array(item)
    kind = item.dtype.kind
    if kind == 'V':
        # kind is void (raw or structured data); note that numpy spells this
        # kind with an upper-case 'V'.
        if item.dtype.hasobject:
            # Object fields hold pointers, whose bytes differ between runs.
            raise TypeError(f'Hash of dtype `object` is not deterministic, cannot hash {item}')
        # tobytes works regardless of dimensionality and memory layout.
        hashing_function.update(item.tobytes())
    elif kind in 'biuSU':
        # kind is bool, int, uint, bytes, unicode
        hashing_function.update(np.char.encode(item.astype(str), 'utf-8'))
    elif kind in 'fc':
        # kind is float or complex
        _cf_hasher(hashing_function, item)
    else:
        # unknown data, possibly ragged array etc
        raise TypeError(f'Hash of dtype `object` is not deterministic, cannot hash {item}')

45 

46 

def _item_hasher(hashing_function: HASH, item) -> None:
    """ Internal method for hashing floats, integers and strings.

    Parameters
    ----------
    hashing_function
        Hash object that is updated in place.
    item
        A single entry. ``None`` is ignored and leaves the hash untouched.

    Raises
    ------
    TypeError
        If ``item`` is of a kind that cannot be hashed deterministically,
        such as an arbitrary Python object.
    """
    if item is None:
        return
    # Let numpy classify the item once; only the dtype kind is needed.
    as_array = np.array(item)
    kind = as_array.dtype.kind
    if kind == 'V':
        # kind is void; numpy spells this kind with an upper-case 'V'.
        if as_array.dtype.hasobject:
            raise TypeError(f'Cannot hash unknown object: {item}')
        hashing_function.update(as_array.tobytes())
    elif kind in 'biuSU':
        # Cast all ints, strings, etc to string and encode
        hashing_function.update(str(item).encode('utf-8'))
    elif kind in 'fc':
        _cf_hasher(hashing_function, item)
    else:
        # Anything else (objects, datetimes, ...) is rejected rather than
        # silently skipped, as skipping would make distinct inputs collide.
        raise TypeError(f'Cannot hash unknown object: {item}')

60 

61 

def _dict_hasher(hashing_function: HASH, item) -> None:
    """ Internal method for hashing dictionaries.

    The dictionary is walked in its iteration order. For every entry the
    key is encoded as ``'utf-8'`` and added to the hash, after which the
    value is added by the array hasher (``numpy`` arrays, lists and tuples)
    or by the item hasher (everything else).
    """
    for name, entry in item.items():
        hashing_function.update(name.encode('utf-8'))
        array_like = isinstance(entry, np.ndarray) or type(entry) in (list, tuple)
        hasher = _array_hasher if array_like else _item_hasher
        hasher(hashing_function, entry)

70 

71 

def list_to_hash(list_to_hash: list, hashing_algorithm: str = 'blake2b') -> str:
    """
    Generates a deterministic hash from a list of objects of various types.

    Parameters
    ----------
    list_to_hash
        List of objects to hash. See `notes` for the supported types.
    hashing_algorithm
        Name of the hashing algorithm, passed on to ``hashlib.new``. Can be any
        name in ``hashlib.algorithms_available``. Default is ``'blake2b'``.

    Returns
    -------
        The hexadecimal digest of the hash as a string.

    Example
    -------
    The snippets below show one list that can be hashed and two that cannot.

    Works: A list of an integer, an array, a dictionary with valid types, and a None.

    >>> import numpy as np
    >>> from mumott.core.hashing import list_to_hash
    >>> print(list_to_hash([1, np.array((1, 3, 5)), dict(val=1, string='abc'), None]))
    2a949c...

    Does not work: an array containing a ``None``, due to the ``dtype`` being ``object``.

    >>> print(list_to_hash([np.array([None])]))
    Traceback (most recent call last):
    ...
    TypeError: Hash of dtype `object` is not deterministic, cannot hash [None]

    Does not work: a generator expression, which is an unknown object.

    >>> print(list_to_hash([(a for a in [1, 2, 3])]))
    Traceback (most recent call last):
    ...
    TypeError: Cannot hash unknown object: <generator object...

    Notes
    -----
    The mantissa of every ``float``-type object is rounded to five decimals
    before hashing, so that the hash obeys a subset of fuzzy equality for
    float comparison. Equality can still fail in edge cases because of
    rounding errors, but these should be extremely rare.

    Supported entry types in :attr:`list_to_hash`:
        ``int``
            Cast to string.
            Works along with similar ``numpy`` types.
        ``float``
            Mantissa rounded to five decimals and hashed together with the exponent.
            Works along with similar ``numpy`` types.
        ``complex``
            Real and imaginary parts concatenated and treated like ``float``.
            Works along with similar ``numpy`` types.
        ``str``
            Automatically given ``'utf-8'`` encoding.
        ``bytes``
            Cast to string.
        ``None``
            Ignored.
        ``np.ndarray``
            Provided ``dtype`` is not ``object``, hence arrays of ``None`` are not allowed.
        ``list``, ``tuple``
            Provided they can be cast to allowed, i.e. non-ragged ``np.ndarray``
        ``dict``
            Assuming entries are allowed types. Keys and entries are concatenated.
            If an entry is ``None``, the key is added to the hash while the entry is ignored.
    """
    digest = hashlib.new(hashing_algorithm)
    for entry in list_to_hash:
        # Select the hasher from the exact type of the entry.
        if type(entry) is dict:
            hasher = _dict_hasher
        elif isinstance(entry, np.ndarray) or type(entry) in (list, tuple):
            hasher = _array_hasher
        else:
            hasher = _item_hasher
        hasher(digest, entry)
    return digest.hexdigest()