Say you want to compute a digest over a very long input: so large that your laptop might get switched off before it finishes, or it comes in batches with long breaks, or you want to use several machines (sequentially), or you want to be able to restart the program, e.g. to update it to a new version. Or maybe your input has a very long head and several tails, e.g. it's a tree and you want to reuse the hash computed over the head. Or maybe you want to save a partial hash in a database.
And of course you want to compute hash fast, so you want to use CPython's built-in implementation written in C, or in this case the one that uses OpenSSL.
Normally CPython's hashlib hash objects don't offer you a way to save their state, they are not picklable, and internals are not accessible from Python.
With ctypes, of course everything is possible:
#!/usr/bin/python
""" save and restore sha512 inner state
supports 32-bit and 64-bit architectures
tested on CPython 2.6 and 2.7
TODO does not take endian into account
TODO assumes Python compiled with OpenSSL
"""
from hashlib import sha512
import ctypes
import binascii
# Index, counted in pointer-sized words from the start of the hash object,
# of the slot that holds the pointer to the digest state; save() and
# restore() read that pointer through ctypes.
# NOTE(review): presumably the `md_data` field of the OpenSSL EVP_MD_CTX
# embedded in CPython 2.x's _hashlib object -- confirm against the build.
POFFSET = 6
# Number of state bytes that save() copies out and restore() requires.
# NOTE(review): presumably sizeof(SHA512_CTX) in OpenSSL -- confirm.
STATESIZE = 216
def save(obj):
    """Return the raw inner state of the sha512 object `obj` as a string.

    The object is viewed as an array of pointer-sized words; the word at
    index POFFSET is taken as the address of the state, and the first
    STATESIZE bytes found there are returned.  No type check is done on
    `obj`.  An AssertionError is raised if the state pointer is NULL.
    """
    # View the Python object itself as a flat array of void pointers.
    words = ctypes.cast(id(obj), ctypes.POINTER(ctypes.c_voidp))
    # Reinterpret the selected word as a pointer to raw bytes.
    statep = ctypes.cast(words[POFFSET], ctypes.POINTER(ctypes.c_char))
    assert statep
    return statep[:STATESIZE]
def restore(data):
    """Create a new sha512 object whose inner state is copied from `data`.

    `data` is a str/bytes or any iterable yielding exactly STATESIZE
    single bytes, e.g. the value returned by save().

    Returns the new hash object.

    Raises AssertionError if the state pointer of the fresh object is NULL,
    if the fresh object does not start with the expected initial sha512
    word, or if `data` does not hold exactly STATESIZE bytes (including
    the case of empty `data`).
    """
    new = sha512()
    # View the Python object as pointer-sized words and follow the word at
    # POFFSET to the raw state bytes.
    words = ctypes.cast(id(new), ctypes.POINTER(ctypes.c_voidp))
    datap = ctypes.cast(words[POFFSET], ctypes.POINTER(ctypes.c_char))
    assert datap
    # Sanity check that we are really looking at a fresh sha512 state: the
    # bytes below are the SHA-512 initial value h0 (0x6a09e667f3bcc908) in
    # little-endian byte order.
    assert datap[:8] == '\x08\xc9\xbc\xf3g\xe6\tj'  # first sha512 word
    # Start the counter at 0 so that empty `data` fails the length check
    # below instead of raising NameError on an unbound loop variable.
    written = 0
    for written, byte in enumerate(data, 1):
        # Refuse to write past the end of the state buffer.
        assert written <= STATESIZE
        datap[written - 1] = byte
    assert written == STATESIZE
    return new
def savehex(o):
    """Return the inner state of sha512 object `o`, hex-encoded."""
    return binascii.b2a_hex(save(o))


def restorehex(d):
    """Create a new sha512 object from the hex-encoded inner state `d`."""
    return restore(binascii.a2b_hex(d))
if __name__ == "__main__":
    # Self-test: feed chunks of different lengths into one running hash.
    testdata = ["", "abcd" * 256, "o" * 13, "y" * 256]
    real = sha512()
    # NOTE(review): the original indentation was lost; the invariant checks
    # are assumed to run after every update rather than once at the end.
    for test in testdata:
        real.update(test)
        # invariant x == restore(save(x)), via the raw state ...
        raw_copy = restore(save(real))
        assert real.digest() == raw_copy.digest()
        # ... and via the hex-encoded state
        hex_copy = restorehex(savehex(real))
        assert real.hexdigest() == hex_copy.hexdigest()
Of course I'm not the first person to consider this: [e.g. java]
And of course you want to compute hash fast, so you want to use CPython's built-in implementation written in C, or in this case the one that uses OpenSSL.
Normally CPython's hashlib hash objects don't offer you a way to save their state, they are not picklable, and internals are not accessible from Python.
With ctypes, of course everything is possible:
#!/usr/bin/python
""" save and restore sha512 inner state
supports 32-bit and 64-bit architectures
tested on CPython 2.6 and 2.7
TODO does not take endian into account
TODO assumes Python compiled with OpenSSL
"""
from hashlib import sha512
import ctypes
import binascii
# Index, counted in pointer-sized words from the start of the hash object,
# of the slot that holds the pointer to the digest state; save() and
# restore() read that pointer through ctypes.
# NOTE(review): presumably the `md_data` field of the OpenSSL EVP_MD_CTX
# embedded in CPython 2.x's _hashlib object -- confirm against the build.
POFFSET = 6
# Number of state bytes that save() copies out and restore() requires.
# NOTE(review): presumably sizeof(SHA512_CTX) in OpenSSL -- confirm.
STATESIZE = 216
def save(obj):
    """Return the raw inner state of the sha512 object `obj` as a string.

    The object is viewed as an array of pointer-sized words; the word at
    index POFFSET is taken as the address of the state, and the first
    STATESIZE bytes found there are returned.  No type check is done on
    `obj`.  An AssertionError is raised if the state pointer is NULL.
    """
    # View the Python object itself as a flat array of void pointers.
    words = ctypes.cast(id(obj), ctypes.POINTER(ctypes.c_voidp))
    # Reinterpret the selected word as a pointer to raw bytes.
    statep = ctypes.cast(words[POFFSET], ctypes.POINTER(ctypes.c_char))
    assert statep
    return statep[:STATESIZE]
def restore(data):
    """Create a new sha512 object whose inner state is copied from `data`.

    `data` is a str/bytes or any iterable yielding exactly STATESIZE
    single bytes, e.g. the value returned by save().

    Returns the new hash object.

    Raises AssertionError if the state pointer of the fresh object is NULL,
    if the fresh object does not start with the expected initial sha512
    word, or if `data` does not hold exactly STATESIZE bytes (including
    the case of empty `data`).
    """
    new = sha512()
    # View the Python object as pointer-sized words and follow the word at
    # POFFSET to the raw state bytes.
    words = ctypes.cast(id(new), ctypes.POINTER(ctypes.c_voidp))
    datap = ctypes.cast(words[POFFSET], ctypes.POINTER(ctypes.c_char))
    assert datap
    # Sanity check that we are really looking at a fresh sha512 state: the
    # bytes below are the SHA-512 initial value h0 (0x6a09e667f3bcc908) in
    # little-endian byte order.
    assert datap[:8] == '\x08\xc9\xbc\xf3g\xe6\tj'  # first sha512 word
    # Start the counter at 0 so that empty `data` fails the length check
    # below instead of raising NameError on an unbound loop variable.
    written = 0
    for written, byte in enumerate(data, 1):
        # Refuse to write past the end of the state buffer.
        assert written <= STATESIZE
        datap[written - 1] = byte
    assert written == STATESIZE
    return new
def savehex(o):
    """Return the inner state of sha512 object `o`, hex-encoded."""
    return binascii.b2a_hex(save(o))


def restorehex(d):
    """Create a new sha512 object from the hex-encoded inner state `d`."""
    return restore(binascii.a2b_hex(d))
if __name__ == "__main__":
    # Self-test: feed chunks of different lengths into one running hash.
    testdata = ["", "abcd" * 256, "o" * 13, "y" * 256]
    real = sha512()
    # NOTE(review): the original indentation was lost; the invariant checks
    # are assumed to run after every update rather than once at the end.
    for test in testdata:
        real.update(test)
        # invariant x == restore(save(x)), via the raw state ...
        raw_copy = restore(save(real))
        assert real.digest() == raw_copy.digest()
        # ... and via the hex-encoded state
        hex_copy = restorehex(savehex(real))
        assert real.hexdigest() == hex_copy.hexdigest()
Of course I'm not the first person to consider this: [e.g. java]

