Compute CRC of a file in Python

I want to calculate the CRC of a file and get output like E45A12AC. Here's my code:
#!/usr/bin/env python
import os, sys
import zlib

def crc(fileName):
    fd = open(fileName, "rb")
    content = fd.readlines()
    fd.close()
    for eachLine in content:
        zlib.crc32(eachLine)

for eachFile in sys.argv[1:]:
    crc(eachFile)
This calculates the CRC for each line, but its output (e.g. -1767935985) is not what I want.
Hashlib works the way I want, but it computes the md5:
import hashlib

m = hashlib.md5()
for line in open('data.txt', 'rb'):
    m.update(line)
print m.hexdigest()
Is it possible to get something similar using zlib.crc32?

A slightly more compact and optimized version:
import zlib

def crc(fileName):
    prev = 0
    for eachLine in open(fileName, "rb"):
        prev = zlib.crc32(eachLine, prev)
    return "%X" % (prev & 0xFFFFFFFF)
PS: The old postscript is deprecated and has therefore been deleted, following the suggestion in the comments. Thank you; I don't know how I missed that, but it was a really good point.

A modified version of kobor42's answer, with performance improved by a factor of 2-3 by reading fixed-size chunks instead of "lines":
import zlib

def crc32(fileName):
    with open(fileName, 'rb') as fh:
        hash = 0
        while True:
            s = fh.read(65536)
            if not s:
                break
            hash = zlib.crc32(s, hash)
        return "%08X" % (hash & 0xFFFFFFFF)
Also includes leading zeroes in the returned string.
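A quick usage sketch, mirroring the command-line loop from the question:
import sys

# Print the checksum of each file given on the command line.
for eachFile in sys.argv[1:]:
    print(crc32(eachFile))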

A hashlib-compatible interface for CRC-32 support:
import zlib

class crc32(object):
    name = 'crc32'
    digest_size = 4
    block_size = 1

    def __init__(self, arg=b''):
        self.__digest = 0
        self.update(arg)

    def copy(self):
        copy = super(self.__class__, self).__new__(self.__class__)
        copy.__digest = self.__digest
        return copy

    def digest(self):
        return self.__digest

    def hexdigest(self):
        return '{:08x}'.format(self.__digest)

    def update(self, arg):
        self.__digest = zlib.crc32(arg, self.__digest) & 0xffffffff

# Now you can define hashlib.crc32 = crc32
import hashlib
hashlib.crc32 = crc32

# Python >= 2.7: hashlib.algorithms += ('crc32',)
# Python >= 3.2: hashlib.algorithms_available.add('crc32')
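A brief usage sketch showing that the class follows the familiar hashlib update/hexdigest pattern (the input bytes are arbitrary examples):
h = crc32(b'hello ')
h.update(b'world')      # incremental update, like hashlib objects
print(h.hexdigest())    # 8 lowercase hex digits for the CRC-32 of b'hello world'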

To show any integer's lowest 32 bits as 8 hexadecimal digits, without a sign, you can "mask" the value by bit-and'ing it with a mask of 32 bits all set to 1, then apply the formatting. I.e.:
>>> x = -1767935985
>>> format(x & 0xFFFFFFFF, '08x')
'969f700f'
It's quite irrelevant whether the integer you are thus formatting comes from zlib.crc32 or any other computation whatsoever.
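For reference, the same masking works with any of Python's formatting tools; an equivalent f-string form (Python 3.6+):
>>> x = -1767935985
>>> f"{x & 0xFFFFFFFF:08x}"
'969f700f'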

Python 3.8+ (using the walrus operator):
import zlib

def crc32(filename, chunksize=65536):
    """Compute the CRC-32 checksum of the contents of the given filename"""
    with open(filename, "rb") as f:
        checksum = 0
        while (chunk := f.read(chunksize)):
            checksum = zlib.crc32(chunk, checksum)
        return checksum
chunksize is how many bytes to read from the file at a time. You will get the same CRC for the same file no matter what you set chunksize to (it has to be > 0), but setting it too low might make your code slow, while setting it too high might use too much memory.
The result is a 32-bit integer. The CRC-32 checksum of an empty file is 0.
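A small usage sketch (placeholder file name) that formats the returned integer as the 8 uppercase hex digits asked for in the question:
print(f"{crc32('data.txt'):08X}")  # e.g. E45A12AC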

Edited to include Altren's solution below.
A modified and more compact version of CrouZ's answer, with slightly improved performance, using a for loop and file buffering:
import os, zlib

def forLoopCrc(fpath):
    """With for loop and buffer."""
    crc = 0
    with open(fpath, 'rb', 65536) as ins:
        for x in range(int((os.stat(fpath).st_size / 65536)) + 1):
            crc = zlib.crc32(ins.read(65536), crc)
    return '%08X' % (crc & 0xFFFFFFFF)
Results, on a 6700K with an HDD:
(Note: Retested multiple times and it was consistently faster.)
Warming up the machine...
Finished.
Beginning tests...
File size: 90288KB
Test cycles: 500
With for loop and buffer.
Result 45.24728019630359
CrouZ solution
Result 45.433838356097894
kobor42 solution
Result 104.16215688703986
Altren solution
Result 101.7247863946586
Tested in Python 3.6.4 x64 using the script below:
import os, timeit, zlib, random, binascii

def forLoopCrc(fpath):
    """With for loop and buffer."""
    crc = 0
    with open(fpath, 'rb', 65536) as ins:
        for x in range(int((os.stat(fpath).st_size / 65536)) + 1):
            crc = zlib.crc32(ins.read(65536), crc)
    return '%08X' % (crc & 0xFFFFFFFF)

def crc32(fileName):
    """CrouZ solution"""
    with open(fileName, 'rb') as fh:
        hash = 0
        while True:
            s = fh.read(65536)
            if not s:
                break
            hash = zlib.crc32(s, hash)
        return "%08X" % (hash & 0xFFFFFFFF)

def crc(fileName):
    """kobor42 solution"""
    prev = 0
    for eachLine in open(fileName, "rb"):
        prev = zlib.crc32(eachLine, prev)
    return "%X" % (prev & 0xFFFFFFFF)

def crc32altren(filename):
    """Altren solution"""
    buf = open(filename, 'rb').read()
    hash = binascii.crc32(buf) & 0xFFFFFFFF
    return "%08X" % hash

fpath = r'D:\test\test.dat'
tests = {forLoopCrc: 'With for loop and buffer.',
         crc32: 'CrouZ solution', crc: 'kobor42 solution',
         crc32altren: 'Altren solution'}
count = 500

# CPU, HDD warmup
randomItm = [x for x in tests.keys()]
random.shuffle(randomItm)
print('\nWarming up the machine...')
for c in range(count):
    randomItm[0](fpath)
print('Finished.\n')

# Begin test
print('Beginning tests...\nFile size: %dKB\nTest cycles: %d\n' % (
    os.stat(fpath).st_size / 1024, count))
for x in tests:
    print(tests[x])
    start_time = timeit.default_timer()
    for c in range(count):
        x(fpath)
    print('Result', timeit.default_timer() - start_time, '\n')
It is faster because for loops are faster than while loops (sources: here and here).

Merging the above two pieces of code:
try:
    fd = open(decompressedFile, "rb")
except IOError:
    logging.error("Unable to open the file in read mode: " + decompressedFile)
    return 4
eachLine = fd.readline()
prev = 0
while eachLine:
    prev = zlib.crc32(eachLine, prev)
    eachLine = fd.readline()
fd.close()

There is a faster and more compact way to compute the CRC using binascii:
import binascii

def crc32(filename):
    buf = open(filename, 'rb').read()
    hash = binascii.crc32(buf) & 0xFFFFFFFF
    return "%08X" % hash

You can use base64 to get output like [ERD45FTR]. And zlib.crc32 supports passing in a running value:
import os, sys
import zlib
import base64

def crc(fileName):
    fd = open(fileName, "rb")
    content = fd.readlines()
    fd.close()
    prev = None
    for eachLine in content:
        if not prev:
            prev = zlib.crc32(eachLine)
        else:
            prev = zlib.crc32(eachLine, prev)
    return prev

for eachFile in sys.argv[1:]:
    print base64.b64encode(str(crc(eachFile)))

Solution:
import os, sys
import zlib

def crc(fileName, excludeLine="", includeLine=""):
    try:
        fd = open(fileName, "rb")
    except IOError:
        print "Unable to open the file in readmode:", fileName
        return
    eachLine = fd.readline()
    prev = None
    while eachLine:
        if excludeLine and eachLine.startswith(excludeLine):
            eachLine = fd.readline()
            continue
        if not prev:
            prev = zlib.crc32(eachLine)
        else:
            prev = zlib.crc32(eachLine, prev)
        eachLine = fd.readline()
    fd.close()
    return format(prev & 0xFFFFFFFF, '08x')  # returns 8-digit crc

for eachFile in sys.argv[1:]:
    print crc(eachFile)
I don't really know what (excludeLine="", includeLine="") are for...
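Judging from the code above, excludeLine skips any line that starts with the given prefix, while includeLine is never used; a hypothetical usage sketch:
# Skip comment lines beginning with '#' (file name and prefix are placeholders).
print crc("settings.conf", excludeLine="#")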

Related

Python 3 - Can pickle handle byte objects larger than 4GB?

Based on this comment and the referenced documentation, Pickle 4.0+ from Python 3.4+ should be able to pickle byte objects larger than 4 GB.
However, using python 3.4.3 or python 3.5.0b2 on Mac OS X 10.10.4, I get an error when I try to pickle a large byte array:
>>> import pickle
>>> x = bytearray(8 * 1000 * 1000 * 1000)
>>> fp = open("x.dat", "wb")
>>> pickle.dump(x, fp, protocol = 4)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
OSError: [Errno 22] Invalid argument
Is there a bug in my code or am I misunderstanding the documentation?
Here is a simple workaround for issue 24658. Use pickle.loads or pickle.dumps and break the bytes object into chunks of size 2**31 - 1 to get it in or out of the file.
import pickle
import os.path

file_path = "pkl.pkl"
n_bytes = 2**31
max_bytes = 2**31 - 1
data = bytearray(n_bytes)

## write
bytes_out = pickle.dumps(data)
with open(file_path, 'wb') as f_out:
    for idx in range(0, len(bytes_out), max_bytes):
        f_out.write(bytes_out[idx:idx+max_bytes])

## read
bytes_in = bytearray(0)
input_size = os.path.getsize(file_path)
with open(file_path, 'rb') as f_in:
    for _ in range(0, input_size, max_bytes):
        bytes_in += f_in.read(max_bytes)
data2 = pickle.loads(bytes_in)

assert(data == data2)
To sum up what was answered in the comments:
Yes, Python can pickle byte objects bigger than 4GB. The observed error is caused by a bug in the implementation (see Issue24658).
Here is the full workaround, though it seems pickle.load no longer runs into the issue with huge files (I am on Python 3.5.2), so strictly speaking only the dump side needs this to work properly.
import pickle

class MacOSFile(object):
    def __init__(self, f):
        self.f = f

    def __getattr__(self, item):
        return getattr(self.f, item)

    def read(self, n):
        # print("reading total_bytes=%s" % n, flush=True)
        if n >= (1 << 31):
            buffer = bytearray(n)
            idx = 0
            while idx < n:
                batch_size = min(n - idx, 1 << 31 - 1)
                # print("reading bytes [%s,%s)..." % (idx, idx + batch_size), end="", flush=True)
                buffer[idx:idx + batch_size] = self.f.read(batch_size)
                # print("done.", flush=True)
                idx += batch_size
            return buffer
        return self.f.read(n)

    def write(self, buffer):
        n = len(buffer)
        print("writing total_bytes=%s..." % n, flush=True)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, 1 << 31 - 1)
            print("writing bytes [%s, %s)... " % (idx, idx + batch_size), end="", flush=True)
            self.f.write(buffer[idx:idx + batch_size])
            print("done.", flush=True)
            idx += batch_size

def pickle_dump(obj, file_path):
    with open(file_path, "wb") as f:
        return pickle.dump(obj, MacOSFile(f), protocol=pickle.HIGHEST_PROTOCOL)

def pickle_load(file_path):
    with open(file_path, "rb") as f:
        return pickle.load(MacOSFile(f))
You can specify the protocol for the dump.
If you do pickle.dump(obj,file,protocol=4) it should work.
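A minimal sketch of that suggestion, reusing the names from the question (note the question's snippet already passed protocol=4, so on the affected macOS builds the workarounds above may still be needed):
import pickle

data = bytearray(8 * 1000 * 1000 * 1000)   # ~8 GB object, as in the question
with open("x.dat", "wb") as fp:
    pickle.dump(data, fp, protocol=4)       # protocol 4 adds support for objects > 4 GB
with open("x.dat", "rb") as fp:
    data2 = pickle.load(fp)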
Reading a file in 2 GB chunks takes twice as much memory as needed if bytes concatenation is performed; my approach to loading pickles is based on a preallocated bytearray:
class MacOSFile(object):
    def __init__(self, f):
        self.f = f

    def __getattr__(self, item):
        return getattr(self.f, item)

    def read(self, n):
        if n >= (1 << 31):
            buffer = bytearray(n)
            pos = 0
            while pos < n:
                size = min(n - pos, 1 << 31 - 1)
                chunk = self.f.read(size)
                buffer[pos:pos + size] = chunk
                pos += size
            return buffer
        return self.f.read(n)
Usage:
with open("/path", "rb") as fin:
    obj = pickle.load(MacOSFile(fin))
Had the same issue and fixed it by upgrading to Python 3.6.8.
This seems to be the PR that did it: https://github.com/python/cpython/pull/9937
I also ran into this issue. To solve it, I chunk the work into several iterations. In this case I have 50,000 records for which I have to compute tf-idf and do kNN classification. When I run it and iterate over all 50,000 at once, it gives me "that error", so to solve the problem I chunk it.
tokenized_documents = self.load_tokenized_preprocessing_documents()
idf = self.load_idf_41227()
doc_length = len(documents)

for iteration in range(0, 9):
    tfidf_documents = []
    for index in range(iteration, 4000):
        doc_tfidf = []
        for term in idf.keys():
            tf = self.term_frequency(term, tokenized_documents[index])
            doc_tfidf.append(tf * idf[term])
        doc = documents[index]
        tfidf = [doc_tfidf, doc[0], doc[1]]
        tfidf_documents.append(tfidf)
        print("{} from {} document {}".format(index, doc_length, doc[0]))
    self.save_tfidf_41227(tfidf_documents, iteration)

How to zip a very large file in python

I would like to zip a couple of files that may amount to about 99 GB using Python. What is the most efficient way to do this using the zipfile library? This is a sample of the code I have:
with gcs.open(zip_file_name, 'w', content_type=b'application/zip') as f:
    with zipfile.ZipFile(f, 'w') as z:
        for file in files:
            is_owner = (is_page_allowed_to_visitor(page, visitor) or (file.owner_id == visitor.id))
            if is_owner:
                file.show = True
            elif file.available_from:
                if file.available_from > datetime.now():
                    file.show = False
            elif file.available_to:
                if file.available_to < datetime.now():
                    file.show = False
            else:
                file.show = True
            if file.show:
                file_name = "/%s/%s" % (gcs_store.get_bucket_name(), file.gcs_name)
                gcs_reader = gcs.open(file_name, 'r')
                z.writestr('%s-%s' % (file.created_on, file.name), gcs_reader.read())
                gcs_reader.close()
f.close()  # closing zip file
Some points to note:
1) I am using the google app engine to host the files so I cannot use the zipfile.write() method. I can only get the file contents in bytes.
Thanks in advance
I have added a new method to the zipfile library. This enhanced zipfile library is open source and can be found on GitHub (EnhancedZipFile). I added a new method inspired by the zipfile.write() and zipfile.writestr() methods:
def writebuffered(self, zinfo_or_arcname, file_pointer, file_size, compress_type=None):
    if not isinstance(zinfo_or_arcname, ZipInfo):
        zinfo = ZipInfo(filename=zinfo_or_arcname,
                        date_time=time.localtime(time.time())[:6])
        zinfo.compress_type = self.compression
        if zinfo.filename[-1] == '/':
            zinfo.external_attr = 0o40775 << 16   # drwxrwxr-x
            zinfo.external_attr |= 0x10           # MS-DOS directory flag
        else:
            zinfo.external_attr = 0o600 << 16     # ?rw-------
    else:
        zinfo = zinfo_or_arcname

    zinfo.file_size = file_size            # Uncompressed size
    zinfo.header_offset = self.fp.tell()   # Start of header bytes

    self._writecheck(zinfo)
    self._didModify = True

    fp = file_pointer
    # Must overwrite CRC and sizes with correct data later
    zinfo.CRC = CRC = 0
    zinfo.compress_size = compress_size = 0
    # Compressed size can be larger than uncompressed size
    zip64 = self._allowZip64 and \
            zinfo.file_size * 1.05 > ZIP64_LIMIT
    self.fp.write(zinfo.FileHeader(zip64))
    if zinfo.compress_type == ZIP_DEFLATED:
        cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
                                zlib.DEFLATED, -15)
    else:
        cmpr = None
    file_size = 0
    while 1:
        buf = fp.read(1024 * 8)
        if not buf:
            break
        file_size = file_size + len(buf)
        CRC = crc32(buf, CRC) & 0xffffffff
        if cmpr:
            buf = cmpr.compress(buf)
            compress_size = compress_size + len(buf)
        self.fp.write(buf)
    if cmpr:
        buf = cmpr.flush()
        compress_size = compress_size + len(buf)
        self.fp.write(buf)
        zinfo.compress_size = compress_size
    else:
        zinfo.compress_size = file_size
    zinfo.CRC = CRC
    zinfo.file_size = file_size
    if not zip64 and self._allowZip64:
        if file_size > ZIP64_LIMIT:
            raise RuntimeError('File size has increased during compressing')
        if compress_size > ZIP64_LIMIT:
            raise RuntimeError('Compressed size larger than uncompressed size')
    # Seek backwards and write file header (which will now include
    # correct CRC and file sizes)
    position = self.fp.tell()   # Preserve current position in file
    self.fp.flush()
    self.filelist.append(zinfo)
    self.NameToInfo[zinfo.filename] = zinfo
Points to note
I am a newbie in Python, so the code I wrote above may not be very optimized.
Please contribute to the project on GitHub here: https://github.com/najela/EnhancedZipFile
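For reference, if you can run Python 3.6 or newer, the standard zipfile module can also stream data into an archive entry via ZipFile.open(name, 'w') without holding the whole file in memory. A minimal sketch; the file names are placeholders and 'source' stands in for any object with a read() method (such as a GCS reader), not the GCS API used above:
import shutil
import zipfile

def add_stream_to_zip(zf, arcname, source, chunk_size=1024 * 64):
    # Stream the reader into one zip entry in fixed-size chunks.
    with zf.open(arcname, 'w') as dest:
        shutil.copyfileobj(source, dest, chunk_size)

with zipfile.ZipFile('archive.zip', 'w', zipfile.ZIP_DEFLATED) as zf:
    with open('big_input.bin', 'rb') as source:   # placeholder input
        add_stream_to_zip(zf, 'big_input.bin', source)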

Searching a file for matches between two values and outputting search hits in Python

I am attempting to write a program that searches through a hex file for instances of a hex string between two values, e.g. between D4135B and D414AC, incrementing from the first value until the second is reached: D4135B, D4135C, D4135D, and so on.
I have managed to get it to increment, but it's the search part I am having trouble with.
This is the code I have so far. It's been cobbled together from other places, and I need to make it somehow output all search hits into the output file (file_out).
I have exceeded the limit of my Python understanding and I'm sure there's probably a much easier way of doing this. I would be very grateful for any help.
def search_process(hx):  # searching for two binary strings
    global FLAG
    while threeByteHexPlusOne != threeByteHex2:  # Keep incrementing until second value reached
        If Flag:
            if hx.find(threeByteHex2) != -1:
                FLAG = False  # If threeByteHex = ThreeByteHexPlusOne, end search
                Print (“Reached the end of the search”,hx.find(threeByteHexPlusOne))
        Else:
            If hx.find(threeByteHexPlusOne) != -1:
                FLAG = True
    Return -1  # If no results found

if __name__ == '__main__':
    try:
        file_in = open(FILE_IN, "r")    # opening input file
        file_out = open(FILE_OUT, 'w')  # opening output file
        hx_read = file_in.read          # read from input file
        tmp = ''
        found = ''
        while hx_read:  # reading from file till file is empty
            hx_read = tmp + hx_read
            pos = search_process(hx_read)
            while pos != -1:
                hex_read = hx_read[pos:]
                if FLAG:
                    found = found + hx_read
                    pos = search_process(hx_read)
            tmp = bytes_read[]
            hx_read = file_in.read
        file_out.write(found)  # writing to output file
    except IOError:
        print('FILE NOT FOUND!!! Check your filename or directory/PATH')
Here's a program that looks through a hex string from a file 3 bytes at a time and if the 3-byte hex string is between the given hex bounds, it writes it to another file. It makes use of generators to make getting the bytes from the hex string a little cleaner.
import base64
import sys

_usage_string = 'Usage: python {} <input_file> <output_file>'.format(sys.argv[0])

def _to_base_10_int(value):
    return int(value, 16)

def get_bytes(hex_str):
    # Two characters equals one byte
    for i in range(0, len(hex_str), 2):
        yield hex_str[i:i+2]

def get_three_byte_hexes(hex_str):
    bytes = get_bytes(hex_str)
    while True:
        try:
            three_byte_hex = next(bytes) + next(bytes) + next(bytes)
        except StopIteration:
            break
        yield three_byte_hex

def find_hexes_in_range(hex_str, lower_bound_hex, upper_bound_hex):
    lower_bound = _to_base_10_int(lower_bound_hex)
    upper_bound = _to_base_10_int(upper_bound_hex)
    found = []
    for three_byte_hex in get_three_byte_hexes(hex_str):
        hex_value = _to_base_10_int(three_byte_hex)
        if lower_bound <= hex_value < upper_bound:
            found.append(three_byte_hex)
    return found

if __name__ == "__main__":
    try:
        assert(len(sys.argv) == 3)
    except AssertionError:
        print _usage_string
        sys.exit(2)
    file_contents = open(sys.argv[1], 'rb').read()
    hex_str = base64.decodestring(file_contents).encode('hex')
    found = find_hexes_in_range(hex_str, 'D4135B', 'D414AC')
    print('Found:')
    print(found)
    if found:
        with open(sys.argv[2], 'wb') as fout:
            for _hex in found:
                fout.write(_hex)
Check out some more info on generators here
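A quick interactive sketch of what the generators above produce for a hypothetical hex string:
>>> list(get_bytes('d4135bd4135c'))
['d4', '13', '5b', 'd4', '13', '5c']
>>> list(get_three_byte_hexes('d4135bd4135c'))
['d4135b', 'd4135c']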

python read from fd directly into bytearray

Is there a means to read from a file descriptor (not an IO-like object) directly into a bytearray?
Right now I use a temporary FileIO object to mediate, something like:
import io

def fd_readinto(fd, ba):
    fio = io.FileIO(fd, closefd=False)
    return fio.readinto(ba)
There is no function that does this, and your method is already the fastest approach.
I was going to suggest bytearray(mmap), array.fromfile, and even a homebrew os.read() using bytearray and memoryview, but FileIO.readinto is screaming fast. (It makes sense that it would be because it performs only one system call.)
import os
import mmap, io, array
import timeit

fn = 'path-to-largeish-file'

def fd_readinto_mmap(fd, ba):
    m = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
    ba.extend(m)
    m.close()

def fd_readinto_fio(fd, ba):
    sz = os.fstat(fd).st_size
    ba2 = bytearray(sz)
    with io.FileIO(fd, closefd=False) as fio:
        fio.readinto(ba2)
    ba.extend(ba2)

def fd_readinto_array(fd, ba):
    ar = array.array('c')
    sz = os.fstat(fd).st_size
    fp = os.fdopen(fd, 'rb')
    ar.fromfile(fp, sz)
    ba.extend(ar)

def fd_readinto_mv(fd, ba):
    stat = os.fstat(fd)
    blksize = getattr(stat, 'st_blksize', 4096)
    bufsize = stat.st_size
    buf = bytearray(bufsize)
    m = memoryview(buf)
    while True:
        b = os.read(fd, blksize)
        s = len(b)
        if not s:
            break
        m[:s], m = b, m[s:]
    writtenbytes = buffer(buf, 0, bufsize - len(m))
    ba.extend(writtenbytes)

setup = """
from __main__ import fn, fd_readinto_mmap, fd_readinto_fio, fd_readinto_array, fd_readinto_mv
import os
openfd = lambda : os.open(fn, os.O_RDONLY)
closefd = lambda fd: os.close(fd)
"""

reps = 2
tests = {
    'fio'  : "fd=openfd(); fd_readinto_fio(fd, bytearray()); closefd(fd)",
    'mmap' : "fd=openfd(); fd_readinto_mmap(fd, bytearray()); closefd(fd)",
    'array': "fd=openfd(); fd_readinto_array(fd, bytearray());",
    'mv'   : "fd=openfd(); fd_readinto_mv(fd, bytearray()); closefd(fd)",
}

width = max(map(len, tests))
for n, t in tests.iteritems():
    time = timeit.timeit(t, setup, number=reps)
    print ("{:%s} {}" % width).format(n, time)
On my system (OS X 10.14.6, Python 2.7.10), FileIO is the fastest option:
mmap 7.19839119911
array 5.72453403473
mv 0.49933886528
fio 0.299485206604

Python write to file based on offset

I want to create a Python program which splits up a file into segments of a specified width, and then a consumer program takes the segments and creates a duplicate of the original file. The segments might be out of order, so I intend to use the offset value to write to the file.
Is there a way I can achieve this without creating a local array to hold all the data on the receiving end?
for example,
f = open(file, "wb")
f.seek(offset)
f.write(data)
The idea behind this is that the program that sends the file might not be able to finish sending the file, and will resume again once it has started.
I have sample code below in which the "combine_bytes" function throws an exception when I try placing data at the buffer location.
import sys
import os

def SplitFile(fname, start, end, width):
    t_fileSize = os.path.getsize(fname)
    buffData = bytearray(t_fileSize)
    for line, offset in get_bytes(fname, int(start), int(end), int(width)):
        combine_bytes(buffData, offset, line, width)
        nums = ["%02x" % ord(c) for c in line]
        print " ".join(nums)
    f = open("Green_copy.jpg", "wb")
    f.write(buffData)
    f.close()

def combine_bytes(in_buff, in_offset, in_data, in_width):
    #something like memcpy would be nice
    #in_buff[in_offset:in_offset + in_width] = in_data
    #this works but it's the mother of inefficiency
    i = in_offset
    for c in in_data:
        in_buff.insert(i, c)
        i = i + 1

def get_bytes(fname, start, end, width):
    t_currOffset = start
    t_width = width
    f = open(fname, "r+b")
    if end != 0:
        while t_currOffset < end:
            f.seek(t_currOffset)
            if (t_currOffset + t_width) > end:
                t_width = end - t_currOffset
            t_data = f.read(t_width)
            yield t_data, t_currOffset
            t_currOffset += t_width
    else:
        f.seek(t_currOffset)
        t_data = f.read(t_width)
        while t_data:
            yield t_data, t_currOffset
            t_currOffset += t_width
            f.seek(t_currOffset)
            t_data = f.read(t_width)
    f.close()

if __name__ == '__main__':
    try:
        SplitFile(*sys.argv[1:5])
    except:
        print "Unexpected error:", sys.exc_info()[0]
I still couldn't figure out what your intent is, but this version of combine_bytes will get rid of your "mother of inefficiency" part (which it really is):
def combine_bytes(in_buff, in_offset, in_data, in_width):
    #something like memcpy would be nice
    #in_buff[in_offset:in_offset + in_width] = in_data
    in_buff = in_buff[:in_offset] + in_data + in_buff[in_offset:]
    return in_buff
Of course this creates a new (larger) buffer for each call, and you have to replace your buffer on the caller scope with the one returned:
buffData = combine_bytes(buffData, offset, line, width)
Found it. Here is a better way, which produces what I wanted and is faster: _buffData[t_offset:t_offset + len(t_data)] = bytearray(t_data)
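Since the question also asked whether the receiving end can avoid a local buffer entirely, here is a minimal sketch of writing each segment straight to its offset in the output file; the segment list is a hypothetical example, and Green_copy.jpg reuses the name from the code above:
# Create (or truncate) the output file once, then write segments at their
# offsets as they arrive; no in-memory copy of the whole file is needed.
segments = [(4096, b'second chunk'), (0, b'first chunk')]  # (offset, data) pairs

with open("Green_copy.jpg", "wb") as out:
    for offset, data in segments:
        out.seek(offset)
        out.write(data)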
