Python 3 - Can pickle handle byte objects larger than 4GB?

Based on this comment and the referenced documentation, pickle protocol 4 (available since Python 3.4) should be able to pickle byte objects larger than 4 GB.
However, using python 3.4.3 or python 3.5.0b2 on Mac OS X 10.10.4, I get an error when I try to pickle a large byte array:
>>> import pickle
>>> x = bytearray(8 * 1000 * 1000 * 1000)
>>> fp = open("x.dat", "wb")
>>> pickle.dump(x, fp, protocol = 4)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
OSError: [Errno 22] Invalid argument
Is there a bug in my code or am I misunderstanding the documentation?

Here is a simple workaround for issue 24658. Use pickle.loads or pickle.dumps and break the bytes object into chunks of size 2**31 - 1 to get it in or out of the file.
import pickle
import os.path

file_path = "pkl.pkl"
n_bytes = 2**31
max_bytes = 2**31 - 1
data = bytearray(n_bytes)

## write
bytes_out = pickle.dumps(data)
with open(file_path, 'wb') as f_out:
    for idx in range(0, len(bytes_out), max_bytes):
        f_out.write(bytes_out[idx:idx + max_bytes])

## read
bytes_in = bytearray(0)
input_size = os.path.getsize(file_path)
with open(file_path, 'rb') as f_in:
    for _ in range(0, input_size, max_bytes):
        bytes_in += f_in.read(max_bytes)
data2 = pickle.loads(bytes_in)

assert(data == data2)

To sum up what was answered in the comments:
Yes, Python can pickle byte objects bigger than 4 GB. The observed error is caused by a bug in the implementation (see issue 24658).

Here is the full workaround, though it seems pickle.load no longer attempts a single huge read anymore (I am on Python 3.5.2), so strictly speaking only pickle.dump needs this to work properly.
import pickle


class MacOSFile(object):

    def __init__(self, f):
        self.f = f

    def __getattr__(self, item):
        return getattr(self.f, item)

    def read(self, n):
        # print("reading total_bytes=%s" % n, flush=True)
        if n >= (1 << 31):
            buffer = bytearray(n)
            idx = 0
            while idx < n:
                batch_size = min(n - idx, (1 << 31) - 1)
                # print("reading bytes [%s,%s)..." % (idx, idx + batch_size), end="", flush=True)
                buffer[idx:idx + batch_size] = self.f.read(batch_size)
                # print("done.", flush=True)
                idx += batch_size
            return buffer
        return self.f.read(n)

    def write(self, buffer):
        n = len(buffer)
        print("writing total_bytes=%s..." % n, flush=True)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, (1 << 31) - 1)
            print("writing bytes [%s, %s)... " % (idx, idx + batch_size), end="", flush=True)
            self.f.write(buffer[idx:idx + batch_size])
            print("done.", flush=True)
            idx += batch_size


def pickle_dump(obj, file_path):
    with open(file_path, "wb") as f:
        return pickle.dump(obj, MacOSFile(f), protocol=pickle.HIGHEST_PROTOCOL)


def pickle_load(file_path):
    with open(file_path, "rb") as f:
        return pickle.load(MacOSFile(f))
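Usage of these helpers would then look like this (the path and object size are illustrative):
big = bytearray(8 * 1000 * 1000 * 1000)   # ~8 GB, as in the question
pickle_dump(big, "x.dat")
restored = pickle_load("x.dat")
assert big == restored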

You can specify the protocol for the dump.
If you do pickle.dump(obj, file, protocol=4) it should work.
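A minimal sketch of that, assuming a Python build where issue 24658 is already fixed (otherwise the macOS chunking workaround above is still needed); the file name and object size are illustrative:
import pickle

data = bytearray(8 * 1000 * 1000 * 1000)   # > 4 GB, as in the question
with open("x.dat", "wb") as fp:
    pickle.dump(data, fp, protocol=4)      # protocol 4 adds 64-bit framing for large objects

with open("x.dat", "rb") as fp:
    data2 = pickle.load(fp)
assert data == data2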

Reading a file in 2 GB chunks takes twice as much memory as needed if bytes concatenation is performed; my approach to loading pickles is based on a bytearray:
class MacOSFile(object):

    def __init__(self, f):
        self.f = f

    def __getattr__(self, item):
        return getattr(self.f, item)

    def read(self, n):
        if n >= (1 << 31):
            buffer = bytearray(n)
            pos = 0
            while pos < n:
                size = min(n - pos, (1 << 31) - 1)
                chunk = self.f.read(size)
                buffer[pos:pos + size] = chunk
                pos += size
            return buffer
        return self.f.read(n)
Usage:
with open("/path", "rb") as fin:
obj = pickle.load(MacOSFile(fin))

Had the same issue and fixed it by upgrading to Python 3.6.8.
This seems to be the PR that did it: https://github.com/python/cpython/pull/9937

I also ran into this issue. To solve it, I split the work into several iterations. Say I have 50,000 documents for which I have to compute tf-idf and do kNN classification. When I run it and iterate over all 50,000 directly, it gives me "that error", so I process it in chunks.
tokenized_documents = self.load_tokenized_preprocessing_documents()
idf = self.load_idf_41227()
doc_length = len(documents)

for iteration in range(0, 9):
    tfidf_documents = []
    for index in range(iteration, 4000):
        doc_tfidf = []
        for term in idf.keys():
            tf = self.term_frequency(term, tokenized_documents[index])
            doc_tfidf.append(tf * idf[term])
        doc = documents[index]
        tfidf = [doc_tfidf, doc[0], doc[1]]
        tfidf_documents.append(tfidf)
        print("{} from {} document {}".format(index, doc_length, doc[0]))
    self.save_tfidf_41227(tfidf_documents, iteration)

Related

Read nth line from file efficiently in Python

I have a large txt file that contains about 100,000,000 rows (I cannot read it into memory as a whole). I would like to read the n-th row efficiently. I found How can I get python to read every nth line of a .txt file? and constructed this function:
from itertools import islice

def read_n_line(file: str, n: int, encoding='utf-8') -> str:
    with open(file, encoding=encoding) as f:
        return next(islice(f, n - 1, n))
The problem is that my function is fast (0.5 seconds) for n = 1000, but slow (15 seconds) for n = 10,000,000. Can I somehow improve my function to be fast for all n, please?
For sufficiently large files, it may be more efficient to use a Numba-based approach:
import numba as nb


@nb.njit
def process(
    block,
    n,
    last_nl_pos,
    nl_pos,
    nl_count,
    offset,
    nl=ord("\n")
):
    nl = ord("\n")
    for i, c in enumerate(block, offset):
        if c == nl:
            last_nl_pos = nl_pos
            nl_pos = i
            nl_count += 1
            if nl_count == n:
                break
    return last_nl_pos, nl_pos, nl_count


def read_nth_line_nb(
    filepath: str,
    n: int,
    encoding="utf-8",
    size=2 ** 22,  # 4 MiB
) -> str:
    with open(filepath, "rb") as file_obj:
        last_nl_pos = nl_pos = -1
        nl_count = -1
        offset = 0
        while True:
            block = file_obj.read(size)
            if block:
                last_nl_pos, nl_pos, nl_count = process(
                    block, n, last_nl_pos, nl_pos, nl_count, offset
                )
                offset += size
                if nl_count == n:
                    file_obj.seek(last_nl_pos + 1)
                    return file_obj.read(nl_pos - last_nl_pos).decode(encoding)
            else:
                return
This essentially processes the file in blocks, keeping track of how many newlines have been seen and where they are (and how far into the file the current block starts).
For comparison I use the itertools.islice() approach:
import itertools
def read_nth_line_isl(filepath: str, n: int, encoding="utf-8") -> str:
    with open(filepath, "r", encoding=encoding) as file_obj:
        return next(itertools.islice(file_obj, n, n + 1), None)
as well as the naïve looping:
def read_nth_line_loop(filepath: str, n: int, encoding="utf-8") -> str:
    with open(filepath, "r", encoding=encoding) as file_obj:
        for i, line in enumerate(file_obj):
            if i == n:
                return line
        return None
Assuming some files were created with the following:
import random
import string


def write_some_file(filepath: str, n: int, m: int = 10, l: int = 100, encoding="utf-8") -> None:
    with open(filepath, "w", encoding=encoding) as file_obj:
        for i in range(n):
            line = "".join(random.choices(string.ascii_letters, k=random.randrange(m, l)))
            file_obj.write(f"{i:0{k}d} - {line}\n")


k = 9
for i in range(1, k):
    n_max = 10 ** i
    print(n_max)
    write_some_file(f"test{n_max:0{k}d}.txt", n_max)
It is possible to test that they all give the same result:
funcs = read_nth_line_isl, read_nth_line_loop, read_nth_line_nb

k = 9
n_max = 10 ** 5
filename = f"test{n_max:0{k}d}.txt"
for func in funcs:
    print(f"{func.__name__:>20} {func(filename, n_max - 1)!r}")
# read_nth_line_isl  '000099999 - sWBnniKkpROZYlqfFLbSttEwYCjXfhQSapkkqxjePpGbobJzgaJTCOCSyHQEcLppZ\n'
# read_nth_line_loop '000099999 - sWBnniKkpROZYlqfFLbSttEwYCjXfhQSapkkqxjePpGbobJzgaJTCOCSyHQEcLppZ\n'
# read_nth_line_nb   '000099999 - sWBnniKkpROZYlqfFLbSttEwYCjXfhQSapkkqxjePpGbobJzgaJTCOCSyHQEcLppZ\n'
The timings can be computed with:
k = 9
timings = {}
for i in range(1, k - 1):
    n_max = 10 ** i
    filename = f"test{n_max:0{k}d}.txt"
    print(filename)
    timings[i] = []
    base = funcs[0](filename, n_max - 1)
    for func in funcs:
        res = func(filename, n_max - 1)
        is_good = base == res
        if i < 6:
            timed = %timeit -r 12 -n 12 -q -o func(filename, n_max - 1)
        else:
            timed = %timeit -r 1 -n 1 -q -o func(filename, n_max - 1)
        timing = timed.best * 1e3
        timings[i].append(timing if is_good else None)
        print(f"{func.__name__:>24} {is_good!s:5} {timing:10.3f} ms")
and plotted with:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(data=timings, index=[func.__name__ for func in funcs]).transpose()
df.plot(marker='o', xlabel='log₁₀(Input Size) / #', ylabel='Best timing / µs', figsize=(6, 4), logy=True)
fig = plt.gcf()
fig.patch.set_facecolor('white')
to obtain a plot (not reproduced here) indicating that the Numba-based approach is marginally faster (some 5-15%) for sufficiently large inputs (above 10⁵).

passing files and values as parameter to a function in python

I am a Python newbie trying to run this simple example. I wish to pass files and certain values as parameters to my function latcalc(). Could anyone suggest how I can pass my files and values as parameters? Or is there a better way/approach to do these things?
#!/usr/bin/python

# include the constants
min_length = 1
max_length = 30
# delays
delay = 100
# Speed of light
c_vaccum = 3e8

global filename1
global filename2
global filename3

def openfiles():
    filename1 = open("file1.txt", "w")
    filename2 = open("file2.txt", "w")
    filename3 = open("file3.txt", "w")

def latcalc(filename, target_name, vf):
    target_name = 0
    for length in range(min_length, max_length):
        if length < 2:
            target_name += (length/(vf * c_vaccum))
        elif length == 2:
            target_name += delay
        else:
            target_name = target_name
        myline = "%s\t%s\n" % (length, target_name)
        filename.write(myline)

openfiles()
latcalc(filename1, lat40, 0.4)
latcalc(filename2, lat80, 0.8)
latcalc(filename3, lat100, 1)
I would create a little class (give it a useful name) to encapsulate your data.
If the number of files grows, you only have to change create_lats:
min_length = 1
max_length = 30
# delays
delay = 100
# Speed of light
c_vaccum = 3e8

# Little class to keep our data in one place
class Lat:
    def __init__(self, filename, factor):
        self.filename = filename
        self.factor = factor
        self.file = open(filename, "w")  # let the class open the file

# now our function needs only one parameter, neat!
def latcalc(lat):
    target_name = 0
    for length in range(min_length, max_length):
        if length < 2:
            target_name += (length / (lat.factor * c_vaccum))  # access the class variable
        elif length == 2:
            target_name += delay
        else:
            target_name = target_name
        myline = "%s\t%s\n" % (length, target_name)
        lat.file.write(myline)

def create_lats():
    lats = []
    lats.append(Lat("file1.txt", 0.4))
    lats.append(Lat("file2.txt", 0.8))
    lats.append(Lat("file3.txt", 1))
    return lats

# loop over your lats created in create_lats
for lat in create_lats():
    latcalc(lat)
    lat.file.close()  # close the file
try something like this (notice the globals are gone):
def openfiles(namelist):
    ret = []
    for name in namelist:
        fi = open(name, 'w')
        ret.append(fi)
    return ret

filelist = ['file1.txt', 'file2.txt', 'file3.txt']
handles = openfiles(filelist)

for handle in handles:
    pass  # <do whatever you want with each handle>
handles will be a list of file handles corresponding to the filelist of names.
Note that the file handle is what you pass around to do reads and writes with.
The opens could also be done inside latcalc, since you would apparently be handling one file per call.
As some comments point out, you don't need global variables, and you should close your file handles after you have finished writing to them, which is most conveniently done with 'with' (closing is done for you, even in case of an unexpected exception):
#!/usr/bin/python

min_length = 1
max_length = 3
delay = 100
c_vaccum = 3e8

def latcalc(filename, vf):
    target_name = 0
    for length in range(min_length, max_length):
        if length < 2:
            target_name += (length/(vf * c_vaccum))
        elif length == 2:
            target_name += delay
    myline = "%s\t%d\n" % (length, target_name)
    with open(filename, "w") as f:
        f.write(myline)
    return target_name

lat40 = latcalc("file1.txt", 0.4)
lat80 = latcalc("file2.txt", 0.8)
lat100 = latcalc("file3.txt", 1)
From the way you treat the parameter target_name, I assume you are used to C-style pointers, which do not exist in that form in Python. The parameter is pointless here if you set it to a new value in the first line of latcalc(). Also, you seem to treat target_name as a string when it is an int:
myline="%s\t%s\n" % (length, target_name)
If you need target_name after the method has finished, you would have to return it.
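For example (a minimal sketch reusing the question's names):
def latcalc(filename, vf):
    target_name = 0
    # ... accumulate target_name as in the loop above ...
    return target_name

lat40 = latcalc("file1.txt", 0.4)   # capture the returned value here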
1) open() gives you a file handle, not a filename.
2) Use a "with" statement when opening a file, to avoid "forgetting" to close the file when finished.
#!/usr/bin/python

# include the constants
min_length = 1
max_length = 30
# delays
delay = 100
# Speed of light
c_vaccum = 3e8

def latcalc(filename, target_name, vf):
    with open(filename, "w") as openedFile:
        target_name = 0
        for length in range(min_length, max_length):
            if length < 2:
                target_name += (length/(vf * c_vaccum))
            elif length == 2:
                target_name += delay
            else:
                target_name = target_name
            myline = "%s\t%s\n" % (length, target_name)
            openedFile.write(myline)

latcalc("file1.txt", "lat40", 0.4)
latcalc("file2.txt", "lat80", 0.8)
latcalc("file3.txt", "lat100", 1)

Python File seek + write output weird "NUL" in the file

I'm writing a downloader that splits a URL into parts and downloads them with threading. I probably won't use "join", because join = unable to stream (I cannot write the file until all threads are finished).
The problem is that f.seek and write produce a really weird file: the content always contains "NUL" characters (in Notepad++) and the text in the file is only 1/3 of the whole file.
Hi everybody, and thanks for helping me. Here is version 2.0 of my code; thanks to Padraic Cunningham for his suggestion and explanation, I fixed my code almost exactly as suggested.
So please check the code; I think I will also need your help to convert it to an http.server file-streaming method:
import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re

pool = urllib3.PoolManager(maxsize=10)
URL = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
fileName = "1.js"
countsize = 0

#if os.path.exists(fileName):
#    os.remove(fileName)

def defwrite(filename, data, offset):
    f = open(filename, 'wb')
    f.seek(offset)
    f.write(data)
    f.close()

def buildRange(url, numsplits):
    global pool
    value = int(requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None))
    print("Fullsize: ", value)
    print("Try devide with 3 :", value / 3)
    lst = []
    for i in range(numsplits):
        if i == range(numsplits):
            lst.append('%s-%s' % (i * value//numsplits + 1, i * value//numsplits + 1 + (value - (i * value//numsplits + 1))))
        if i == 0:
            lst.append('%s-%s' % (0, value//numsplits))
        else:
            lst.append('%s-%s' % (i * value//numsplits + 1, (i + 1) * value//numsplits))
    return lst

def main(url=None, splitBy=3):
    global fileName, pool, countsize
    start_time = time.time()
    if not url:
        print("Please Enter some url to begin download.")
        return

    #fileName = "1.jpg"
    #print("%s bytes to download." % sizeInBytes)
    # if not sizeInBytes:
    #     print("Size cannot be determined.")
    #     return
    #sinzeInBytes = buildRange(url,

    dataDict = {}
    f = open(fileName, 'wb')

    # split total num bytes into ranges
    #ranges = buildRange(url,int(sizeInBytes), splitBy)
    ranges = buildRange(url, splitBy)
    print(ranges)

    def downloadChunk(idx, irange):
        print(idx)
        #time.sleep(1*idx)
        #req = urllib.request.Request(url)
        #req.headers['Range'] = 'bytes={}'.format(irange)
        headers = urllib3._collections.HTTPHeaderDict()
        headers.add('Range', 'bytes=' + str(irange))
        data = pool.urlopen('GET', URL, headers=headers).data
        #print(data)
        #print("finish: " + str(irange))
        offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
        print(offset)
        # print(irange)
        f.seek(offset, 0)
        #f.truncate(0)
        #print(f.tell())
        f.write(data)
        #f.read()
        #f.close()
        countsize = countsize + offset
        #defwrite("1.txt", req, re.sub("(^.*?)-", "\\1", str(irange)))

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
        #th.isAlive()
    #for th in downloaders:
    #    th.join()
    #    print(th.join)

    print(countsize)

    #print('done: got {} chunks, total {} bytes'.format(
    #    len(dataDict), sum( (
    ##        len(chunk) for chunk in list(dataDict.values())
    #    ) )
    #))
    #print("--- %s seconds ---" % str(time.time() - start_time))
    # if os.path.exists(fileName):
    #     os.remove(fileName)
    #reassemble file in correct order
    #with open(fileName, 'wb') as fh:
    #    for _idx,chunk in sorted(dataDict.items()):
    #        fh.write(chunk)
    #stream_chunk = 16 * 1024
    #with open(fileName, 'wb') as fp:
    #    while True:
    #        for _idx,chunk in sorted(dataDict.items()):
    #            #fh.write(chunk)
    #            chunking = chunk.read(stream_chunk)
    #            if not chunk:
    #                break
    #            fp.write(chunking)
    #    print("Finished Writing file %s" % fileName)
    #print('file size {} bytes'.format(os.path.getsize(fileName)))

if __name__ == '__main__':
    if os.path.exists(fileName):
        os.remove(fileName)
    main(URL, splitBy=16)
Here is my original code (version 1.0); please ignore it, version 2.0 is above:
import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re

pool = urllib3.PoolManager(maxsize=10)
URL = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
fileName = "1.js"

#if os.path.exists(fileName):
#    os.remove(fileName)

def defwrite(filename, data, offset):
    f = open(filename, 'wb')
    f.seek(offset)
    f.write(data)
    f.close()

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == range(numsplits):
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(value - round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

def main(url=None, splitBy=3):
    global fileName, pool
    start_time = time.time()
    if not url:
        print("Please Enter some url to begin download.")
        return

    #fileName = "1.jpg"
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print("%s bytes to download." % sizeInBytes)
    if not sizeInBytes:
        print("Size cannot be determined.")
        return

    dataDict = {}

    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        print(idx)
        #req = urllib.request.Request(url)
        #req.headers['Range'] = 'bytes={}'.format(irange)
        headers = urllib3._collections.HTTPHeaderDict()
        headers.add('Range', 'bytes=' + str(irange))
        data = pool.urlopen('GET', URL, headers=headers).data
        print(data)
        print("finish: " + str(irange))
        offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
        #print(offset)
        # print(irange)
        f = open(fileName, 'wb')
        f.seek(offset)
        #f.truncate(0)
        #print(f.tell())
        f.write(data)
        #f.read()
        #f.close()
        #defwrite("1.txt", req, re.sub("(^.*?)-", "\\1", str(irange)))

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
        #th.isAlive()
    #for th in downloaders:
    #    th.join()
    #    print(th.join)

    #print('done: got {} chunks, total {} bytes'.format(
    #    len(dataDict), sum( (
    ##        len(chunk) for chunk in list(dataDict.values())
    #    ) )
    #))
    #print("--- %s seconds ---" % str(time.time() - start_time))
    # if os.path.exists(fileName):
    #     os.remove(fileName)
    #reassemble file in correct order
    #with open(fileName, 'wb') as fh:
    #    for _idx,chunk in sorted(dataDict.items()):
    #        fh.write(chunk)
    #stream_chunk = 16 * 1024
    #with open(fileName, 'wb') as fp:
    #    while True:
    #        for _idx,chunk in sorted(dataDict.items()):
    #            #fh.write(chunk)
    #            chunking = chunk.read(stream_chunk)
    #            if not chunk:
    #                break
    #            fp.write(chunking)
    #    print("Finished Writing file %s" % fileName)
    #print('file size {} bytes'.format(os.path.getsize(fileName)))

if __name__ == '__main__':
    main(URL, splitBy=3)
You use three threads whose target function is downloadChunk, and each one opens the file with "wb", which truncates it, so you end up with 1/3 of the content. You also call seek for no apparent reason. If you wanted to append to a file you would open with "a" each time, or just open the file once outside the functions.
You are seeking on an empty (truncated) file and then writing, so that is where the null bytes come from.
If you want to open a file for reading and writing, so you can seek, with line buffering:
with open("whatever.file", "r+b", buffering=1) as f
Then use that file handle to write to; don't keep reopening it in the function and overwriting it. The file must also already exist.
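A minimal sketch of that idea (my own illustration, not the code from the thread): pre-size the file once, open it a single time, and have each thread seek and write under a lock. total_size, ranges and fetch() are hypothetical placeholders:
import threading

def write_chunks(path, total_size, ranges, fetch):
    # Pre-create the file at its final size, then open it once for updating
    # ("r+b" requires the file to exist and keeps its contents).
    with open(path, "wb") as f:
        f.truncate(total_size)

    lock = threading.Lock()
    with open(path, "r+b") as f:
        def worker(offset, length):
            data = fetch(offset, length)   # e.g. an HTTP Range request
            with lock:                     # a handle shared by threads is not thread-safe
                f.seek(offset)
                f.write(data)

        threads = [threading.Thread(target=worker, args=(off, length))
                   for off, length in ranges]
        for t in threads:
            t.start()
        for t in threads:
            t.join()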

Python write to file based on offset

I want to create a Python program which splits a file into segments of a specified width, and then a consumer program takes the segments and creates a duplicate of the original file. The segments might arrive out of order, so I intend to use the offset value to write to the file.
Is there a way I can achieve this without creating a local array to hold all the data on the receiving end?
for example,
f = open(file, "wb")
f.seek(offset)
f.write(data)
The idea behind this is that the program that sends the file might not be able to finish sending the file, and will resume again once it has started.
I have sample code below in which the combine_bytes function throws an exception when I try to place data at the buffer location.
import sys
import os

def SplitFile(fname, start, end, width):
    t_fileSize = os.path.getsize(fname)
    buffData = bytearray(t_fileSize)
    for line, offset in get_bytes(fname, int(start), int(end), int(width)):
        combine_bytes(buffData, offset, line, width)
        nums = ["%02x" % ord(c) for c in line]
        print " ".join(nums)
    f = open("Green_copy.jpg", "wb")
    f.write(buffData)
    f.close()

def combine_bytes(in_buff, in_offset, in_data, in_width):
    #something like memcpy would be nice
    #in_buff[in_offset:in_offset + in_width] = in_data
    #this works but it's the mother of inefficiency
    i = in_offset
    for c in in_data:
        in_buff.insert(i, c)
        i = i + 1

def get_bytes(fname, start, end, width):
    t_currOffset = start
    t_width = width
    f = open(fname, "r+b")
    if end != 0:
        while t_currOffset < end:
            f.seek(t_currOffset)
            if (t_currOffset + t_width) > end:
                t_width = end - t_currOffset
            t_data = f.read(t_width)
            yield t_data, t_currOffset
            t_currOffset += t_width
    else:
        f.seek(t_currOffset)
        t_data = f.read(t_width)
        while t_data:
            yield t_data, t_currOffset
            t_currOffset += t_width
            f.seek(t_currOffset)
            t_data = f.read(t_width)
    f.close()

if __name__ == '__main__':
    try:
        SplitFile(*sys.argv[1:5])
    except:
        print "Unexpected error:", sys.exc_info()[0]
I still couldn't figure out your intent, but this version of combine_bytes will get rid of your "mother of inefficiency" part (which it really is):
def combine_bytes(in_buff, in_offset, in_data, in_width):
    #something like memcpy would be nice
    #in_buff[in_offset:in_offset + in_width] = in_data
    in_buff = in_buff[:in_offset] + in_data + in_buff[in_offset:]
    return in_buff
Of course, this creates a new (larger) buffer for each call, and you have to replace the buffer in the caller's scope with the one returned:
buffData = combine_bytes(buffData, offset, line, width)
Found it. Here is a better way which produces what I wanted and is faster:
_buffData[t_offset:t_offset + len(t_data)] = bytearray(t_data)
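For the other part of the question, avoiding the in-memory buffer entirely, here is a minimal sketch (my own assumption, not from the thread: it requires knowing the final file size up front so the output can be pre-allocated):
import os

def write_segment(path, offset, data, total_size):
    # Pre-allocate the output once, then write each segment directly at its
    # offset -- no whole-file bytearray on the receiving side.
    if not os.path.exists(path):
        with open(path, "wb") as f:
            f.truncate(total_size)
    with open(path, "r+b") as f:   # "r+b" keeps the existing contents
        f.seek(offset)
        f.write(data)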

Compute CRC of file in Python

I want to calculate the CRC of a file and get output like E45A12AC. Here's my code:
#!/usr/bin/env python
import os, sys
import zlib

def crc(fileName):
    fd = open(fileName, "rb")
    content = fd.readlines()
    fd.close()
    for eachLine in content:
        zlib.crc32(eachLine)

for eachFile in sys.argv[1:]:
    crc(eachFile)
This calculates the CRC for each line, but its output (e.g. -1767935985) is not what I want.
Hashlib works the way I want, but it computes the md5:
import hashlib
m = hashlib.md5()
for line in open('data.txt', 'rb'):
    m.update(line)
print m.hexdigest()
Is it possible to get something similar using zlib.crc32?
A little more compact and optimized code:
def crc(fileName):
    prev = 0
    for eachLine in open(fileName, "rb"):
        prev = zlib.crc32(eachLine, prev)
    return "%X" % (prev & 0xFFFFFFFF)
PS2: The old PS is deprecated, and therefore deleted, because of the suggestion in the comment. Thank you. I don't get how I missed this, but it was really good.
A modified version of kobor42's answer, with performance improved by a factor 2-3 by reading fixed size chunks instead of "lines":
import zlib
def crc32(fileName):
    with open(fileName, 'rb') as fh:
        hash = 0
        while True:
            s = fh.read(65536)
            if not s:
                break
            hash = zlib.crc32(s, hash)
        return "%08X" % (hash & 0xFFFFFFFF)
Also includes leading zeroes in the returned string.
hashlib-compatible interface for CRC-32 support:
import zlib
class crc32(object):
    name = 'crc32'
    digest_size = 4
    block_size = 1

    def __init__(self, arg=b''):
        self.__digest = 0
        self.update(arg)

    def copy(self):
        copy = super(self.__class__, self).__new__(self.__class__)
        copy.__digest = self.__digest
        return copy

    def digest(self):
        return self.__digest

    def hexdigest(self):
        return '{:08x}'.format(self.__digest)

    def update(self, arg):
        self.__digest = zlib.crc32(arg, self.__digest) & 0xffffffff

# Now you can define hashlib.crc32 = crc32
import hashlib
hashlib.crc32 = crc32

# Python > 2.7: hashlib.algorithms += ('crc32',)
# Python > 3.2: hashlib.algorithms_available.add('crc32')
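Usage then looks like any other hashlib object (the input bytes are arbitrary examples):
h = crc32(b"hello ")
h.update(b"world")
print(h.hexdigest())   # eight hex digits, equal to format(zlib.crc32(b"hello world") & 0xffffffff, '08x')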
To show any integer's lowest 32 bits as 8 hexadecimal digits, without sign, you can "mask" the value by bit-and'ing it with a mask made of 32 bits all at value 1, then apply formatting. I.e.:
>>> x = -1767935985
>>> format(x & 0xFFFFFFFF, '08x')
'969f700f'
It's quite irrelevant whether the integer you are thus formatting comes from zlib.crc32 or any other computation whatsoever.
Python 3.8+ (using the walrus operator):
import zlib

def crc32(filename, chunksize=65536):
    """Compute the CRC-32 checksum of the contents of the given filename"""
    with open(filename, "rb") as f:
        checksum = 0
        while chunk := f.read(chunksize):
            checksum = zlib.crc32(chunk, checksum)
        return checksum
chunksize is how many bytes to read from the file at a time. You will get the same CRC for the same file no matter what you set chunksize to (it has to be > 0), but setting it too low might make your code slow, too high might use too much memory.
The result is a 32 bit integer. The CRC-32 checksum of an empty file is 0.
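To get output like the E45A12AC in the question, format the returned integer as eight upper-case hex digits:
print(format(crc32("data.txt"), "08X"))   # e.g. 'E45A12AC'-style output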
Edited to include Altren's solution below.
A modified and more compact version of CrouZ's answer, with a slightly improved performance, using a for loop and file buffering:
def forLoopCrc(fpath):
    """With for loop and buffer."""
    crc = 0
    with open(fpath, 'rb', 65536) as ins:
        for x in range(int((os.stat(fpath).st_size / 65536)) + 1):
            crc = zlib.crc32(ins.read(65536), crc)
    return '%08X' % (crc & 0xFFFFFFFF)
Results, on a 6700K with an HDD:
(Note: Retested multiple times and it was consistently faster.)
Warming up the machine...
Finished.
Beginning tests...
File size: 90288KB
Test cycles: 500
With for loop and buffer.
Result 45.24728019630359
CrouZ solution
Result 45.433838356097894
kobor42 solution
Result 104.16215688703986
Altren solution
Result 101.7247863946586
Tested in Python 3.6.4 x64 using the script below:
import os, timeit, zlib, random, binascii

def forLoopCrc(fpath):
    """With for loop and buffer."""
    crc = 0
    with open(fpath, 'rb', 65536) as ins:
        for x in range(int((os.stat(fpath).st_size / 65536)) + 1):
            crc = zlib.crc32(ins.read(65536), crc)
    return '%08X' % (crc & 0xFFFFFFFF)

def crc32(fileName):
    """CrouZ solution"""
    with open(fileName, 'rb') as fh:
        hash = 0
        while True:
            s = fh.read(65536)
            if not s:
                break
            hash = zlib.crc32(s, hash)
        return "%08X" % (hash & 0xFFFFFFFF)

def crc(fileName):
    """kobor42 solution"""
    prev = 0
    for eachLine in open(fileName, "rb"):
        prev = zlib.crc32(eachLine, prev)
    return "%X" % (prev & 0xFFFFFFFF)

def crc32altren(filename):
    """Altren solution"""
    buf = open(filename, 'rb').read()
    hash = binascii.crc32(buf) & 0xFFFFFFFF
    return "%08X" % hash

fpath = r'D:\test\test.dat'
tests = {forLoopCrc: 'With for loop and buffer.',
         crc32: 'CrouZ solution', crc: 'kobor42 solution',
         crc32altren: 'Altren solution'}
count = 500

# CPU, HDD warmup
randomItm = [x for x in tests.keys()]
random.shuffle(randomItm)
print('\nWarming up the machine...')
for c in range(count):
    randomItm[0](fpath)
print('Finished.\n')

# Begin test
print('Beginning tests...\nFile size: %dKB\nTest cycles: %d\n' % (
    os.stat(fpath).st_size/1024, count))
for x in tests:
    print(tests[x])
    start_time = timeit.default_timer()
    for c in range(count):
        x(fpath)
    print('Result', timeit.default_timer() - start_time, '\n')
It is faster because for loops are faster than while loops (sources: here and here).
Merge the above 2 codes as below:
try:
    fd = open(decompressedFile, "rb")
except IOError:
    logging.error("Unable to open the file in readmode:" + decompressedFile)
    return 4

eachLine = fd.readline()
prev = 0
while eachLine:
    prev = zlib.crc32(eachLine, prev)
    eachLine = fd.readline()
fd.close()
There is a faster and more compact way to compute the CRC using binascii:
import binascii

def crc32(filename):
    buf = open(filename, 'rb').read()
    hash = binascii.crc32(buf) & 0xFFFFFFFF
    return "%08X" % hash
You can use base64 for getting output like [ERD45FTR], and zlib.crc32 provides an update option:
import os, sys
import zlib
import base64

def crc(fileName):
    fd = open(fileName, "rb")
    content = fd.readlines()
    fd.close()
    prev = None
    for eachLine in content:
        if not prev:
            prev = zlib.crc32(eachLine)
        else:
            prev = zlib.crc32(eachLine, prev)
    return prev

for eachFile in sys.argv[1:]:
    print base64.b64encode(str(crc(eachFile)))
Solution:
import os, sys
import zlib

def crc(fileName, excludeLine="", includeLine=""):
    try:
        fd = open(fileName, "rb")
    except IOError:
        print "Unable to open the file in readmode:", fileName
        return
    eachLine = fd.readline()
    prev = None
    while eachLine:
        if excludeLine and eachLine.startswith(excludeLine):
            eachLine = fd.readline()  # skip the excluded line but keep reading
            continue
        if not prev:
            prev = zlib.crc32(eachLine)
        else:
            prev = zlib.crc32(eachLine, prev)
        eachLine = fd.readline()
    fd.close()
    return format(prev & 0xFFFFFFFF, '08x')  # returns 8-digit crc

for eachFile in sys.argv[1:]:
    print crc(eachFile)
I don't really know what (excludeLine="", includeLine="") is for...
