How to zip a very large file in python - python

I would like to zip a couple of files that may amount to about 99 GB using python. What is the most efficient way to do this using the zipfile library? This is a sample code I have
# Question code (Google App Engine + cloudstorage): build a zip in GCS from
# other GCS objects.  NOTE(review): gcs_reader.read() loads each member fully
# into memory, which will not scale to ~99 GB of input — presumably the
# motivation for the streaming writebuffered() method below.
with gcs.open(zip_file_name, 'w', content_type=b'application/zip') as f:
    with zipfile.ZipFile(f, 'w') as z:
        for file in files:
            # Page owners and file owners always see the file.
            is_owner = (is_page_allowed_to_visitor(page, visitor) or (file.owner_id == visitor.id) )
            if is_owner:
                file.show = True
            elif file.available_from:
                if file.available_from > datetime.now():
                    file.show = False
            elif file.available_to:
                if file.available_to < datetime.now():
                    file.show = False
            else:
                file.show = True
            if file.show:
                file_name = "/%s/%s" % (gcs_store.get_bucket_name(), file.gcs_name)
                gcs_reader = gcs.open(file_name, 'r')
                z.writestr('%s-%s' %(file.created_on, file.name), gcs_reader.read() )
                gcs_reader.close()
    f.close() #closing zip file (redundant: the enclosing with-statement also closes f)
Some points to note:
1) I am using the google app engine to host the files so I cannot use the zipfile.write() method. I can only get the file contents in bytes.
Thanks in advance

I have added a new method to the zipfile library. This enhanced zipfile library is open source and can be found on github (EnhancedZipFile). I added a new method inspired by the zipfile.write() method and the zipfile.writestr() method.
def writebuffered(self, zinfo_or_arcname, file_pointer, file_size, compress_type=None):
    """Stream the contents of *file_pointer* into the archive in 8 KiB chunks.

    Unlike ZipFile.writestr(), this never holds the whole member in memory,
    which makes it suitable for very large files.

    zinfo_or_arcname -- archive name (str) or a pre-built ZipInfo
    file_pointer     -- readable binary file-like object, positioned at start
    file_size        -- uncompressed size in bytes (used for the zip64 decision)
    compress_type    -- optional override for the compression method
    """
    if not isinstance(zinfo_or_arcname, ZipInfo):
        zinfo = ZipInfo(filename=zinfo_or_arcname,
                        date_time=time.localtime(time.time())[:6])
        zinfo.compress_type = self.compression
        if zinfo.filename[-1] == '/':
            zinfo.external_attr = 0o40775 << 16   # drwxrwxr-x
            zinfo.external_attr |= 0x10           # MS-DOS directory flag
        else:
            zinfo.external_attr = 0o600 << 16     # ?rw-------
    else:
        zinfo = zinfo_or_arcname
    if compress_type is not None:
        # BUG FIX: the original accepted this parameter but silently ignored it.
        zinfo.compress_type = compress_type

    zinfo.file_size = file_size              # uncompressed size
    zinfo.header_offset = self.fp.tell()     # start of header bytes
    self._writecheck(zinfo)
    self._didModify = True

    # CRC and sizes are unknown yet: write zeros now, patch the header later.
    zinfo.CRC = CRC = 0
    zinfo.compress_size = compress_size = 0
    # Compressed size can be larger than uncompressed size.
    zip64 = self._allowZip64 and zinfo.file_size * 1.05 > ZIP64_LIMIT
    self.fp.write(zinfo.FileHeader(zip64))

    if zinfo.compress_type == ZIP_DEFLATED:
        cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
                                zlib.DEFLATED, -15)
    else:
        cmpr = None

    file_size = 0
    while True:
        buf = file_pointer.read(1024 * 8)
        if not buf:
            break
        file_size = file_size + len(buf)
        CRC = crc32(buf, CRC) & 0xffffffff
        if cmpr:
            buf = cmpr.compress(buf)
            compress_size = compress_size + len(buf)
        self.fp.write(buf)
    if cmpr:
        buf = cmpr.flush()
        compress_size = compress_size + len(buf)
        self.fp.write(buf)
        zinfo.compress_size = compress_size
    else:
        zinfo.compress_size = file_size
    zinfo.CRC = CRC
    zinfo.file_size = file_size

    if not zip64 and self._allowZip64:
        if file_size > ZIP64_LIMIT:
            raise RuntimeError('File size has increased during compressing')
        if compress_size > ZIP64_LIMIT:
            raise RuntimeError('Compressed size larger than uncompressed size')

    # BUG FIX: the original only *said* it rewrote the header.  Actually seek
    # back and rewrite the local file header with the real CRC and sizes,
    # then restore the write position.
    position = self.fp.tell()
    self.fp.seek(zinfo.header_offset, 0)
    self.fp.write(zinfo.FileHeader(zip64))
    self.fp.seek(position, 0)
    self.fp.flush()
    # Modern ZipFile.close() seeks to start_dir before writing the central
    # directory; keep it in sync so our data is not overwritten.
    if hasattr(self, 'start_dir'):
        self.start_dir = position
    self.filelist.append(zinfo)
    self.NameToInfo[zinfo.filename] = zinfo
Points to note
I am a newbie in python so the code I wrote above may not be very optimized.
Please contribute to the project on github here https://github.com/najela/EnhancedZipFile

Related

Read ZIP files from S3 without downloading the entire file

We have ZIP files that are 5-10GB in size. The typical ZIP file has 5-10 internal files, each 1-5 GB in size uncompressed.
I have a nice set of Python tools for reading these files. Basically, I can open a filename and if there is a ZIP file, the tools search in the ZIP file and then open the compressed file. It's all rather transparent.
I want to store these files in Amazon S3 as compressed files. I can fetch ranges of S3 files, so it should be possible to fetch the ZIP central directory (it's the end of the file, so I can just read the last 64KiB), find the component I want, download that, and stream directly to the calling process.
So my question is, how do I do that through the standard Python ZipFile API? It isn't documented how to replace the filesystem transport with an arbitrary object that supports POSIX semantics. Is this possible without rewriting the module?
Here's an approach which does not need to fetch the entire file (full version available here).
It does require boto (or boto3), though (unless you can mimic the ranged GETs via AWS CLI; which I guess is quite possible as well).
import sys
import zlib
import zipfile
import io
import boto
from boto.s3.connection import OrdinaryCallingFormat
# range-fetches a S3 key
def fetch(key, start, length):
    """Return *length* bytes of boto *key* starting at *start* (ranged GET).

    The third parameter was renamed from `len`, which shadowed the builtin;
    all calls in this snippet are positional, so callers are unaffected.
    """
    end = start + length - 1
    return key.get_contents_as_string(headers={"Range": "bytes=%d-%d" % (start, end)})
# parses 2 or 4 little-endian bytes into their corresponding integer value
def parse_int(data):
    """Return the little-endian unsigned integer stored in 2 or 4 bytes.

    BUG FIX (Python 3): the original called ord() on indexed bytes, which
    only works on Python 2 str; int.from_bytes handles bytes natively.
    """
    width = 4 if len(data) > 3 else 2
    return int.from_bytes(data[:width], "little")
"""
bucket: name of the bucket
key: path to zipfile inside bucket
entry: pathname of zip entry to be retrieved (path/to/subdir/file.name)
"""
# OrdinaryCallingFormat prevents certificate errors on bucket names with dots
# https://stackoverflow.com/questions/51604689/read-zip-files-from-amazon-s3-using-boto3-and-python#51605244
_bucket = boto.connect_s3(calling_format=OrdinaryCallingFormat()).get_bucket(bucket)
_key = _bucket.get_key(key)
# fetch the last 22 bytes (end-of-central-directory record; assuming the comment field is empty)
size = _key.size
eocd = fetch(_key, size - 22, 22)
# start offset and size of the central directory
cd_start = parse_int(eocd[16:20])
cd_size = parse_int(eocd[12:16])
# fetch central directory, append EOCD, and open as zipfile!
cd = fetch(_key, cd_start, cd_size)
zip = zipfile.ZipFile(io.BytesIO(cd + eocd))
for zi in zip.filelist:
if zi.filename == entry:
# local file header starting at file name length + file content
# (so we can reliably skip file name and extra fields)
# in our "mock" zipfile, `header_offset`s are negative (probably because the leading content is missing)
# so we have to add to it the CD start offset (`cd_start`) to get the actual offset
file_head = fetch(_key, cd_start + zi.header_offset + 26, 4)
name_len = parse_int(file_head[0:2])
extra_len = parse_int(file_head[2:4])
content = fetch(_key, cd_start + zi.header_offset + 30 + name_len + extra_len, zi.compress_size)
# now `content` has the file entry you were looking for!
# you should probably decompress it in context before passing it to some other program
if zi.compress_type == zipfile.ZIP_DEFLATED:
print zlib.decompressobj(-15).decompress(content)
else:
print content
break
In your case you might need to write the fetched content to a local file (due to large size), unless memory usage is not a concern.
So here is the code that allows you to open a file on Amazon S3 as if it were a normal file. Notice I use the aws command, rather than the boto3 Python module. (I don't have access to boto3.) You can open the file and seek on it. The file is cached locally. If you open the file with the Python ZipFile API and it's a ZipFile, you can then read individual parts. You can't write, though, because S3 doesn't support partial writes.
Separately, I implement s3open(), which can open a file for reading or writing, but it doesn't implement the seek interface, which is required by ZipFile.
from urllib.parse import urlparse
from subprocess import run,Popen,PIPE
import copy
import json
import os
import tempfile
# Tools for reading and write files from Amazon S3 without boto or boto3
# http://boto.cloudhackers.com/en/latest/s3_tut.html
# but it is easier to use the aws cli, since it's configured to work.
def s3open(path, mode="r", encoding=None):
    """
    Open an s3 file for reading or writing via the aws CLI. Can handle any
    size, but cannot seek.

    Returns the stdout pipe of ``aws s3 cp`` for reading, or its stdin pipe
    for writing.  Binary modes require encoding=None; text modes default to
    utf-8.  Append ('a') and update ('+') modes are not supported.
    """
    # (removed the duplicate function-local `from subprocess import ...`;
    # the module-level import already provides Popen and PIPE)
    if "b" in mode:
        assert encoding is None
    else:
        if encoding is None:
            encoding = "utf-8"
    assert 'a' not in mode
    assert '+' not in mode
    if "r" in mode:
        p = Popen(['aws', 's3', 'cp', path, '-'], stdout=PIPE, encoding=encoding)
        return p.stdout
    elif "w" in mode:
        p = Popen(['aws', 's3', 'cp', '-', path], stdin=PIPE, encoding=encoding)
        return p.stdin
    else:
        raise RuntimeError("invalid mode:{}".format(mode))
CACHE_SIZE = 4096   # big enough for front and back caches
MAX_READ = 65536 * 16
debug = False

class S3File:
    """Open an S3 file that can be seeked.

    Ranged GETs are issued through the ``aws`` CLI and staged in a local
    temporary file; the first and last CACHE_SIZE bytes are cached so that
    ZipFile-style reads of the central directory avoid repeated fetches.
    """

    def __init__(self, name, mode='rb'):
        self.name = name
        self.url = urlparse(name)
        if self.url.scheme != 's3':
            raise RuntimeError("url scheme is {}; expecting s3".format(self.url.scheme))
        self.bucket = self.url.netloc
        self.key = self.url.path[1:]
        self.fpos = 0
        self.tf = tempfile.NamedTemporaryFile()
        cmd = ['aws', 's3api', 'list-objects', '--bucket', self.bucket,
               '--prefix', self.key, '--output', 'json']
        data = json.loads(Popen(cmd, encoding='utf8', stdout=PIPE).communicate()[0])
        file_info = data['Contents'][0]
        self.length = file_info['Size']
        self.ETag = file_info['ETag']
        # Load the caches.
        self.frontcache = self._readrange(0, CACHE_SIZE)
        if self.length > CACHE_SIZE:
            self.backcache_start = self.length - CACHE_SIZE
            if debug: print("backcache starts at {}".format(self.backcache_start))
            self.backcache = self._readrange(self.backcache_start, CACHE_SIZE)
        else:
            self.backcache = None

    def _readrange(self, start, length):
        # Copies everything into the named temporary file rather than a pipe,
        # because the pipes weren't showing up in /dev/fd/?
        cmd = ['aws', 's3api', 'get-object', '--bucket', self.bucket,
               '--key', self.key, '--output', 'json',
               '--range', 'bytes={}-{}'.format(start, start + length - 1), self.tf.name]
        if debug: print(cmd)
        data = json.loads(Popen(cmd, encoding='utf8', stdout=PIPE).communicate()[0])
        if debug: print(data)
        self.tf.seek(0)          # go to the beginning of the data just read
        return self.tf.read(length)

    def __repr__(self):
        return "FakeFile<name:{} url:{}>".format(self.name, self.url)

    def read(self, length=-1):
        # If length == -1, read up to MAX_READ bytes, to the end of the file.
        if length == -1:
            # BUG FIX: the original added 1 here, requesting one byte past EOF.
            length = min(MAX_READ, self.length - self.fpos)
        if debug:
            print("read: fpos={} length={}".format(self.fpos, length))
        # Can we satisfy from the front cache?
        if self.fpos < CACHE_SIZE and self.fpos + length < CACHE_SIZE:
            if debug: print("front cache")
            buf = self.frontcache[self.fpos:self.fpos + length]
            self.fpos += len(buf)
            return buf
        # Can we satisfy from the back cache?
        if self.backcache and (self.length - CACHE_SIZE < self.fpos):
            if debug: print("back cache")
            buf = self.backcache[self.fpos - self.backcache_start:
                                 self.fpos - self.backcache_start + length]
            self.fpos += len(buf)
            return buf
        buf = self._readrange(self.fpos, length)
        self.fpos += len(buf)
        return buf

    def seek(self, offset, whence=0):
        if debug: print("seek({},{})".format(offset, whence))
        if whence == 0:
            self.fpos = offset
        elif whence == 1:
            self.fpos += offset
        elif whence == 2:
            self.fpos = self.length + offset
        else:
            raise RuntimeError("whence={}".format(whence))
        if debug: print(" ={} (self.length={})".format(self.fpos, self.length))

    def tell(self):
        return self.fpos

    def write(self, *args, **kwargs):
        # BUG FIX: the original took no data argument, so callers got a
        # TypeError instead of the intended RuntimeError.
        raise RuntimeError("Write not supported")

    def flush(self):
        raise RuntimeError("Flush not supported")

    def close(self):
        return
Here's an improved version of the already given solution - now it uses boto3 and handles files which are larger than 4GiB:
import boto3
import io
import struct
import zipfile
# Shared boto3 client and zip-format constants used by the helpers below.
s3 = boto3.client('s3')
EOCD_RECORD_SIZE = 22             # fixed size of the end-of-central-directory record
ZIP64_EOCD_RECORD_SIZE = 56       # fixed size of the zip64 EOCD record
ZIP64_EOCD_LOCATOR_SIZE = 20      # fixed size of the zip64 EOCD locator
MAX_STANDARD_ZIP_SIZE = 4_294_967_295   # 2**32 - 1: beyond this a zip must use zip64
def lambda_handler(event, context=None):
    """AWS Lambda entry point: print the member list of the zip named in *event*.

    BUG FIX: Lambda invokes handlers as handler(event, context); the original
    one-argument signature raised TypeError at invocation time.  The default
    keeps direct one-argument calls working.
    """
    bucket = event['bucket']
    key = event['key']
    zip_file = get_zip_file(bucket, key)
    print_zip_content(zip_file)
def get_zip_file(bucket, key):
    """Build a ZipFile from just the tail (central directory) of s3://bucket/key."""
    file_size = get_file_size(bucket, key)
    eocd_record = fetch(bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE)
    if file_size > MAX_STANDARD_ZIP_SIZE:
        # Zip64 archive: the EOCD is preceded by a zip64 EOCD record and locator.
        zip64_eocd_record = fetch(bucket, key,
                                  file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE),
                                  ZIP64_EOCD_RECORD_SIZE)
        zip64_eocd_locator = fetch(bucket, key,
                                   file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
                                   ZIP64_EOCD_LOCATOR_SIZE)
        cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
        central_directory = fetch(bucket, key, cd_start, cd_size)
        tail = central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record
    else:
        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
        central_directory = fetch(bucket, key, cd_start, cd_size)
        tail = central_directory + eocd_record
    return zipfile.ZipFile(io.BytesIO(tail))
def get_file_size(bucket, key):
    """Size in bytes of s3://bucket/key, obtained via a HEAD request."""
    return s3.head_object(Bucket=bucket, Key=key)['ContentLength']
def fetch(bucket, key, start, length):
    """Ranged GET: return *length* bytes of s3://bucket/key starting at *start*."""
    byte_range = "bytes=%d-%d" % (start, start + length - 1)
    return s3.get_object(Bucket=bucket, Key=key, Range=byte_range)['Body'].read()
def get_central_directory_metadata_from_eocd(eocd):
    """Return (cd_start, cd_size) parsed from a standard EOCD record."""
    return (parse_little_endian_to_int(eocd[16:20]),
            parse_little_endian_to_int(eocd[12:16]))
def get_central_directory_metadata_from_eocd64(eocd64):
    """Return (cd_start, cd_size) parsed from a zip64 EOCD record."""
    return (parse_little_endian_to_int(eocd64[48:56]),
            parse_little_endian_to_int(eocd64[40:48]))
def parse_little_endian_to_int(little_endian_bytes):
    """Decode 4 or 8 little-endian bytes as an UNSIGNED integer.

    BUG FIX: the original used the signed formats "i"/"q", which turned
    central-directory offsets >= 2 GiB (4-byte) or >= 2**63 (8-byte) into
    negative numbers and broke the subsequent ranged GET.
    """
    format_character = "I" if len(little_endian_bytes) == 4 else "Q"
    return struct.unpack("<" + format_character, little_endian_bytes)[0]
def print_zip_content(zip_file):
    """Print the member names of an already-open ZipFile."""
    names = [info.filename for info in zip_file.filelist]
    print(f"Files: {names}")
import io
class S3File(io.RawIOBase):
    """Seekable, read-only file object backed by ranged GETs on an S3 object.

    *s3_object* is expected to expose ``content_length`` and ``get(Range=...)``
    (a boto3 ``Object``).  Each read() issues one ranged GET, so ZipFile can
    seek around a large archive without downloading all of it.
    """

    def __init__(self, s3_object):
        self.s3_object = s3_object
        self.position = 0

    def __repr__(self):
        return "<%s s3_object=%r>" % (type(self).__name__, self.s3_object)

    @property  # BUG FIX: this decorator was mangled to the comment "#property",
               # leaving `size` a plain method and breaking every `self.size` use.
    def size(self):
        return self.s3_object.content_length

    def tell(self):
        return self.position

    def seek(self, offset, whence=io.SEEK_SET):
        if whence == io.SEEK_SET:
            self.position = offset
        elif whence == io.SEEK_CUR:
            self.position += offset
        elif whence == io.SEEK_END:
            self.position = self.size + offset
        else:
            raise ValueError("invalid whence (%r, should be %d, %d, %d)" % (
                whence, io.SEEK_SET, io.SEEK_CUR, io.SEEK_END
            ))
        return self.position

    def seekable(self):
        return True

    def read(self, size=-1):
        if size == -1:
            # Read to the end of the file
            range_header = "bytes=%d-" % self.position
            self.seek(offset=0, whence=io.SEEK_END)
        else:
            new_position = self.position + size
            # If we're going to read beyond the end of the object, return
            # the entire remainder of the object.
            if new_position >= self.size:
                return self.read()
            range_header = "bytes=%d-%d" % (self.position, new_position - 1)
            self.seek(offset=size, whence=io.SEEK_CUR)
        return self.s3_object.get(Range=range_header)["Body"].read()

    def readable(self):
        return True
if __name__ == "__main__":
import zipfile
import boto3
s3 = boto3.resource("s3")
s3_object = s3.Object(bucket_name="bukkit", key="bagit.zip")
s3_file = S3File(s3_object)
with zipfile.ZipFile(s3_file) as zf:
print(zf.namelist())
Reference:
https://alexwlchan.net/2019/02/working-with-large-s3-objects/

Should I be worried about FFT being off by a factor of 6

Everything seems to be working fine when I am trying to turn a wav file into a dataset (csv file) When I attempt to FFT it, it seems to work fine with the exception of it being off by a factor of 6, is this something I ought to worry about?
Furthermore another question I have is regarding a wav file, so I found the data inside and got ints out of them. They are 2 byte signed shorts, so I am aware that they have limits of -32767-32767 and are the amplitude. However I am confused on how do these convert into actual volume?
Code
import pyaudio
import wave
import time
import sys
import struct
import csv
# all files and stuffs need to contain their tags ('potatos.wav')
def wavToData(wavName):
    """Read a wav file and return (samples, num_frames, sample_rate).

    *samples* is a tuple of ints: unsigned bytes for 8-bit audio, signed
    shorts for 16-bit.  Channels are left interleaved as stored.
    """
    stream = wave.open(wavName)
    try:
        num_channels = stream.getnchannels()
        sample_rate = stream.getframerate()
        sample_width = stream.getsampwidth()
        num_frames = stream.getnframes()
        raw_data = stream.readframes(num_frames)  # returns byte data
    finally:
        stream.close()
    total_samples = num_frames * num_channels
    if sample_width == 1:
        fmt = "%iB" % total_samples  # read unsigned chars
    elif sample_width == 2:
        fmt = "%ih" % total_samples  # read signed 2 byte shorts
    else:
        raise ValueError("Only supports 8 and 16 bit audio formats.")
    # (the original recomputed total_samples a second time here — removed)
    integer_data = struct.unpack(fmt, raw_data)  # this is amplitude
    return integer_data, num_frames, sample_rate
def dataToWav(wavName, data):
    """Write *data* (iterable of signed 16-bit ints) as a mono 44.1 kHz wav.

    IMPORTANT: hard coded standard values, referenced from the audiocheck.net
    files.
    """
    stream = wave.open(wavName, "wb")
    try:
        stream.setnchannels(1)
        stream.setframerate(44100)
        stream.setsampwidth(2)
        numFrames = len(data)
        stream.setnframes(numFrames)
        fmt = "%ih" % numFrames
        stream.writeframes(struct.pack(fmt, *data))  # packs all samples
    finally:
        # BUG FIX: the original never closed the writer, so the wav header
        # was not guaranteed to be finalised on disk.
        stream.close()
def wavToCsv(wavName, csvName):
    """Dump a wav file into a single-row CSV: frame indices then amplitudes.

    BUG FIX (Python 3): csv.writer needs a text-mode file opened with
    newline=''; the original opened the file in 'wb', which raises TypeError.
    """
    with open(csvName, 'w', newline='') as csvfile:  # contain all data in 1 row
        # x = time (1 frame)
        # y = amplitude
        filewriter = csv.writer(csvfile, delimiter=',',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        # get data
        integer_data, num_frames, sample_rate = wavToData(wavName)
        # add headings and other needed data
        frameCount = [x + 1 for x in range(num_frames)]  # x
        fullData = [x for x in integer_data]             # y
        frameCount.insert(0, "Time (increments of %f)" % (1 / sample_rate))
        fullData.insert(0, "Frequency(dBfs)")
        fullData = frameCount + fullData
        filewriter.writerow(fullData)
def mergeWavData(wav1, wav2, mergeWav):  # generates a brand new wav file
    """Mix two wav files sample-by-sample into *mergeWav*.

    Amplitudes are summed and clamped to the signed 16-bit range; the output
    length is the shorter of the two inputs.

    BUG FIX (Python 3): the original contained a Python 2 `print data`
    statement, which is a syntax error on Python 3; it was also dead code,
    since values are clamped before it could ever trigger.
    """
    firstData = wavToData(wav1)[0]
    secondData = wavToData(wav2)[0]
    limit = 32767
    newData = []
    # zip() stops at the shorter input, replacing the original's two
    # near-identical length-comparison branches.
    for a, b in zip(firstData, secondData):
        total = a + b
        if total > limit:
            total = limit
        elif total < -limit:
            total = -limit
        newData.append(total)
    dataToWav(mergeWav, newData)
# Example driver: mix two audiocheck.net test tones, then dump the result to CSV.
mergeWavData('audiocheck.net_sin_1000Hz_-3dBFS_3s.wav','audiocheck.net_sin_4000Hz_-3dBFS_3s.wav','1000Hz+4000Hz.wav')
wavToCsv('1000Hz+4000Hz.wav','1000Hz+4000Hz.csv') # makes a csv out of the wav
#include tags (.wav/.csv)
import pyaudio
import wave
import time
import sys
import struct
import csv
# all files and stuffs need to contain their tags ('potatos.wav')
def wavToData(wavName):
    """Read a wav file and return (samples, num_frames, sample_rate).

    *samples* is a tuple of ints: unsigned bytes for 8-bit audio, signed
    shorts for 16-bit.  Channels are left interleaved as stored.
    """
    stream = wave.open(wavName)
    try:
        num_channels = stream.getnchannels()
        sample_rate = stream.getframerate()
        sample_width = stream.getsampwidth()
        num_frames = stream.getnframes()
        raw_data = stream.readframes(num_frames)  # returns byte data
    finally:
        stream.close()
    total_samples = num_frames * num_channels
    if sample_width == 1:
        fmt = "%iB" % total_samples  # read unsigned chars
    elif sample_width == 2:
        fmt = "%ih" % total_samples  # read signed 2 byte shorts
    else:
        raise ValueError("Only supports 8 and 16 bit audio formats.")
    # (the original recomputed total_samples a second time here — removed)
    integer_data = struct.unpack(fmt, raw_data)  # this is amplitude
    return integer_data, num_frames, sample_rate
def dataToWav(wavName, data):
    """Write *data* (iterable of signed 16-bit ints) as a mono 44.1 kHz wav.

    IMPORTANT: hard coded standard values, referenced from the audiocheck.net
    files.
    """
    stream = wave.open(wavName, "wb")
    try:
        stream.setnchannels(1)
        stream.setframerate(44100)
        stream.setsampwidth(2)
        numFrames = len(data)
        stream.setnframes(numFrames)
        fmt = "%ih" % numFrames
        stream.writeframes(struct.pack(fmt, *data))  # packs all samples
    finally:
        # BUG FIX: the original never closed the writer, so the wav header
        # was not guaranteed to be finalised on disk.
        stream.close()
def wavToCsv(wavName, csvName):
    """Dump a wav file into a single-row CSV: frame indices then amplitudes.

    BUG FIX (Python 3): csv.writer needs a text-mode file opened with
    newline=''; the original opened the file in 'wb', which raises TypeError.
    """
    with open(csvName, 'w', newline='') as csvfile:  # contain all data in 1 row
        # x = time (1 frame)
        # y = amplitude
        filewriter = csv.writer(csvfile, delimiter=',',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        # get data
        integer_data, num_frames, sample_rate = wavToData(wavName)
        # add headings and other needed data
        frameCount = [x + 1 for x in range(num_frames)]  # x
        fullData = [x for x in integer_data]             # y
        frameCount.insert(0, "Time (increments of %f)" % (1 / sample_rate))
        fullData.insert(0, "Frequency(dBfs)")
        fullData = frameCount + fullData
        filewriter.writerow(fullData)
def mergeWavData(wav1, wav2, mergeWav):  # generates a brand new wav file
    """Mix two wav files sample-by-sample into *mergeWav*.

    Amplitudes are summed and clamped to the signed 16-bit range; the output
    length is the shorter of the two inputs.

    BUG FIX (Python 3): the original contained a Python 2 `print data`
    statement, which is a syntax error on Python 3; it was also dead code,
    since values are clamped before it could ever trigger.
    """
    firstData = wavToData(wav1)[0]
    secondData = wavToData(wav2)[0]
    limit = 32767
    newData = []
    # zip() stops at the shorter input, replacing the original's two
    # near-identical length-comparison branches.
    for a, b in zip(firstData, secondData):
        total = a + b
        if total > limit:
            total = limit
        elif total < -limit:
            total = -limit
        newData.append(total)
    dataToWav(mergeWav, newData)
# Example driver: mix two audiocheck.net test tones, then dump the result to CSV.
mergeWavData('audiocheck.net_sin_1000Hz_-3dBFS_3s.wav','audiocheck.net_sin_4000Hz_-3dBFS_3s.wav','1000Hz+4000Hz.wav')
wavToCsv('1000Hz+4000Hz.wav','1000Hz+4000Hz.csv') # makes a csv out of the wav
#include tags (.wav/.csv)

File path artifacts when reading target path from a shortcut(.lnk) in python?

When testing the code from this answer, I do receive an output, however it seems to have some artifacts...
This is the code I tested:
import locale
import struct
def __readLink(path):
    """Extract the LocalBasePath (target path) from a Windows .lnk file.

    Returns '' when the file cannot be read or parsed.  NOTE(review): the
    fixed offsets used here do not fully follow the MS-SHLLINK spec, which
    is the root cause of the truncation artifacts the asker observed.
    """
    target = ''
    try:
        with open(path, 'rb') as stream:
            content = stream.read()
        # skip first 20 bytes (HeaderSize and LinkCLSID)
        # read the LinkFlags structure (4 bytes)
        lflags = struct.unpack('I', content[0x14:0x18])[0]
        position = 0x18
        # if the HasLinkTargetIDList bit is set then skip the stored IDList
        # structure and header
        if (lflags & 0x01) == 1:
            position = struct.unpack('H', content[0x4C:0x4E])[0] + 0x4E
        last_pos = position
        position += 0x04
        # get how long the file information is (LinkInfoSize)
        length = struct.unpack('I', content[last_pos:position])[0]
        # skip 12 bytes (LinkInfoHeaderSize, LinkInfoFlags, and VolumeIDOffset)
        position += 0x0C
        # go to the LocalBasePath position
        lbpos = struct.unpack('I', content[position:position + 0x04])[0]
        position = last_pos + lbpos
        # read the string at the given position of the determined length;
        # decoding byte-per-byte as latin-1 matches the original chr(ord(..)) join
        size = (length + last_pos) - position - 0x02
        target = content[position:position + size].decode('latin-1')
    except (OSError, struct.error, UnicodeDecodeError):
        # BUG FIX: the bare `except:` silently swallowed *every* error,
        # including programming mistakes; catch only read/parse failures.
        pass
    return target
# Example usage: print the resolved target of a local shortcut file.
print(__readLink('test.lnk'))
The output is linked below, as for some reason, it does not copy completely.
Another problem I see is that it does not output the full file extension? It should be an .mp4, but it terminates at ".mp"
Image of output
The Code from the Question does not follow the MS-SHLLINK specification,
there are too many fixed offset values.
MS-SHLLINK: Shell Link (.LNK) Binary File Format
Specifies the Shell Link Binary File Format, which contains information that can be used to access another data object. The Shell Link Binary File Format is the format of Windows files with the extension "LNK".
The following code uses only LinkFlags 0x14 and LinkTargetIDList 0x4c as fixed Offsets.
def LocalBasePath(path):
    """Return the LocalBasePath string stored in a .lnk file, or '' on error.

    Only LinkFlags (0x14) and LinkTargetIDList (0x4c) are treated as fixed
    offsets; everything else is derived per MS-SHLLINK.
    """
    result = ''
    try:
        with open(path, 'rb') as handle:
            raw = handle.read()

        def read_uint(offset, width):
            # Little helper: unpack an unsigned 2- or 4-byte field.
            fmt = 'H' if width == 2 else 'I'
            return struct.unpack(fmt, raw[offset:offset + width])[0]

        flags = read_uint(0x14, 4)
        pos = 0x4c
        if (flags & 0x01) == 1:
            # Skip the LinkTargetIDList: its size field, then 2-byte TerminalID.
            pos += read_uint(pos, 2) + 0x02
        info_start = pos
        info_size = read_uint(info_start, 4)
        # LocalBasePathOffset sits 4 * 4 bytes into the LinkInfo structure.
        base_abs = info_start + read_uint(info_start + (4 * 4), 4)
        # String runs to the end of LinkInfo, minus a 2-byte terminator.
        str_len = ((info_start + info_size) - base_abs) - 2
        result = raw[base_abs:base_abs + str_len].decode('latin-1')
    except Exception as exp:
        print(exp)
    return result
Tested with Python: 3.4.2

Python 3 - Can pickle handle byte objects larger than 4GB?

Based on this comment and the referenced documentation, Pickle 4.0+ from Python 3.4+ should be able to pickle byte objects larger than 4 GB.
However, using python 3.4.3 or python 3.5.0b2 on Mac OS X 10.10.4, I get an error when I try to pickle a large byte array:
>>> import pickle
>>> x = bytearray(8 * 1000 * 1000 * 1000)
>>> fp = open("x.dat", "wb")
>>> pickle.dump(x, fp, protocol = 4)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
OSError: [Errno 22] Invalid argument
Is there a bug in my code or am I misunderstanding the documentation?
Here is a simple workaround for issue 24658. Use pickle.loads or pickle.dumps and break the bytes object into chunks of size 2**31 - 1 to get it in or out of the file.
import pickle
import os.path

# Workaround for CPython issue 24658: serialise with pickle.dumps() and then
# write/read the byte string in chunks of at most 2**31 - 1 bytes.
# NOTE(review): bytearray(2**31) allocates a 2 GiB test payload — running this
# verbatim needs that much free RAM plus disk space.
file_path = "pkl.pkl"
n_bytes = 2**31
max_bytes = 2**31 - 1   # largest single write macOS accepts
data = bytearray(n_bytes)

## write
bytes_out = pickle.dumps(data)
with open(file_path, 'wb') as f_out:
    for idx in range(0, len(bytes_out), max_bytes):
        f_out.write(bytes_out[idx:idx+max_bytes])

## read
bytes_in = bytearray(0)
input_size = os.path.getsize(file_path)
with open(file_path, 'rb') as f_in:
    for _ in range(0, input_size, max_bytes):
        bytes_in += f_in.read(max_bytes)
data2 = pickle.loads(bytes_in)
assert(data == data2)
To sum up what was answered in the comments:
Yes, Python can pickle byte objects bigger than 4GB. The observed error is caused by a bug in the implementation (see Issue24658).
Here is the full workaround, though it seems pickle.load no longer tries to dump a huge file anymore (I am on Python 3.5.2) so strictly speaking only the pickle.dumps needs this to work properly.
import pickle
class MacOSFile(object):
    """Wrap a file object, chunking reads and writes below 2 GiB to work
    around the macOS fread/fwrite limitation (CPython issue 24658).

    All other attributes are delegated to the wrapped file object.
    """

    def __init__(self, f):
        self.f = f

    def __getattr__(self, item):
        return getattr(self.f, item)

    def read(self, n):
        if n >= (1 << 31):
            buffer = bytearray(n)
            idx = 0
            while idx < n:
                # BUG FIX: `1 << 31 - 1` parses as 1 << 30 because `-` binds
                # tighter than `<<`; the intended chunk size is 2**31 - 1.
                batch_size = min(n - idx, (1 << 31) - 1)
                buffer[idx:idx + batch_size] = self.f.read(batch_size)
                idx += batch_size
            return buffer
        return self.f.read(n)

    def write(self, buffer):
        n = len(buffer)
        print("writing total_bytes=%s..." % n, flush=True)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, (1 << 31) - 1)
            print("writing bytes [%s, %s)... " % (idx, idx + batch_size), end="", flush=True)
            self.f.write(buffer[idx:idx + batch_size])
            print("done.", flush=True)
            idx += batch_size
def pickle_dump(obj, file_path):
    """Pickle *obj* to *file_path*, chunking the writes via MacOSFile."""
    with open(file_path, "wb") as handle:
        return pickle.dump(obj, MacOSFile(handle), protocol=pickle.HIGHEST_PROTOCOL)
def pickle_load(file_path):
    """Unpickle and return the object stored at *file_path*, chunking reads."""
    with open(file_path, "rb") as handle:
        return pickle.load(MacOSFile(handle))
You can specify the protocol for the dump.
If you do pickle.dump(obj,file,protocol=4) it should work.
Reading a file by 2GB chunks takes twice as much memory as needed if bytes concatenation is performed, my approach to loading pickles is based on bytearray:
class MacOSFile(object):
    """Read-only wrapper that chunks large reads into a pre-sized bytearray,
    avoiding both the macOS 2 GiB read limit and the doubled memory cost of
    concatenating bytes.  Other attributes delegate to the wrapped file.
    """

    def __init__(self, f):
        self.f = f

    def __getattr__(self, item):
        return getattr(self.f, item)

    def read(self, n):
        if n >= (1 << 31):
            buffer = bytearray(n)
            pos = 0
            while pos < n:
                # BUG FIX: `1 << 31 - 1` parses as 1 << 30 because `-` binds
                # tighter than `<<`; the intended chunk size is 2**31 - 1.
                size = min(n - pos, (1 << 31) - 1)
                chunk = self.f.read(size)
                buffer[pos:pos + size] = chunk
                pos += size
            return buffer
        return self.f.read(n)
Usage:
with open("/path", "rb") as fin:
obj = pickle.load(MacOSFile(fin))
Had the same issue and fixed it by upgrading to Python 3.6.8.
This seems to be the PR that did it: https://github.com/python/cpython/pull/9937
I also found this issue, to solve this problem i chunk the code into several iteration. Let say in this case i have 50.000 data which i have to calc tf-idf and do knn classfication. When i run and directly iterate 50.000 it give me "that error". So, to solve this problem i chunk it.
# NOTE(review): this snippet is clearly an excerpt from inside a class method
# (it uses `self`) and references `documents`, which is not defined here — it
# will not run standalone as pasted.
tokenized_documents = self.load_tokenized_preprocessing_documents()
idf = self.load_idf_41227()
doc_length = len(documents)
for iteration in range(0, 9):
    tfidf_documents = []
    # presumably range(iteration, 4000) was meant to select this iteration's
    # window of documents; as written every pass rescans from `iteration`
    # up to 4000 — TODO confirm against the author's intent
    for index in range(iteration, 4000):
        doc_tfidf = []
        for term in idf.keys():
            tf = self.term_frequency(term, tokenized_documents[index])
            doc_tfidf.append(tf * idf[term])
        doc = documents[index]
        tfidf = [doc_tfidf, doc[0], doc[1]]
        tfidf_documents.append(tfidf)
        print("{} from {} document {}".format(index, doc_length, doc[0]))
    # persist this chunk before starting the next one
    self.save_tfidf_41227(tfidf_documents, iteration)

compute crc of file in python

I want to calculate the CRC of file and get output like: E45A12AC. Here's my code:
#!/usr/bin/env python
import os, sys
import zlib
def crc(fileName):
    """Return the CRC-32 of a file as an uppercase hex string (e.g. 'E45A12AC').

    BUG FIX: the original discarded every intermediate zlib.crc32() result
    and returned None.  Feed each line's running value back in, and mask to
    32 bits so the result is never negative.
    """
    value = 0
    with open(fileName, "rb") as fd:
        for eachLine in fd:
            value = zlib.crc32(eachLine, value)
    return "%X" % (value & 0xFFFFFFFF)
# Run crc() over every file named on the command line.
for eachFile in sys.argv[1:]:
    crc(eachFile)
This calculates the CRC for each line, but its output (e.g. -1767935985) is not what I want.
Hashlib works the way I want, but it computes the md5:
# hashlib-style md5 example (Python 2 print syntax) whose interface the
# asker wants to mimic for CRC-32.
import hashlib
m = hashlib.md5()
for line in open('data.txt', 'rb'):
    m.update(line)
print m.hexdigest()
Is it possible to get something similar using zlib.crc32?
A little more compact and optimized code
def crc(fileName):
    """Return the file's CRC-32 as an (unpadded) uppercase hex string."""
    prev = 0
    # BUG FIX: the original iterated over an open() call it never closed;
    # the context manager guarantees the handle is released.
    with open(fileName, "rb") as fh:
        for eachLine in fh:
            prev = zlib.crc32(eachLine, prev)
    return "%X" % (prev & 0xFFFFFFFF)
PS2: The old PS was deprecated — and therefore deleted — because of the suggestion in the comment. Thank you. I don't get how I missed this, but it was really good.
A modified version of kobor42's answer, with performance improved by a factor 2-3 by reading fixed size chunks instead of "lines":
import zlib
def crc32(fileName):
    """Return the CRC-32 of *fileName* as eight uppercase hex digits."""
    total = 0
    with open(fileName, 'rb') as fh:
        # Read fixed 64 KiB chunks until EOF; iter() with a sentinel
        # replaces the explicit while/break loop.
        for chunk in iter(lambda: fh.read(65536), b''):
            total = zlib.crc32(chunk, total)
    return "%08X" % (total & 0xFFFFFFFF)
Also includes leading zeroes in the returned string.
hashlib-compatible interface for CRC-32 support:
import zlib
class crc32(object):
    """hashlib-compatible wrapper around zlib.crc32.

    Note: unlike real hashlib objects, digest() returns the checksum as an
    int rather than bytes.
    """
    name = 'crc32'
    digest_size = 4
    block_size = 1

    def __init__(self, arg=b''):
        # BUG FIX: the default was the str '', which zlib.crc32 rejects on
        # Python 3; an empty bytes default keeps the no-argument form working.
        self.__digest = 0
        self.update(arg)

    def copy(self):
        # Clone without re-running __init__.
        copy = super(self.__class__, self).__new__(self.__class__)
        copy.__digest = self.__digest
        return copy

    def digest(self):
        return self.__digest

    def hexdigest(self):
        return '{:08x}'.format(self.__digest)

    def update(self, arg):
        self.__digest = zlib.crc32(arg, self.__digest) & 0xffffffff
# Now you can define hashlib.crc32 = crc32
import hashlib
hashlib.crc32 = crc32   # monkey-patch so callers can write hashlib.crc32(...)
# Python > 2.7: hashlib.algorithms += ('crc32',)
# Python > 3.2: hashlib.algorithms_available.add('crc32')
To show any integer's lowest 32 bits as 8 hexadecimal digits, without sign, you can "mask" the value by bit-and'ing it with a mask made of 32 bits all at value 1, then apply formatting. I.e.:
>>> x = -1767935985
>>> format(x & 0xFFFFFFFF, '08x')
'969f700f'
It's quite irrelevant whether the integer you are thus formatting comes from zlib.crc32 or any other computation whatsoever.
Python 3.8+ (using the walrus operator):
import zlib
def crc32(filename, chunksize=65536):
    """Compute the CRC-32 checksum of the contents of the given filename."""
    checksum = 0
    with open(filename, "rb") as f:
        # iter() with a b"" sentinel replaces the walrus-operator loop.
        for chunk in iter(lambda: f.read(chunksize), b""):
            checksum = zlib.crc32(chunk, checksum)
    return checksum
chunksize is how many bytes to read from the file at a time. You will get the same CRC for the same file no matter what you set chunksize to (it has to be > 0), but setting it too low might make your code slow, too high might use too much memory.
The result is a 32 bit integer. The CRC-32 checksum of an empty file is 0.
Edited to include Altren's solution below.
A modified and more compact version of CrouZ's answer, with a slightly improved performance, using a for loop and file buffering:
def forLoopCrc(fpath):
    """With for loop and buffer.

    CRC-32 of the file at fpath as 8 uppercase hex digits; the read
    count is derived from the file size so a for loop can be used
    instead of a read-until-EOF while loop.
    """
    crc_value = 0
    size = os.stat(fpath).st_size
    with open(fpath, 'rb', 65536) as ins:
        # One extra iteration covers the final partial chunk; a read at
        # EOF returns b'', which leaves the CRC unchanged.
        for _ in range(int(size / 65536) + 1):
            crc_value = zlib.crc32(ins.read(65536), crc_value)
    return '%08X' % (crc_value & 0xFFFFFFFF)
Results, in a 6700k, HDD:
(Note: Retested multiple times and it was consistently faster.)
Warming up the machine...
Finished.
Beginning tests...
File size: 90288KB
Test cycles: 500
With for loop and buffer.
Result 45.24728019630359
CrouZ solution
Result 45.433838356097894
kobor42 solution
Result 104.16215688703986
Altren solution
Result 101.7247863946586
Tested in Python 3.6.4 x64 using the script below:
import os, timeit, zlib, random, binascii
def forLoopCrc(fpath):
    """With for loop and buffer."""
    CHUNK = 65536
    digest = 0
    with open(fpath, 'rb', CHUNK) as handle:
        # Number of reads needed to cover the whole file, plus one for
        # the trailing partial chunk (b'' at EOF is a no-op for crc32).
        reads = int(os.stat(fpath).st_size / CHUNK) + 1
        for _ in range(reads):
            digest = zlib.crc32(handle.read(CHUNK), digest)
    return '%08X' % (digest & 0xFFFFFFFF)
def crc32(fileName):
    """CrouZ solution

    Chunked CRC-32 of fileName's contents, returned as 8 uppercase
    hex digits.
    """
    result = 0
    with open(fileName, 'rb') as stream:
        # Priming read + loop-condition read instead of while True/break.
        block = stream.read(65536)
        while block:
            result = zlib.crc32(block, result)
            block = stream.read(65536)
    return "%08X" % (result & 0xFFFFFFFF)
def crc(fileName):
    """kobor42 solution

    Line-by-line CRC-32 of fileName, returned as unpadded uppercase
    hex ("%X", so no leading zeroes — kept for compatibility).
    Fixed: the original never closed the file handle; a with-statement
    guarantees it is closed even on error.
    """
    prev = 0
    with open(fileName, "rb") as fd:
        # Splitting by lines does not change the result: CRC over
        # concatenated chunks equals CRC over the whole stream.
        for eachLine in fd:
            prev = zlib.crc32(eachLine, prev)
    return "%X" % (prev & 0xFFFFFFFF)
def crc32altren(filename):
    """Altren solution

    Whole-file CRC-32 via binascii, as 8 uppercase hex digits.
    Fixed: the original's open(...).read() leaked the file handle
    until garbage collection; a with-statement closes it promptly.
    """
    with open(filename, 'rb') as f:
        buf = f.read()
    hash = binascii.crc32(buf) & 0xFFFFFFFF
    return "%08X" % hash
# Benchmark driver: times each CRC implementation over `count` runs
# against a fixed local test file (adjust fpath for your machine).
fpath = r'D:\test\test.dat'
tests = {forLoopCrc: 'With for loop and buffer.',
         crc32: 'CrouZ solution', crc: 'kobor42 solution',
         crc32altren: 'Altren solution'}
count = 500

# CPU / HDD warm-up: hammer one randomly chosen implementation first so
# OS disk caching does not favour whichever timed test runs first.
candidates = list(tests)
random.shuffle(candidates)
print('\nWarming up the machine...')
for _ in range(count):
    candidates[0](fpath)
print('Finished.\n')

# Timed runs, one implementation at a time.
print('Beginning tests...\nFile size: %dKB\nTest cycles: %d\n' % (
    os.stat(fpath).st_size / 1024, count))
for impl in tests:
    print(tests[impl])
    start_time = timeit.default_timer()
    for _ in range(count):
        impl(fpath)
    print('Result', timeit.default_timer() - start_time, '\n')
It is faster because for loops are faster than while loops (sources: here and here).
Merge the above 2 codes as below:
# NOTE(review): fragment of a larger function (it uses `return 4`, so it
# cannot run at module level).  It merges the line-based CRC loop with
# explicit open-error handling.  The accumulated `prev` is neither
# masked nor returned here — presumably the enclosing function does
# that; verify against the caller.
try:
    fd = open(decompressedFile,"rb")
except IOError:
    logging.error("Unable to open the file in readmode:" + decompressedFile)
    return 4
eachLine = fd.readline()
# Seed of 0 is equivalent to zlib.crc32's default starting value.
prev = 0
while eachLine:
    prev = zlib.crc32(eachLine, prev)
    eachLine = fd.readline()
fd.close()
There is faster and more compact way to compute CRC using binascii:
import binascii
def crc32(filename):
    """Return the CRC-32 of the file's contents as 8 uppercase hex digits.

    Reads the whole file at once (fine for small files; use a chunked
    reader for large ones).  Fixed: the original open(...).read()
    never closed the handle; the with-statement guarantees closure.
    """
    with open(filename, 'rb') as f:
        buf = f.read()
    hash = binascii.crc32(buf) & 0xFFFFFFFF
    return "%08X" % hash
You can use base64 to get output like [ERD45FTR], and zlib.crc32 supports incremental updates via its second argument.
import os, sys
import zlib
import base64
def crc(fileName):
    """Return the CRC-32 checksum of fileName's contents as an int.

    Fixes over the original:
    - streams line by line instead of readlines(), so the whole file
      is never held in memory;
    - seeds with 0 so an empty file yields 0 instead of None (None
      then broke the str()/base64 step in the caller).  Note that
      zlib.crc32(data) == zlib.crc32(data, 0), so the original's
      separate "first line" branch was unnecessary anyway.
    """
    prev = 0
    with open(fileName, "rb") as fd:
        for eachLine in fd:
            prev = zlib.crc32(eachLine, prev)
    return prev
# Print the base64 of each file's decimal CRC for every path on argv.
# Fixed for Python 3: print is a function, and b64encode requires
# bytes, so the decimal string is encoded/decoded explicitly.
for eachFile in sys.argv[1:]:
    print(base64.b64encode(str(crc(eachFile)).encode("ascii")).decode("ascii"))
solution:
import os, sys
import zlib
def crc(fileName, excludeLine="", includeLine=""):
    """CRC-32 of fileName as 8 lowercase hex digits, or None on open error.

    Lines starting with excludeLine are skipped; since the file is read
    in binary mode, excludeLine must be bytes when supplied.
    (includeLine is accepted for interface compatibility but unused,
    as in the original.)

    Fixes over the original:
    - `continue` without advancing the read caused an infinite loop
      whenever a line matched excludeLine; iterating the file directly
      always advances it;
    - the except branch referenced an undefined name `filename` and
      used a Python 2 print statement;
    - the file handle is now closed via a with-statement;
    - seeding with 0 means an empty file returns '00000000' instead of
      crashing on `None & 0xFFFFFFFF` (zlib.crc32's default seed is 0,
      so the original's "first line" branch was redundant).
    """
    try:
        fd = open(fileName, "rb")
    except IOError:
        print("Unable to open the file in readmode:", fileName)
        return
    prev = 0
    with fd:
        for eachLine in fd:
            if excludeLine and eachLine.startswith(excludeLine):
                continue
            prev = zlib.crc32(eachLine, prev)
    return format(prev & 0xFFFFFFFF, '08x')  # returns 8 digits crc
# Print the hex CRC of every file path given on the command line.
# Fixed for Python 3: print statement -> print() function.
for eachFile in sys.argv[1:]:
    print(crc(eachFile))
I don't really know what the (excludeLine="", includeLine="") parameters are for...

Categories