Memory error while downloading large Gzip files and decompressing them - python

I am trying to download a dataset from https://datasets.imdbws.com/title.principals.tsv.gz, decompress the contents in my code itself (Python), and write the resulting file(s) to disk.
To do so I am using the following code snippet.
results = requests.get(config[sourceFiles]['url'])
with open(config[sourceFiles]['downloadLocation']+config[sourceFiles]['downloadFileName'], 'wb') as f_out:
    print(config[sourceFiles]['downloadFileName'] + " starting download")
    f_out.write(gzip.decompress(results.content))
    print(config[sourceFiles]['downloadFileName'] + " downloaded successfully")
This code works fine for most gzip files, but for larger files it fails with the following error:
File "C:\Users\****\AppData\Local\Programs\Python\Python37-32\lib\gzip.py", line 532, in decompress
return f.read()
File "C:\Users\****\AppData\Local\Programs\Python\Python37-32\lib\gzip.py", line 276, in read
return self._buffer.read(size)
File "C:\Users\****\AppData\Local\Programs\Python\Python37-32\lib\gzip.py", line 471, in read
uncompress = self._decompressor.decompress(buf, size)
MemoryError
Is there a way to accomplish this without having to download the compressed file onto disk first and then decompress it to get the actual data?

You can use a streaming request coupled with zlib:
import zlib
import requests

url = 'https://datasets.imdbws.com/title.principals.tsv.gz'
result = requests.get(url, stream=True)

f_out = open("result.txt", "wb")
chunk_size = 1024 * 1024
d = zlib.decompressobj(zlib.MAX_WBITS | 32)

for chunk in result.iter_content(chunk_size):
    buffer = d.decompress(chunk)
    f_out.write(buffer)

buffer = d.flush()
f_out.write(buffer)
f_out.close()
This snippet reads the data chunk by chunk and feeds it to zlib, which can handle data streams.
Depending on your connection speed and CPU/disk performance, you can experiment with different chunk sizes.
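If you would rather not manage the decompressor by hand, a roughly equivalent sketch (untested, and not from the original answer) wraps the raw response stream in gzip.GzipFile and lets shutil copy fixed-size chunks to disk:
import gzip
import shutil
import requests

url = 'https://datasets.imdbws.com/title.principals.tsv.gz'

with requests.get(url, stream=True) as r:
    r.raise_for_status()
    # r.raw is a file-like view of the compressed body; GzipFile decompresses
    # it lazily while copyfileobj moves 1 MB chunks at a time to disk.
    with gzip.GzipFile(fileobj=r.raw) as gz_in, open('title.principals.tsv', 'wb') as f_out:
        shutil.copyfileobj(gz_in, f_out, length=1024 * 1024)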

Related

Stream huge zip files on S3 using Lambda and boto3

I have a bunch of CSV files compressed as one zip on S3. I only need to process one CSV file inside the zip using an AWS Lambda function.
import boto3
from io import BytesIO
from zipfile import ZipFile

BUCKET = 'my-bucket'
s3_rsc = boto3.resource('s3')

def zip_stream(zip_f='app.zip', bkt=BUCKET, rsc=s3_rsc):
    obj = rsc.Object(
        bucket_name=bkt,
        key=zip_f
    )
    return ZipFile(BytesIO(obj.get()['Body'].read()))

zip_obj = zip_stream()
csv_dat = zip_obj.read('one.csv')
The above snippet works well with test zip files; however, it fails with a MemoryError if the zip file size exceeds 0.5 GB.
Error Message
{ "errorMessage": "", "errorType": "MemoryError", "stackTrace":
[
" File "/var/task/lambda_function.py", line 12, in handler\n all_files = files_in_zip()\n",
" File "/var/task/lambda_function.py", line 36, in files_in_zip\n zippo = zip_stream()\n",
" File "/var/task/lambda_function.py", line 32, in zip_stream\n return ZipFile(BytesIO(obj.get()['Body'].read()))\n",
" File "/var/runtime/botocore/response.py", line 77, in read\n chunk = self._raw_stream.read(amt)\n",
" File "/var/runtime/urllib3/response.py", line 515, in read\n data = self._fp.read() if not fp_closed else b""\n",
" File "/var/lang/lib/python3.8/http/client.py", line 468, in read\n s = self._safe_read(self.length)\n",
" File "/var/lang/lib/python3.8/http/client.py", line 609, in _safe_read\n data = self.fp.read(amt)\n" ] }
Is there an option to stream/lazyload the zipfile to mitigate memory issues?
Note: I also referred to an old post (How can I use boto to stream a file out of Amazon S3 to Rackspace Cloudfiles?) which spoke about streaming a file, but not a zip.
Depending on your exact needs, you can use smart-open to handle the reading of the zip file. If you can fit the CSV data in RAM in your Lambda, it's fairly straightforward to call directly:
import csv
import zipfile
from io import TextIOWrapper, BytesIO
from smart_open import smart_open

def lambda_handler(event, context):
    # Simple test: just calculate the sum of the first column of a CSV file in a zip file
    total_sum, row_count = 0, 0
    # Use smart_open to handle the byte range requests for us
    with smart_open("s3://example-bucket/many_csvs.zip", "rb") as f:
        # Wrap that in a zip file handler
        zip = zipfile.ZipFile(f)
        # Open a specific CSV file in the zip file
        zf = zip.open("data_101.csv")
        # Read all of the data into memory, and prepare a text IO wrapper to read it row by row
        text = TextIOWrapper(BytesIO(zf.read()))
        # And finally, use Python's csv library to parse the csv format
        cr = csv.reader(text)
        # Skip the header row
        next(cr)
        # Just loop through each row and add the first column
        for row in cr:
            total_sum += int(row[0])
            row_count += 1
    # And output the results
    print(f"Sum {row_count} rows for col 0: {total_sum}")
I tested this with a 1 GB zip file containing hundreds of CSV files. The CSV file I picked was around 12 MB uncompressed, or 100,000 rows, so it fit nicely into RAM in the Lambda environment, even when limited to 128 MB of RAM.
If your CSV file can't be loaded all at once like this, you'll need to take care to load it in sections, buffering the reads so you don't waste time reading it line by line and forcing smart-open to fetch lots of tiny chunks, as sketched below.
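As a rough sketch of that chunked approach (untested; the bucket, key, and CSV member name are the same placeholders used above), you can wrap the zip member's file handle in a TextIOWrapper directly, so only the reader's internal buffer sits in memory rather than the whole member:
import csv
import zipfile
from io import TextIOWrapper
from smart_open import smart_open

def sum_first_column(uri="s3://example-bucket/many_csvs.zip", member="data_101.csv"):
    total, rows = 0, 0
    with smart_open(uri, "rb") as f:
        with zipfile.ZipFile(f) as archive:
            # archive.open() returns a file-like object that decompresses on demand,
            # so csv.reader pulls rows without loading the entire member into RAM.
            with TextIOWrapper(archive.open(member), encoding="utf-8") as text:
                reader = csv.reader(text)
                next(reader)  # skip the header row
                for row in reader:
                    total += int(row[0])
                    rows += 1
    return total, rows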

Python convert zip file to bytes stream

I have a zip file that looks great when I open it locally. I want to convert it to a bytes stream buffer and then return it as HttpResponse(buffer) using Django. The code is:
studies_zip = zipfile.ZipFile('./studies.zip', 'r')

buffer = io.BytesIO()
bytes = [zipfile.Path(studies_zip, at=file.filename).read_bytes()
         for file in studies_zip.infolist()]

buffer = io.BytesIO()
buffer_writer = io.BufferedWriter(buffer)
[buffer_writer.write(b) for b in bytes]
buffer.seek(0)

response = HttpResponse(buffer)
response['Content-Type'] = 'application/zip'
response['Content-Disposition'] = 'attachment;filename=studies.zip'
At the front-end/UI the download looks fine at first, i.e. the displayed size of 34.9 MB is a little less than the actual 36.6 MB. However, when I try to open the file, either on the spot or after saving it locally, it fails to open.
What's wrong?
You are sending the contents of the compressed files, omitting the metadata contained in the zip archive.
There is no reason to open the file as a zip file, since no changes are made to the contents; just open the file in binary mode and send it. I haven't tested this, but try this:
with open('./studies.zip', 'rb') as f:
    response = HttpResponse(f)
    response['Content-Type'] = 'application/zip'
    response['Content-Disposition'] = 'attachment;filename=studies.zip'
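If you are on a reasonably recent Django version (2.1 or later), FileResponse does much the same thing and sets the attachment headers for you; a minimal sketch, not taken from the answer above:
from django.http import FileResponse

def download_studies(request):
    # FileResponse streams the file and closes it when the response is finished;
    # as_attachment/filename produce the Content-Disposition header.
    return FileResponse(open('./studies.zip', 'rb'),
                        as_attachment=True,
                        filename='studies.zip')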

Stream a large file from URL straight into a gzip file

I want to stream a large file from a URL directly into a gzip file, instead of downloading it all into memory and then compressing. This is how far I have gotten (it does not work). I know how to download a file in Python and save it, and I know how to compress one; it is the streaming part that does not work.
Note: the linked CSV is not large; it is just an example URL.
import requests
import zlib

url = "http://samplecsvs.s3.amazonaws.com/Sacramentorealestatetransactions.csv"

with requests.get(url, stream=True) as r:
    compressor = zlib.compressobj()
    with open(save_file_path, 'wb') as f:
        f.write(compressor.compress(r.raw))
Alright I figured it out:
import shutil
import zlib
import requests

with requests.get(url, stream=True, verify=False) as r:
    if save_file_path.endswith('gz'):
        compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
        with open(save_file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                f.write(compressor.compress(chunk))
            f.write(compressor.flush())
    else:
        with open(save_file_path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
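A shorter variant of the same idea (a sketch, untested against the original endpoint) lets the gzip module own the compressor and copies the raw response stream into it chunk by chunk:
import gzip
import shutil
import requests

url = "http://samplecsvs.s3.amazonaws.com/Sacramentorealestatetransactions.csv"
save_file_path = "Sacramentorealestatetransactions.csv.gz"  # example output name

with requests.get(url, stream=True) as r:
    r.raise_for_status()
    # gzip.open writes a compressed stream to disk; copyfileobj moves the
    # response body across in 1 MB chunks instead of loading it all at once.
    with gzip.open(save_file_path, "wb", compresslevel=9) as gz_out:
        shutil.copyfileobj(r.raw, gz_out, length=1024 * 1024)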

Download a gzipped file, md5 checksum it, and then save extracted data if matches

I'm currently attempting to download two files using Python: a gzipped file and its checksum.
I would like to verify that the gzipped file's contents match the md5 checksum, and then I would like to save the contents to a target directory.
I found out how to download the files here, and I learned how to calculate the checksum here. I load the URLs from a JSON config file, and I learned how to parse JSON file values here.
I put it all together into the following script, but I'm stuck attempting to store the verified contents of the gzipped file.
import json
import gzip
import urllib
import hashlib

# Function for creating an md5 checksum of a file
def md5Gzip(fname):
    hash_md5 = hashlib.md5()

    with gzip.open(fname, 'rb') as f:
        # Make an iterable of the file and divide into 4096 byte chunks
        # The iteration ends when we hit an empty byte string (b"")
        for chunk in iter(lambda: f.read(4096), b""):
            # Update the MD5 hash with the chunk
            hash_md5.update(chunk)

    return hash_md5.hexdigest()

# Open the configuration file in the current directory
with open('./config.json') as configFile:
    data = json.load(configFile)

# Open the downloaded checksum file
with open(urllib.urlretrieve(data['checksumUrl'])[0]) as checksumFile:
    md5Checksum = checksumFile.read()

# Open the downloaded db file and get its md5 checksum via gzip.open
fileMd5 = md5Gzip(urllib.urlretrieve(data['fileUrl'])[0])

if (fileMd5 == md5Checksum):
    print 'Downloaded Correct File'
    # save correct file
else:
    print 'Downloaded Incorrect File'
    # do some error handling
In your md5Gzip, return a tuple instead of just the hash.
def md5Gzip(fname):
    hash_md5 = hashlib.md5()
    file_content = None

    with gzip.open(fname, 'rb') as f:
        # Make an iterable of the file and divide into 4096 byte chunks
        # The iteration ends when we hit an empty byte string (b"")
        for chunk in iter(lambda: f.read(4096), b""):
            # Update the MD5 hash with the chunk
            hash_md5.update(chunk)

        # get file content
        f.seek(0)
        file_content = f.read()

    return hash_md5.hexdigest(), file_content
Then, in your code:
fileMd5, file_content = md5Gzip(urllib.urlretrieve(data['fileUrl'])[0])
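From there, saving the verified contents is just a matter of writing the returned bytes to disk; a minimal sketch, assuming the config file also has a 'targetDir' entry (that key and the output filename are hypothetical):
import os

fileMd5, file_content = md5Gzip(urllib.urlretrieve(data['fileUrl'])[0])

if fileMd5 == md5Checksum:
    # 'targetDir' and 'extracted_data' are placeholders; adjust to your setup
    out_path = os.path.join(data['targetDir'], 'extracted_data')
    with open(out_path, 'wb') as out_file:
        out_file.write(file_content)
else:
    raise ValueError('Checksum mismatch: expected %s, got %s' % (md5Checksum, fileMd5))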

Efficiently read from REST endpoint and compress result in Python

I have a data export job that reads data from a REST endpoint and then saves the data in a temporary compressed file before it is written to S3. This was working for smaller payloads:
import gzip
import urllib2

# Fails when writing too much data at once
def get_data(url, params, fileobj):
    request = urllib2.urlopen(url, params)
    event_data = request.read()
    with gzip.open(fileobj.name, 'wb') as f:
        f.write(event_data)
However, as the data size increased I got an error that seems to indicate I'm writing too much data at once:
File "/usr/lib64/python2.7/gzip.py", line 241, in write
self.fileobj.write(self.compress.compress(data))
OverflowError: size does not fit in an int
I tried modifying the code to read from the REST endpoint line by line and write each line to the file, but this was incredibly slow, probably because the endpoint isn't set up to handle that.
# Incredibly slow
def get_data(url, params, fileobj):
    request = urllib2.urlopen(url, params)
    with gzip.open(fileobj.name, 'wb') as f:
        for line in request:
            f.write(line)
Is there a more efficient way to do this, such as by reading the entire payload at once, like in the first example, but then efficiently reading line-by-line from the data now residing in memory?
It turns out this is what StringIO is for. By turning my payload into a StringIO object, I was able to read from it line by line and write to a gzipped file without any errors.
import gzip
import urllib2
from StringIO import StringIO

def get_data(url, params, fileobj):
    request = urllib2.urlopen(url, params)
    event_data = StringIO(request.read())
    with gzip.open(fileobj.name, 'wb') as f:
        for line in event_data:
            f.write(line)
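If you can also afford to write in fixed-size chunks rather than line by line, shutil.copyfileobj accomplishes the same thing without buffering the whole payload; a rough sketch in the same Python 2 style (not from the original answer):
import gzip
import shutil
import urllib2

def get_data(url, params, fileobj, chunk_size=1024 * 1024):
    # Stream the response into the gzip file in 1 MB chunks, so no single
    # write is large enough to overflow and no per-line overhead is paid.
    request = urllib2.urlopen(url, params)
    with gzip.open(fileobj.name, 'wb') as f:
        shutil.copyfileobj(request, f, length=chunk_size)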
