Memory optimization while downloading and saving data to S3 using Python

filename = "test.zip"
url = "some url"
data = requests.get(url, stream=True)
f = BytesIO()
f.write(data.content)
try:
    s3_r = boto3.resource('s3')
    s3_r.Object(bucket, filename).put(Body=f.getvalue())
    return filename
except:
    print('failed')
Note: the zip file is greater than 1 GB.
My code is working fine, but it's using a lot of memory; I need to optimize it so that memory usage stays below 200 MB.
I found the code below for downloading in chunks:
def download_file(url):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                # If you have a chunk-encoded response, uncomment the if
                # and set the chunk_size parameter to None.
                # if chunk:
                f.write(chunk)
    return local_filename
but I am not able to save the f.write(chunk) output to an S3 bucket.
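One way to keep memory bounded (a sketch, not from this thread; the function name and part sizes are my own choices) is to hand the streaming response straight to boto3's upload_fileobj, which performs a multipart upload in fixed-size parts so only a few parts are ever held in memory:

import boto3
import requests
from boto3.s3.transfer import TransferConfig

def stream_url_to_s3(url, bucket, key):
    # 16 MB parts, at most 4 uploading concurrently: peak buffering stays
    # around 64 MB, comfortably under the 200 MB budget.
    config = TransferConfig(multipart_threshold=16 * 1024 * 1024,
                            multipart_chunksize=16 * 1024 * 1024,
                            max_concurrency=4)
    s3 = boto3.client('s3')
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        r.raw.decode_content = True  # transparently undo gzip/deflate if present
        # r.raw is a file-like object; upload_fileobj() reads it part by part
        # and never buffers the whole body.
        s3.upload_fileobj(r.raw, bucket, key, Config=config)
    return key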


Requests + Tqdm to a variable

I got this function from this question (Ctrl+F for "There is an answer with requests and tqdm"):
import requests
from tqdm import tqdm

def download(url: str, fname: str):
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get('content-length', 0))
    with open(fname, 'wb') as file, tqdm(
        desc=fname,
        total=total,
        unit='b',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in resp.iter_content(chunk_size=1024):
            size = file.write(data)
            bar.update(size)
Basically it downloads a file and writes it to disk, and I wanted it to return a variable that represents the downloaded file, so I made this:
def download(url: str, fname: str):
    import requests
    from tqdm import tqdm
    import os
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get('content-length', 0))
    with open(fname, 'wb') as file, tqdm(
        desc=fname,
        total=total,
        unit='b',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in resp.iter_content(chunk_size=1024):
            size = file.write(data)
            bar.update(size)
    with open(fname, "rb") as f:
        returned = f.read()
    os.remove(fname)
    return returned
Now it saves the file, reads it back into a variable, deletes the file, and returns the variable.
Is there a way I can save it directly to a variable?
Well, you could just turn the function into a generator with a tqdm bar and do whatever you like with the yielded chunks:
import requests
import tqdm
import io

def download(url: str):
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get('content-length', 0))
    with tqdm.tqdm(
        desc=url,
        total=total,
        unit='b',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for chunk in resp.iter_content(chunk_size=65536):
            bar.update(len(chunk))
            yield chunk

bio = io.BytesIO()
for chunk in download('http://...'):
    # Do something with the chunk; this just stores it in memory.
    bio.write(chunk)
content = bio.getvalue()  # Get the contents of the BytesIO() as bytes.
Of course you can then refactor this to
import requests
import tqdm
import io

def download_as_bytes_with_progress(url: str) -> bytes:
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get('content-length', 0))
    bio = io.BytesIO()
    with tqdm.tqdm(
        desc=url,
        total=total,
        unit='b',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for chunk in resp.iter_content(chunk_size=65536):
            bar.update(len(chunk))
            bio.write(chunk)
    return bio.getvalue()
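Hypothetical usage, with the URL elided as in the examples above; note that the entire body ends up in memory as a single bytes object:

payload = download_as_bytes_with_progress('http://...')
print(len(payload))  # size of the downloaded file in bytes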

Response binary data to download

I am trying to take a file from an input field, save it temporarily to disk, and reply with a response that lets the user re-download the same file.
In order to do this, I've read that I need to reply to the browser with a content-type : application/octet-stream and a content-disposition: attachment; "filename=myfile.extension".
I can store and listen to my music file in the /tmp folder so I know that the input part of it works.
This is my code in Pyramid:
@view_config(route_name='process')
def process_file(request):
    input_file = request.POST['file'].file
    input_file.seek(0)
    file_path = os.path.join('/tmp', '%s.mp3' % uuid.uuid4())
    with open(file_path, 'wb') as output_file:
        shutil.copyfileobj(input_file, output_file)
    print(f"Wrote: {file_path}")
    filename = file_path.split('/')[-1]
    print(filename)
    f = open(file_path, 'rb')
    return Response(body_file=f, charset='UTF-8', content_type='application/octet-stream',
                    content_disposition=f'attachment; "filename={filename}"')
These were my response headers and response body (screenshots omitted).
However Chrome/Firefox do not start the download of my binary file. What am I doing wrong?
UPDATE
I also tried Pyramid's FileResponse without success; I still do not get the download popup.
@view_config(route_name='process')
def process_file(request):
    input_file = request.POST['file'].file
    input_file.seek(0)
    file_path = os.path.join('/tmp', '%s.mp3' % uuid.uuid4())
    with open(file_path, 'wb') as output_file:
        shutil.copyfileobj(input_file, output_file)
    print(f"Wrote: {file_path}")
    return FileResponse(file_path, request=request)
Apparently I was approaching this the wrong way. I needed to return a plain Response when the file is uploaded through /process and make a separate request for the file itself: building another endpoint, /download, and returning the file response from there fixed the issue.
Example:
@view_config(route_name='process')
def process_file(request):
    input_file = request.POST['file'].file
    db = request.POST['volume']
    input_file.seek(0)
    filename = '%s.mp3' % uuid.uuid4()
    file_path = os.path.join('/tmp', filename)
    with open(file_path, 'wb') as output_file:
        shutil.copyfileobj(input_file, output_file)
    if boost_track(file_path, filename, db):
        return Response(json_body={'filename': filename})

@view_config(route_name='download')
def download_file(request):
    filename = request.GET['filename']
    file_path = os.path.join('/tmp', filename)
    f = open(file_path, 'rb')
    return Response(body_file=f, charset='UTF-8', content_type='application/download',
                    content_disposition=f'attachment; filename="{filename}"')
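For reference, a sketch of the same /download view built on FileResponse instead (assuming the same /tmp layout as above); the key point is that the quotes belong around the filename value only, not around the whole filename=... pair:

import os
from pyramid.response import FileResponse
from pyramid.view import view_config

@view_config(route_name='download')
def download_file(request):
    filename = request.GET['filename']  # in real code, validate this to prevent path traversal
    file_path = os.path.join('/tmp', filename)
    response = FileResponse(file_path, request=request,
                            content_type='application/octet-stream')
    # Quote only the value: attachment; filename="foo.mp3"
    response.headers['Content-Disposition'] = f'attachment; filename="{filename}"'
    return response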

How can I directly fetch the data from URL to Azure Blob

I'm using requests.get, but it downloads the file to the machine where I executed the script. Is there a way to skip saving the file locally?
r = requests.get(url, stream=True)
file_name = url.split("/")[-1]
with open(file_name, 'wb') as data:
    for chunk in r.iter_content(chunk_size=1024 * 1024):
        if chunk:
            data.write(chunk)
block_blob_service.create_blob_from_path(
    path.join(container, blob),
    data.name,
    file_name,
    content_settings=ContentSettings(content_type=mimetypes.guess_type('./%s' % url.split("/")[-1])[0]))
Try to use the code below.
r = requests.get(url, stream=True)
block_blob_service.create_blob_from_stream(container_name, blob_name, io.BytesIO(r.content))
Or
r = requests.get(url, stream=True)
block_blob_service.create_blob_from_bytes(container_name, blob_name, r.content)
Hope it helps.
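Note that both snippets above read r.content, which still buffers the entire body in memory despite stream=True. A sketch that streams the raw response instead (assuming the same legacy azure-storage-blob BlockBlobService API used above):

import requests

r = requests.get(url, stream=True)
r.raise_for_status()
r.raw.decode_content = True  # transparently undo gzip/deflate if present
# create_blob_from_stream() accepts any file-like object with read(), so the
# body is uploaded in blocks rather than held in memory as one bytes object.
# max_connections=1 is needed because r.raw is not seekable.
block_blob_service.create_blob_from_stream(container_name, blob_name, r.raw,
                                           max_connections=1)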

How to use `tqdm` in python to show progress when downloading data online?

I can find some documentation explaining how to use the tqdm package, but I can't figure out from it how to produce a progress meter when downloading data online.
Below is example code I copied from ResidentMario for downloading data:
def download_file(url, filename):
    """
    Helper method handling downloading large files from `url` to `filename`.
    Returns a pointer to `filename`.
    """
    r = requests.get(url, stream=True)
    with open(filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    return filename

dat = download_file("https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD",
                    "NYPD Motor Vehicle Collisions.csv")
Could anyone show me how to use the tqdm package here to show download progress?
Thanks
As of now I do something like this:
def download_file(url, filename):
    """
    Helper method handling downloading large files from `url` to `filename`.
    Returns a pointer to `filename`.
    """
    chunkSize = 1024
    r = requests.get(url, stream=True)
    with open(filename, 'wb') as f:
        pbar = tqdm(unit="B", total=int(r.headers['Content-Length']))
        for chunk in r.iter_content(chunk_size=chunkSize):
            if chunk:  # filter out keep-alive new chunks
                pbar.update(len(chunk))
                f.write(chunk)
    return filename
You can also use pbar.clear() and pbar.close(). As the tqdm docs put it: "Manually update the progress bar, useful for streams such as reading files." (https://github.com/tqdm/tqdm#returns)
def download_file(url, filename):
    """
    Helper method handling downloading large files from `url` to `filename`.
    Returns a pointer to `filename`.
    """
    r = requests.get(url, stream=True)
    with open(filename, 'wb') as f:
        pbar = tqdm(unit="B", unit_scale=True, unit_divisor=1024,
                    total=int(r.headers['Content-Length']))
        pbar.clear()  # clear 0% info
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                pbar.update(len(chunk))
                f.write(chunk)
        pbar.close()
    return filename
Thanks to silmaril, but the version below works and makes more sense to me.
def download_file(url, filename):
    r = requests.get(url, stream=True)
    filelength = int(r.headers['Content-Length'])
    with open(filename, 'wb') as f:
        pbar = tqdm(total=int(filelength / 1024))
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                pbar.update()
                f.write(chunk)
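For completeness, a more compact variant (a sketch, assuming tqdm >= 4.40, which added tqdm.wrapattr): wrapping the file's write() method lets the bar advance by the number of bytes written without an explicit update() call:

import requests
from tqdm import tqdm

def download_file(url, filename):
    r = requests.get(url, stream=True)
    # A missing Content-Length yields total=None, i.e. an indeterminate bar.
    total = int(r.headers.get('Content-Length', 0)) or None
    # wrapattr() intercepts every f.write() and advances the bar accordingly,
    # closing the file when the context exits.
    with tqdm.wrapattr(open(filename, 'wb'), 'write', total=total,
                       unit='B', unit_scale=True, unit_divisor=1024) as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
    return filename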

Python: How to download a zip file

I'm attempting to download a zip file using this code:
o = urllib2.build_opener(urllib2.HTTPCookieProcessor())

# login
p = urllib.urlencode({usernameField: usernameVal, passField: passVal})
f = o.open(authUrl, p)
data = f.read()
print data
f.close()

# download file
f = o.open(remoteFileUrl)
localFile = open(localFile, "wb")
localFile.write(f.read())
f.close()
I am getting some binary data, but the size of the file I "downloaded" is too small and is not a valid zip file. Am I not retrieving the zip file properly? The HTTP response header for f = o.open(remoteFileUrl) is shown below. I don't know if special processing is needed to handle this response:
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
Pragma: private
Cache-Control: must-revalidate
Expires: Tue, 31 Dec 1997 23:59:59 GMT
Content-Disposition: inline; filename="files.zip";
Content-Type: application/zip
Transfer-Encoding: chunked
f.read() doesn't necessarily read the whole file, but just a packet of it (which might be the whole file if it's small, but won't be for a large file).
You need to loop over the packets like this:
while 1:
    packet = f.read()
    if not packet:
        break
    localFile.write(packet)
f.close()
f.read() returns an empty packet to signify that you've read the whole file.
If you don't mind reading the whole zip file into memory, the fastest way to read and write it is as follows:
data = f.readlines()
with open(localFile, 'wb') as output:
    output.writelines(data)
Otherwise, to read and write in chunks as you get them over the network, do
with open(localFile, "wb") as output:
    chunk = f.read()
    while chunk:
        output.write(chunk)
        chunk = f.read()
This is a little less neat, but avoids keeping the whole file in memory at once. Hope it helps.
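Incidentally, the standard library ships the same read/write loop as shutil.copyfileobj, which copies in fixed-size chunks (16 KiB by default); a one-line equivalent under the same setup:

import shutil

with open(localFile, "wb") as output:
    shutil.copyfileobj(f, output)  # loops read()/write() with a fixed-size buffer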
Here is a more robust solution using urllib2 to download the file in chunks and print the status of the download
import os
import urllib2
import math

def downloadChunks(url):
    """Helper to download large files
        the only arg is a url
        this file will go to a temp directory
        the file will also be downloaded
        in chunks and print out how much remains
    """
    baseFile = os.path.basename(url)

    # move the file to a more uniq path
    os.umask(0002)
    temp_path = "/tmp/"
    try:
        file = os.path.join(temp_path, baseFile)
        req = urllib2.urlopen(url)
        total_size = int(req.info().getheader('Content-Length').strip())
        downloaded = 0
        CHUNK = 256 * 10240
        with open(file, 'wb') as fp:
            while True:
                chunk = req.read(CHUNK)
                downloaded += len(chunk)
                # float() avoids Python 2 integer division, which would print 0 until the end
                print math.floor((downloaded / float(total_size)) * 100)
                if not chunk:
                    break
                fp.write(chunk)
    except urllib2.HTTPError, e:
        print "HTTP Error:", e.code, url
        return False
    except urllib2.URLError, e:
        print "URL Error:", e.reason, url
        return False
    return file
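Hypothetical usage (the URL is a placeholder):

local_path = downloadChunks("http://example.com/files.zip")
if local_path:
    print "Downloaded to", local_path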
Try this:
# download file
f = o.open(remoteFileUrl)
response = ""
while 1:
    data = f.read()
    if not data:
        break
    response += data
with open(localFile, "wb") as local_file:
    local_file.write(response)
