Python: How to download a zip file

I'm attempting to download a zip file using this code:
o = urllib2.build_opener( urllib2.HTTPCookieProcessor() )
#login
p = urllib.urlencode( { usernameField: usernameVal, passField: passVal } )
f = o.open(authUrl, p )
data = f.read()
print data
f.close()
#download file
f = o.open(remoteFileUrl)
localFile = open(localFile, "wb")
localFile.write(f.read())
f.close()
I am getting some binary data, but the size of the file I "downloaded" is too small and is not a valid zip file. Am I not retrieving the zip file properly? The HTTP response header for f = o.open(remoteFileUrl) is shown below. I don't know if special processing is needed to handle this response:
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
Pragma: private
Cache-Control: must-revalidate
Expires: Tue, 31 Dec 1997 23:59:59 GMT
Content-Disposition: inline; filename="files.zip";
Content-Type: application/zip
Transfer-Encoding: chunked
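As an aside, chunked transfer encoding is normally decoded for you by httplib/urllib2, so no special processing should be needed for that header. One quick way to sanity-check whatever was saved (a small sketch using the standard zipfile module; "files.zip" is a placeholder for the saved path) is:
import zipfile

# True only if the saved file has a valid zip structure
print zipfile.is_zipfile("files.zip")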

A single f.read() call isn't guaranteed to return the whole response; it may return only part of it (which might be the whole file if it's small, but won't be for a large file).
You need to loop until read() returns nothing, like this:
while True:
    packet = f.read()
    if not packet:
        break
    localFile.write(packet)
f.close()
f.read() returns an empty string to signify that you've read the whole file.
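As an aside (not part of the original answer), the same copy loop can be written with the standard library's shutil.copyfileobj, which copies between two file-like objects in fixed-size chunks ("files.zip" below is a placeholder path):
import shutil

f = o.open(remoteFileUrl)
with open("files.zip", "wb") as localFile:
    # copies in chunks, so large files never have to fit in memory
    shutil.copyfileobj(f, localFile)
f.close()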

If you don't mind reading the whole zip file into memory, the simplest way to read and write it is as follows:
data = f.read()
with open(localFile, 'wb') as output:
    output.write(data)
Otherwise, to read and write in chunks as you get them over the network, do:
with open(localFile, "wb") as output:
    chunk = f.read(64 * 1024)
    while chunk:
        output.write(chunk)
        chunk = f.read(64 * 1024)
This is a little less neat, but avoids keeping the whole file in memory at once. Hope it helps.

Here is a more robust solution that uses urllib2 to download the file in chunks and print the progress of the download:
import os
import urllib2
import math

def downloadChunks(url):
    """Helper to download large files.

    The only arg is a url. The file is saved to a temp directory,
    downloaded in chunks, and the percentage downloaded is printed as it goes.
    """
    baseFile = os.path.basename(url)

    # build the destination path in a temp directory
    os.umask(0002)
    temp_path = "/tmp/"
    try:
        file_path = os.path.join(temp_path, baseFile)

        req = urllib2.urlopen(url)
        total_size = int(req.info().getheader('Content-Length').strip())
        downloaded = 0
        CHUNK = 256 * 10240
        with open(file_path, 'wb') as fp:
            while True:
                chunk = req.read(CHUNK)
                downloaded += len(chunk)
                # float division so the percentage isn't truncated to 0
                print math.floor((downloaded / float(total_size)) * 100)
                if not chunk:
                    break
                fp.write(chunk)
    except urllib2.HTTPError, e:
        print "HTTP Error:", e.code, url
        return False
    except urllib2.URLError, e:
        print "URL Error:", e.reason, url
        return False

    return file_path
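A brief usage sketch (the URL is just a placeholder): the helper returns the path of the downloaded temp file, or False if the request failed:
path = downloadChunks("http://example.com/files.zip")  # hypothetical URL
if path:
    print "Saved to", path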

Try this:
#download file
f = o.open(remoteFileUrl)

response = ""
while True:
    data = f.read()
    if not data:
        break
    response += data

with open(localFile, "wb") as local_file:
    local_file.write(response)

Related

Memory optimization while downloading and saving data to s3 using python

filename = "test.zip"
url = "some url"
data = requests.get(url, stream=True)
f = BytesIO()
f.write(data.content)
try:
    s3_r = boto3.resource('s3')
    s3_r.Object(bucket, filename).put(Body=f.getvalue())
    return filename
except:
    print('failed')
Note: the zip file is greater than 1 GB.
My code is working fine, but it uses a lot of memory; I need to optimize it so that memory usage stays below 200 MB.
I found the code below for downloading in chunks:
def download_file(url):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                # If you have a chunk-encoded response, uncomment the if
                # and set the chunk_size parameter to None.
                #if chunk:
                f.write(chunk)
    return local_filename
but I am not able to save what f.write(chunk) writes into the S3 bucket.
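One possible direction (a sketch only, not from the original post; the bucket name, key, helper name and URL below are placeholders): boto3's upload_fileobj accepts any file-like object, so the streaming response can be piped straight into S3 without ever holding the whole zip in memory:
import boto3
import requests

def stream_to_s3(url, bucket, key):
    # hypothetical helper: stream the download directly into S3
    s3 = boto3.client('s3')
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # upload_fileobj reads the file-like object in parts (multipart upload),
        # so memory use stays bounded regardless of the file size
        s3.upload_fileobj(r.raw, bucket, key)

stream_to_s3("some url", "my-bucket", "test.zip")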

How to create files with names from a file and then write to them? Python API request

There is an algorithm at the end of this text. It reads lines from the file SP500.txt, which contains strings and looks like this:
AAA
BBB
CCC
It substitutes these strings into the GET request and saves each full URL to a file, url_requests.txt. For example:
https://apidate.com/api/api/AAA.US?api_token=XXXXXXXX&period=d
https://apidate.com/api/api/BBB.US?api_token=XXXXXXXX&period=d
https://apidate.com/api/api/CCC.US?api_token=XXXXXXXX&period=d
It then sends each request to the API and appends all of the responses to responses.txt.
I don't know how to save the response for each request from url_requests.txt into a separate CSV file instead of responses.txt (right now they are all written to one file, not separately). Each file should be named after the corresponding line from SP500.txt. For example:
AAA.csv `(which contains data from the request response https://apidate.com/api/api/AAA.US?api_token=XXXXXXXX&period=d)`
BBB.csv `(which contains data from the request response https://apidate.com/api/api/BBB.US?api_token=XXXXXXXX&period=d)`
CCC.csv `(which contains data from the request response https://apidate.com/api/api/CCC.US?api_token=XXXXXXXX&period=d)`
So, the algorithm is:
import requests
# to use strip to remove spaces in textfiles.
import sys

# two variables to squeeze a string between these two so it will become a full uri
part1 = 'https://apidate.com/api/api/'
part2 = '.US?api_token=XXXXXXXX&period=d'

# open the outputfile before the for loop
text_file = open("url_requests.txt", "w")

# open the file which contains the strings
with open('SP500.txt', 'r') as f:
    for i in f:
        uri = part1 + i.strip(' \n\t') + part2
        print(uri)
        text_file.write(uri)
        text_file.write("\n")

text_file.close()
# open a new text file for saving the responses from the api
text_file = open("responses.txt", "w")

# send every uri to the api and write the responses to the text file
with open('url_requests.txt', 'r') as f2:
    for i in f2:
        uri = i.strip(' \n\t')
        batch = requests.get(uri)
        data = batch.text
        print(data)
        text_file.write(data)
        text_file.write('\n')

text_file.close()
And I know how to save a CSV from this response. It looks like this:
import csv
import requests

url = "https://apidate.com/api/api/AAA.US?api_token=XXXXXXXX&period=d"
response = requests.get(url)

with open('out.csv', 'w') as f:
    writer = csv.writer(f)
    for line in response.iter_lines():
        writer.writerow(line.decode('utf-8').split(','))
To save to different filenames, you have to call open() and write() inside the for loop where you read the data.
It would be good to read all the names into a list first, then generate the URLs and keep them in a list as well, so you don't have to read the files again.
Looking at the code you use to save the CSV, it seems you already get CSV from the server, so you can save it all at once with open()/write() and without the csv module.
I see it this way:
import requests
#import csv

# --- read names ---

all_names = []  # to keep all names in memory

with open('SP500.txt', 'r') as text_file:
    for line in text_file:
        line = line.strip()
        print('name:', line)
        all_names.append(line)

# --- generate urls ---

url_template = 'https://apidate.com/api/api/{}.US?api_token=XXXXXXXX&period=d'

all_urls = []  # to keep all urls in memory

with open("url_requests.txt", "w") as text_file:
    for name in all_names:
        url = url_template.format(name)
        print('url:', url)
        all_urls.append(url)
        text_file.write(url + "\n")

# --- read data ---

for name, url in zip(all_names, all_urls):
    #print('name:', name)
    #print('url:', url)

    response = requests.get(url)

    with open(name + '.csv', 'w') as text_file:
        text_file.write(response.text)

        #writer = csv.writer(text_file)
        #for line in response.iter_lines():
        #    writer.writerow(line.decode('utf-8').split(','))
You could calculate a filename for every string i, and open (create) a file each time.
Something like this:
import sys
import requests

# two variables to squeeze a string between these two so it will become a full uri
part1 = 'https://apidate.com/api/api/'
part2 = '.US?api_token=XXXXXXXX&period=d'

# open the outputfile before the for loop
text_file = open("url_requests.txt", "w")

uri_dict = {}

with open('SP500.txt', 'r') as f:
    for i in f:
        symbol = i.strip(' \n\t')
        uri = part1 + symbol + part2
        print(uri)
        text_file.write(uri)
        text_file.write("\n")
        uri_dict[symbol] = uri

text_file.close()

for symbol, uri in uri_dict.items():
    batch = requests.get(uri)
    data = batch.text
    print(data)
    # create the filename
    filename = symbol + ".csv"
    # open (create) the file and save the data
    with open(filename, "w") as f:
        f.write(data)
        f.write('\n')
You could also get rid of url_requests.txt, which becomes useless (unless you have other uses for it).

How to convert base64 encoded string to file without disk I/O in Python (for HTTP requests)

I want to send a file through the requests library in Python; it accepts a file-type input. Here is a piece of working code I've tested myself:
FILES = []
f = open('/home/dummy.txt', 'rb')
# f : <type 'file'>
FILES.append(('file', f))
response = requests.post(url = 'https://dummy.com/dummy', headers = {'some_header':'content'}, data={'some_payload':'content'}, files=FILES)
This works just fine. Now I have a base64-encoded string I just got from my database:
base64_string = "Q3VyaW91cywgYXJlbid0IHlvdT8="
FILES = []
f = convert_base64_to_file(base64_string, "dummy.txt")
# f : <type 'file'>
FILES.append(('file', f))
response = requests.post(url = 'https://dummy.com/dummy', headers = {'some_header':'content'}, data={'some_payload':'content'}, files=FILES)
I need the convert_base64_to_file function (which is an imaginary method). How do I achieve this without any disk I/O?
My base64_string doesn't have a filename. How do I simulate the filename so that it behaves just as if I had opened a file from disk?
I need the http request to post this:
------WebKitFormBoundary7MA4YWxkTrZu0gW
Content-Disposition: form-data; name="file"; filename="dummy.txt"
Content-Type: application/x-object
... contents of file goes here ...
That's why I need to specify the filename.
You can use the io module:
import base64
import io

FILES = {}
f = io.BytesIO(base64.b64decode(base64_string))
FILES['file'] = ("dummy.txt", f)
response = requests.post(url = 'https://dummy.com/dummy', headers = {'some_header':'content'}, data={'some_payload':'content'}, files=FILES)
Here is the convert_base64_to_file function:
import base64
import io
def convert_base64_to_file(data):
    return io.BytesIO(base64.b64decode(data))
To complete Megalng's answer, which solves half of my problem, we can give the constructed file a name by doing this:
import base64
import io

def convert_base64_to_file(base64_string, filename):
    f = io.BytesIO(base64.b64decode(base64_string))
    f.name = filename
    return f
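A usage sketch (an addition for illustration; the URL and payload values are the placeholders from the question): with requests you can also pass the filename explicitly in the files tuple, so setting f.name isn't strictly necessary:
import base64
import io
import requests

f = io.BytesIO(base64.b64decode(base64_string))
# a (filename, fileobj) tuple lets requests set the multipart filename
FILES = {'file': ('dummy.txt', f)}
response = requests.post(url='https://dummy.com/dummy', headers={'some_header': 'content'}, data={'some_payload': 'content'}, files=FILES)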

Check response using urllib2

I am trying to access pages by incrementing the page counter using the OpenCorporates API. The problem is that sometimes the response contains useless data. For example, for jurisdiction_code = ae_az I get a page showing just this:
{"api_version":"0.2","results":{"companies":[],"page":1,"per_page":26,"total_pages":0,"total_count":0}}
which is effectively empty. How can I check for such data and skip over it to move on to the next jurisdiction?
This is my code:
import urllib2
import json, os

f = open('codes', 'r')
for line in f.readlines():
    id = line.strip('\n')
    url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code={0}&per_page=26&current_status=Active&page={1}?api_token=ab123cd45'
    i = 0
    directory = id
    os.makedirs(directory)
    while True:
        i += 1
        req = urllib2.Request(url.format(id, i))
        print url.format(id, i)
        try:
            response = urllib2.urlopen(url.format(id, i))
        except urllib2.HTTPError, e:
            break
        content = response.read()
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
Interpret the response you get back (you already know it's json) and check if the data you want is there.
...
content = response.read()
data = json.loads(content)
if not data.get('results', {}).get('companies'):
    break
...
Here's your code written with Requests and using the answer here. It is nowhere near as robust or clean as it should be, but demonstrates the path you might want to take. The rate limit is a guess, and doesn't seem to work. Remember to put your actual API key in.
import json
import os
from time import sleep

import requests

url = 'http://api.opencorporates.com/v0.2/companies/search'
token = 'ab123cd45'
rate = 20  # seconds to wait after being rate limited

with open('codes') as f:
    codes = [l.strip('\n') for l in f]

def get_page(code, page, **kwargs):
    params = {
        # 'api_token': token,
        'jurisdiction_code': code,
        'page': page,
    }
    params.update(kwargs)
    while True:
        r = requests.get(url, params=params)
        try:
            data = r.json()
        except ValueError:
            return None
        if 'error' in data:
            print data['error']['message']
            sleep(rate)
            continue
        return data['results']

def dump_page(code, page, data):
    with open(os.path.join(code, str(page) + '.json'), 'w') as f:
        json.dump(data, f)

for code in codes:
    try:
        os.makedirs(code)
    except os.error:
        pass
    data = get_page(code, 1)
    if data is None:
        continue
    dump_page(code, 1, data['companies'])
    # page 1 is already saved, so continue from page 2 through the last page
    for page in xrange(2, int(data.get('total_pages', 1)) + 1):
        data = get_page(code, page)
        if data is None:
            break
        dump_page(code, page, data['companies'])
I think that actually this example is not "technically empty." It contains data and is therefore technically not empty. The data just does not include any fields that are useful to you. :-)
If you want your code to skip over responses that have uninteresting data, then just check whether the JSON has the necessary fields before writing any data:
content = response.read()
try:
    json_content = json.loads(content)
    if json_content['results']['total_count'] > 0:
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
except KeyError:
    break
except ValueError:
    break
etc. You might want to report the ValueError or the KeyError, but that's up to you.

urllib2 does not download pdf file

I am using the following code to download my files:
def downloadfile(url):  # function to download a file
    file_name = filename_parse(url)
    #print "***********************"
    #print "File download started:"
    #stime = time.time()
    u = urllib2.urlopen(url)
    f = open(file_name, 'wb')
    getfilesize(u)
    file_size = getfilesize(u)
    print "Downloading: %s Bytes: %s \n" % (file_name, file_size)
    file_size_dl = 0
    block_sz = 512
    progressbar(u, block_sz, file_size_dl, f, file_size)
    f.close()
The thing is that it can download any file (exe, txt, and others) except .pdf files... how can I make it download PDFs?
I know this is an old question, but for all of those who stumble upon it and are trying to download a PDF file using Python 2 and urllib2, here is the code:
import urllib2
url = 'http://mensenhandel.nl/files/pdftest2.pdf'
print "Download started..."
f = urllib2.urlopen(url)
data = f.read()
with open("test.pdf", "wb") as code:
    code.write(data)
print "Download completed..."
Just modify the URL to your needs...
Source: http://www.blog.pythonlibrary.org/2012/06/07/python-101-how-to-download-a-file/
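For reference, a rough Python 3 equivalent using urllib.request (an addition, not from the original answer; same example URL):
import urllib.request

url = 'http://mensenhandel.nl/files/pdftest2.pdf'

print("Download started...")
with urllib.request.urlopen(url) as f:
    data = f.read()
with open("test.pdf", "wb") as out:
    out.write(data)
print("Download completed...")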
