How do I download a zip file in Python using urllib2?

Two-part question. I am trying to download multiple archived Cory Doctorow podcasts from the Internet Archive, the old ones that do not come into my iTunes feed. I have written the script, but the downloaded files are not properly formatted.
Q1 - What do I change to correctly download the zipped MP3 files?
Q2 - What is a better way to pass the variables into the URL?
# Download the file, given the file name, the file mode and the base url.
def dlfile(file_name, file_mode, base_url):
    from urllib2 import Request, urlopen, URLError, HTTPError

    # create the url and the request
    url = base_url + file_name + mid_url + file_name + end_url
    req = Request(url)

    # Open the url
    try:
        f = urlopen(req)
        print "downloading " + url

        # Open our local file for writing
        local_file = open(file_name, "wb" + file_mode)
        # Write to our local file
        local_file.write(f.read())
        local_file.close()

    # handle errors
    except HTTPError, e:
        print "HTTP Error:", e.code, url
    except URLError, e:
        print "URL Error:", e.reason, url

# Set the range
var_range = range(150, 153)

# Iterate over image ranges
for index in var_range:
    base_url = 'http://www.archive.org/download/Cory_Doctorow_Podcast_'
    mid_url = '/Cory_Doctorow_Podcast_'
    end_url = '_64kb_mp3.zip'

    # create file name based on known pattern
    file_name = str(index)
    dlfile(file_name, "wb", base_url)
This script was adapted from here

Here's how I'd deal with the URL building and downloading. I name the local file after the basename of the URL (the last segment after the final slash), and I open it with a with statement. That uses a context manager, which is nice because it closes the file when the block exits. I also use a format-string template to build the URL. Note that urlopen doesn't need a Request object; a plain string is fine.
import os
from urllib2 import urlopen, URLError, HTTPError

def dlfile(url):
    # Open the url
    try:
        f = urlopen(url)
        print "downloading " + url

        # Open our local file for writing
        with open(os.path.basename(url), "wb") as local_file:
            local_file.write(f.read())

    # handle errors
    except HTTPError, e:
        print "HTTP Error:", e.code, url
    except URLError, e:
        print "URL Error:", e.reason, url

def main():
    # Iterate over image ranges
    for index in range(150, 151):
        url = ("http://www.archive.org/download/"
               "Cory_Doctorow_Podcast_%d/"
               "Cory_Doctorow_Podcast_%d_64kb_mp3.zip" %
               (index, index))
        dlfile(url)

if __name__ == '__main__':
    main()

An older solution on SO along the lines of what you want:
download a zip file to a local drive and extract all files to a destination folder using python 2.5
Python and urllib
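Since Q1 was about the zipped MP3s, here is also a minimal sketch of downloading one archive and extracting it with the standard zipfile module (Python 2.7 to match the answer above; the destination directory and the episode number are just illustrative assumptions):

import os
import zipfile
from urllib2 import urlopen, URLError, HTTPError

def download_and_extract(url, dest_dir="podcasts"):
    # Save the zip under the URL basename, then extract its members into dest_dir.
    archive_name = os.path.basename(url)
    try:
        f = urlopen(url)
        with open(archive_name, "wb") as local_file:
            local_file.write(f.read())
    except HTTPError, e:
        print "HTTP Error:", e.code, url
        return
    except URLError, e:
        print "URL Error:", e.reason, url
        return
    with zipfile.ZipFile(archive_name) as z:
        z.extractall(dest_dir)

download_and_extract("http://www.archive.org/download/"
                     "Cory_Doctorow_Podcast_150/"
                     "Cory_Doctorow_Podcast_150_64kb_mp3.zip")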

Related

Solving HTTP Error 400: Bad Request with working links in Google Chrome

I know this has been asked in many forms already, but I can't seem to find my answer and hope to receive some help here.
I am trying to download files that are stored behind a list of URLs.
I've found the following function that should do what I want:
import os.path
import urllib.request
import requests

for link in links:
    link = link.strip()
    name = link.rsplit('/', 1)[-1]
    filename = os.path.join('downloads', name)

    if not os.path.isfile(filename):
        print('Downloading: ' + filename)
        try:
            urllib.request.urlretrieve(link, filename)
        except Exception as inst:
            print(inst)
            print('  Encountered unknown error. Continuing.')
I always receive HTTP Error 400: Bad Request.
I tried setting user agents to fake a browser visit (I use Google Chrome), but it did not help at all. The links work when copied into the browser, so I wonder how to solve this.
Spaces have to be quoted. I've used the quote function to quote the filename part of your link, and rindex to cut off the last part of the URL path. The urlsplit and urlunsplit functions should really be used instead of string operations, but... I'm too lazy :D (there's a sketch of that variant after the code below).
import os.path
import urllib.request
from urllib.parse import quote

links = ['https://undpgefpims.org/attachments/6222/216410/1717887/1724973/6222_4NC_3BUR_Macedonia_Final ProDoc 30 July 2018.doc',
         'https://undpgefpims.org/attachments/6214/216405/1719672/1729436/6214_4NC_Niger_ProDoc final for DoA.doc']

for link in links:
    link = link.strip()
    name = link.rsplit('/', 1)[-1]
    filename = os.path.join('downloads', name)

    if not os.path.isfile(filename):
        print('Downloading: ' + filename)
        try:
            urllib.request.urlretrieve(link[:link.rindex('/') + 1] + quote(link[link.rindex('/') + 1:]), filename)
        except Exception as inst:
            print(inst)
            print('  Encountered unknown error. Continuing.')
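For reference, a minimal sketch of the urlsplit/urlunsplit variant the answer alludes to: it quotes only the path component instead of slicing the string (the helper name is my own):

from urllib.parse import urlsplit, urlunsplit, quote

def quote_url_path(url):
    # Split the URL into its components, quote only the path
    # (so spaces become %20), and reassemble the pieces.
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc,
                       quote(parts.path), parts.query, parts.fragment))

print(quote_url_path('https://undpgefpims.org/attachments/6214/'
                     '6214_4NC_Niger_ProDoc final for DoA.doc'))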
I found the answer to my own question.
The problem was that the URLs contained whitespace, which apparently cannot be handled as-is by urllib.request. The solution is to quote the URLs first and then request the quoted URL.
This is the working code for anyone who runs into the same problem:
import os.path
import urllib.request
import requests
import urllib.parse

for link in urls:
    link = link.strip()
    name = link.rsplit('/', 1)[-1]
    filename = os.path.join(name)
    quoted_url = urllib.parse.quote(link, safe=":/")

    if not os.path.isfile(filename):
        print('Downloading: ' + filename)
        try:
            urllib.request.urlretrieve(quoted_url, filename)
        except Exception as inst:
            print(inst)
            print('  Encountered unknown error. Continuing.')

Downloading csv data from an API

I am attempting to download CSV data from an API, which I will then edit. I am struggling to get the different functions to work together, i.e. passing the export link through to download the file and then through to opening it.
'''
File name: downloadAWR.py
Author: Harry&Joe
Date created: 3/10/17
Date last modified: 5/10/17
Version: 3.6
'''
import requests
import json
import urllib2
import zipfile
import io
import csv
import os
from urllib2 import urlopen, URLError, HTTPError
geturl() is used to create a download link for the CSV data. One link is created from user input data, in this case the project name and dates; this produces a link we can use to download the data. The link is stored in export_link.
def geturl():
    # getProjectName
    project_name = 'BIMM'
    # getApiToken
    api_token = "API KEY HERE"
    # getStartDate
    start_date = '2017-01-01'
    # getStopDate
    stop_date = '2017-09-01'
    url = "https://api.awrcloud.com/get.php?action=export_ranking&project=%s&token=%s&startDate=%s&stopDate=%s" % (project_name, api_token, start_date, stop_date)
    export_link = requests.get(url).content
    return export_link
dlfile is used to actually follow the link and get a file we can manipulate and edit, e.g. removing columns and some of the data.
def dlfile(export_link):
    # Open the url
    try:
        f = urlopen(export_link)
        print("downloading " + export_link)

        # Open our local file for writing
        with open(os.path.basename(export_link), "wb") as local_file:
            local_file.write(f.read())

    # handle errors
    except HTTPError as e:
        print("HTTP Error:", e.code, export_link)
    except URLError as e:
        print("URL Error:", e.reason, export_link)
    return f
readdata is used to open the downloaded file so we can use its contents.
def readdata():
    with zipfile.ZipFile(io.BytesIO(zipdata)) as z:
        for f in z.filelist:
            csvdata = z.read(f)
    # reader = csv.reader(io.StringIO(csvdata.decode()))

def main():
    # Do something with the csv data
    export_link = (geturl())
    data = dlfile(export_link)
    csvdata = data.readdata()

if __name__ == '__main__':
    main()
Generally I'm finding that the pieces of code work independently, but they fall apart when I try to put them all together.
You need to clean up your code and call it appropriately. It seems you copy-pasted from different sources, and now you have a salad bowl of code that isn't mixing well.
If the task is just to fetch and open a remote file in order to do something with it:
import io
import zipfile
import requests

def get_csv_file(project, api_token, start_date, end_date):
    url = "https://api.awrcloud.com/get.php"
    params = {'action': 'export_ranking',
              'project': project,
              'token': api_token,
              'startDate': start_date,
              'stopDate': end_date}
    r = requests.get(url, params)
    r.raise_for_status()
    # r.content is the export link returned by the API; fetch it and wrap the zip bytes
    return zipfile.ZipFile(io.BytesIO(requests.get(r.content).content))

def process_csv_file(zip_file):
    contents = zip_file.extractall()
    # do stuff with the contents

if __name__ == '__main__':
    process_csv_file(get_csv_file('BIMM', 'api-key', '2017-01-01', '2017-09-01'))
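If you'd rather read the CSV rows in memory instead of extracting to disk, here is a minimal sketch under the same assumptions (the helper name is mine, and zip_file is the ZipFile returned above):

import csv
import io

def iter_csv_rows(zip_file):
    # Read each member of the zip as text and yield parsed CSV rows.
    for name in zip_file.namelist():
        with zip_file.open(name) as member:
            text = io.TextIOWrapper(member, encoding='utf-8')
            for row in csv.reader(text):
                yield row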

Python: Download multiple .gz files from single URL

I am having trouble downloading multiple network files from an online directory. I am using a virtual Linux environment (Lubuntu) over VMware.
My aim is to access a subfolder and download all the .gz files it contains into a new local directory that is different from the home directory. I tried multiple solutions and this is the closest I got.
import os
from urllib2 import urlopen, URLError, HTTPError

def dlfile(url):
    # Open the url
    try:
        f = urlopen(url)
        print "downloading " + url

        # Open our local file for writing
        with open(os.path.basename(url), "wb") as local_file:
            local_file.write(f.read())

    # handle errors
    except HTTPError, e:
        print "HTTP Error:", e.code, url
    except URLError, e:
        print "URL Error:", e.reason, url

def main():
    # Iterate over image ranges
    for index in range(100, 250, 5):
        url = ("http://data.ris.ripe.net/rrc00/2016.01/updates20160128.0%d.gz"
               % (index))
        dlfile(url)

if __name__ == '__main__':
    main()
The online directory needs no authentication; a link can be found here.
I tried string manipulation and using a loop over the filenames, but it gave me the following error:
HTTP Error: 404 http://data.ris.ripe.net/rrc00/2016.01/updates20160128.0245.gz
Look at the URL:
Good URL: http://data.ris.ripe.net/rrc00/2016.01/updates.20160128.0245.gz
Bad URL (your code): http://data.ris.ripe.net/rrc00/2016.01/updates20160128.0245.gz
The dot between updates and 20160128 is missing.
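A minimal correction of the loop from the question, with only the missing dot added to the URL template (everything else left as in the original):

def main():
    # Iterate over the timestamp suffixes used in the file names
    for index in range(100, 250, 5):
        url = ("http://data.ris.ripe.net/rrc00/2016.01/updates.20160128.0%d.gz"
               % (index))
        dlfile(url)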

Scrape a jpg file on webpage, then saving it using python

OK, I'm trying to scrape a jpg image from the Gucci website. Take this one as an example.
http://www.gucci.com/images/ecommerce/styles_new/201501/web_full/277520_F4CYG_4080_001_web_full_new_theme.jpg
I tried urllib.urlretrieve, which doesn't work because Gucci blocked the function. So I wanted to use requests to scrape the source data for the image and then write it into a .jpg file.
image = requests.get("http://www.gucci.com/images/ecommerce/styles_new/201501/web_full/277520_F4CYG_4080_001_web_full_new_theme.jpg").text.encode('utf-8')
I encoded it because if I don't, it keeps telling me that gbk cannot encode the string.
Then:
with open('1.jpg', 'wb') as f:
    f.write(image)
Looks good, right? But the result is that the jpg file cannot be opened. There's no image! Windows tells me the jpg file is damaged.
What could be the problem?
I'm thinking that maybe when I scraped the image I lost some information, or some characters were scraped incorrectly. But how can I find out which?
I'm thinking that maybe some information is lost via encoding. But if I don't encode, I cannot even print it, let alone write it into a file.
What could go wrong?
I am not sure about the purpose of your use of encode. You're not working with text, you're working with an image. You need to access the response as binary data, not as text, and use image manipulation functions rather than text ones. Try this:
from PIL import Image
from io import BytesIO
import requests
response = requests.get("http://www.gucci.com/images/ecommerce/styles_new/201501/web_full/277520_F4CYG_4080_001_web_full_new_theme.jpg")
bytes = BytesIO(response.content)
image = Image.open(bytes)
image.save("1.jpg")
Note the use of response.content instead of response.text. You will need to have PIL or Pillow installed to use the Image module. BytesIO is included in Python 3.
Or you can just save the data straight to disk without looking at what's inside:
import requests
response = requests.get("http://www.gucci.com/images/ecommerce/styles_new/201501/web_full/277520_F4CYG_4080_001_web_full_new_theme.jpg")
with open('1.jpg', 'wb') as f:
    f.write(response.content)
A JPEG file is not text, it's binary data, so you need to use the response's content attribute to access it.
The code below also includes a get_headers() function, which can be handy when you're exploring a Web site.
import requests

def get_headers(url):
    resp = requests.head(url)
    print("Status: %d" % resp.status_code)
    resp.raise_for_status()
    for t in resp.headers.items():
        print('%-16s : %s' % t)

def download(url, fname):
    ''' Download url to fname '''
    print("Downloading '%s' to '%s'" % (url, fname))
    resp = requests.get(url)
    resp.raise_for_status()
    with open(fname, 'wb') as f:
        f.write(resp.content)

def main():
    site = 'http://www.gucci.com/images/ecommerce/styles_new/201501/web_full/'
    basename = '277520_F4CYG_4080_001_web_full_new_theme.jpg'
    url = site + basename
    fname = 'qtest.jpg'
    try:
        #get_headers(url)
        download(url, fname)
    except requests.exceptions.HTTPError as e:
        print("%s '%s'" % (e, url))

if __name__ == '__main__':
    main()
We call the .raise_for_status() method so that get_headers() and download() raise an Exception if something goes wrong; we catch the Exception in main() and print the relevant info.

How to download a few files simultaneously from ftp in Python

I'm a newbie in Python programming.
My question is how to download several files at the same time, not file by file, but simultaneously from one directory on an FTP server. Right now I use this script, but I don't know how to rebuild the code:
filenames = []
ftp.retrlines("NLST", filenames.append)
print filenames
print path

for filename in filenames:
    local_filename = filename
    print filename
    print local_filename
    f = open(local_filename, "wb")
    s = ftp.size(local_filename)
    sMB = s / (1024 * 1024)
    print "file name: " + local_filename + "\nfile size: " + str(sMB) + " MB"
    ftp.retrbinary("RETR %s" % local_filename, f.write)
    print "\n Done :) "
    time.sleep(2)
    f.close()

ftp.quit()  # closing connection
time.sleep(5)
It works fine, but it's not what I need.
You could use multiple threads or processes. Make sure you create a new ftplib.FTP object in each thread (a sketch of that ftplib variant follows the url2filename helper below). The simplest way (code-wise) is to use multiprocessing.Pool:
#!/usr/bin/env python
from multiprocessing.dummy import Pool  # use threads

try:
    from urllib import urlretrieve
except ImportError:  # Python 3
    from urllib.request import urlretrieve

def download(url):
    url = url.strip()
    try:
        return urlretrieve(url, url2filename(url)), None
    except Exception as e:
        return None, e

if __name__ == "__main__":
    p = Pool(20)  # specify number of concurrent downloads
    print(p.map(download, open('urls')))  # perform parallel downloads
where the urls file contains ftp URLs for the files to download, e.g. ftp://example.com/path/to/file, and url2filename() extracts the filename part from a URL, e.g.:
import os
import posixpath

try:
    from urlparse import urlsplit
    from urllib import unquote
except ImportError:  # Python 3
    from urllib.parse import urlsplit, unquote

def url2filename(url, encoding='utf-8'):
    """Return basename corresponding to url.
    >>> print url2filename('http://example.com/path/to/dir%2Ffile%C3%80?opt=1')
    fileÀ
    """
    urlpath = urlsplit(url).path
    basename = posixpath.basename(unquote(urlpath))
    if os.path.basename(basename) != basename:
        raise ValueError(url)  # reject 'dir%5Cbasename.ext' on Windows
    return basename
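For completeness, a minimal sketch of the ftplib variant mentioned above, with a fresh ftplib.FTP connection per worker thread; the host, directory and anonymous login are illustrative assumptions:

from multiprocessing.dummy import Pool  # thread pool
import ftplib

HOST = 'ftp.example.com'     # assumed host
REMOTE_DIR = '/path/to/dir'  # assumed directory

def fetch(filename):
    # Each worker opens its own FTP connection, since ftplib.FTP
    # objects are not safe to share between threads.
    ftp = ftplib.FTP(HOST)
    ftp.login()  # anonymous login assumed
    ftp.cwd(REMOTE_DIR)
    with open(filename, 'wb') as f:
        ftp.retrbinary('RETR %s' % filename, f.write)
    ftp.quit()
    return filename

if __name__ == '__main__':
    # List the directory once, then download the files in parallel.
    ftp = ftplib.FTP(HOST)
    ftp.login()
    ftp.cwd(REMOTE_DIR)
    filenames = ftp.nlst()
    ftp.quit()
    print(Pool(5).map(fetch, filenames))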
