Unable to download file, Python xlsx file download zero bytes - python

After running the code, the downloaded file is 0 bytes. I tried writing the response directly, and I also tried using a buffer.
What am I doing wrong, and what else can I try? Please help.
import urllib2
from bs4 import BeautifulSoup
import os
import pandas as pd
storePath='/home/vinaysawant/BankIFSCCodes/'
def DownloadFiles():
    # Remove the trailing / you had, as that gives a 404 page
    url = 'https://rbi.org.in/scripts/Bs_viewcontent.aspx?Id=2009'
    conn = urllib2.urlopen(url)
    html = conn.read().decode('utf-8')
    soup = BeautifulSoup(html, "html.parser")
    # Select all A elements with href attributes containing URLs starting with http://
    for link in soup.select('a[href^="http://"]'):
        href = link.get('href')
        # Make sure it has one of the correct extensions
        if not any(href.endswith(x) for x in ['.csv', '.xls', '.xlsx']):
            continue
        filename = href.rsplit('/', 1)[-1]
        print href
        print("Downloading %s to %s..." % (href, filename))
        #urlretrieve(href, filename)
        u = urllib2.urlopen(href)
        f = open(storePath + filename, 'wb')
        meta = u.info()
        file_size = int(meta.getheaders("Content-Length")[0])
        print "Downloading: %s Bytes: %s" % (filename, file_size)
        print("Done.")
        file_size_dl = 0
        block_sz = 8192
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            file_size_dl += len(buffer)
            f.write(buffer)
            status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
            status = status + chr(8) * (len(status) + 1)
            print status,
        f.close()
        exit(1)

DownloadFiles()
I also tried
import urllib
urllib.urlretrieve(url)
and I tried using urllib2 and urllib3 as well.

I am not good with pandas and urllib2, but since there is no answer for this question yet: I think the problem is that you are trying to download the first URL,
url = 'https://rbi.org.in/scripts/Bs_viewcontent.aspx?Id=2009'
You define it here and never change it, then
u = urllib2.urlopen(url)
and after that you try to download whatever that URL returns:
buffer = u.read(block_sz)
Instead, I guess you should try to download the href. So try changing this
u = urllib2.urlopen(url)
to this
u = urllib2.urlopen(href)

The problem is that the redirection to HTTPS is done via JavaScript instead of HTTP headers, so urllib doesn't follow it. However, you can use replace() on the links and change the protocol manually:
href = link.get('href').replace('http://', 'https://')
While that resolves the issue, it's not a bad idea to wrap urlopen in a try-except block as well:
try:
    u = urllib2.urlopen(href)
except Exception as e:
    print(e)
    continue
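Putting the two answers together, a minimal sketch of the download loop might look like this (same soup, storePath and extension filter as in the question; the https rewrite and the try-except are the only changes):
for link in soup.select('a[href^="http://"]'):
    # Rewrite to https, since the site redirects via JavaScript and urllib2 won't follow that
    href = link.get('href').replace('http://', 'https://')
    if not any(href.endswith(x) for x in ['.csv', '.xls', '.xlsx']):
        continue
    filename = href.rsplit('/', 1)[-1]
    try:
        u = urllib2.urlopen(href)
    except Exception as e:
        print(e)
        continue
    with open(storePath + filename, 'wb') as f:
        f.write(u.read())
    print("Saved %s" % filename)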

Related

How can I get frequently updated .php text from a website in Python using BeautifulSoup4?

I would like to create an automatic script to download a .php text file from a webpage which is frequently updated. My program uses requests to get the webpage.
The code:
import os, pathlib, subprocess,requests, time, sys
url = 'http://metar.vatsim.net/metar.php?id=all'
current_dir = pathlib.Path(__file__).parent
os.chdir(current_dir)
icao = sys.argv[1]
fp = requests.get(url)
mybytes = fp.read()
mystr = mybytes.decode("utf8")
fp.close()
dict = {}
fls = str.splitlines(mystr)
for x in range(len(fls)):
    cur = str.split(fls[x])
    dict[cur[0]] = " ".join(cur)
try:
    print(dict[icao])
except:
    print('INCORRECT FORMAT OR AIRPORT ID\n')
When I try to read fp, it shows this error:
mybytes = fp.read()
AttributeError: 'Response' object has no attribute 'read'
Is there a better way to solve this? I am kind of stuck.
What you are looking for is urllib.request, not requests.
Maybe this will work:
import urllib.request
fp = urllib.request.urlopen(url)
mybytes = fp.read()
mystr = mybytes.decode("utf8")
fp.close()
This will read the text present in http://metar.vatsim.net/metar.php?id=all.
You can absolutely use requests. You then want to extract the .text.
Also, don't overwrite the built-in dict the way you are doing.
import requests
import sys

url = 'http://metar.vatsim.net/metar.php?id=all'
icao = sys.argv[1]  # airport code passed on the command line, as in the question
fp = requests.get(url)
mystr = fp.text
a_dict = {}
fls = str.splitlines(mystr)
for x in range(len(fls)):
    cur = str.split(fls[x])
    a_dict[cur[0]] = " ".join(cur)
try:
    print(a_dict[icao])
except:
    print('INCORRECT FORMAT OR AIRPORT ID\n')

How to make this program use instagram pic urls and download? [duplicate]

This question already has answers here:
Download large file in python with requests
(8 answers)
Closed 2 years ago.
The goal is for the program to take a user-given Instagram URL and allow downloading and saving a picture.
I've got the main part in place, but I can't understand how to go further and use the filtered, correct URL to download and save the picture on my computer.
My code so far:
EDIT: I added a download line, but I can't seem to get the right file type? I mean, it saves as whatever I want it to, but I can't open it:
import requests
import re
import shutil
def get_response(url):
    r = requests.get(url)
    while r.status_code != 200:
        r.raw.decode_content = True
        r = requests.get(url, stream = True)
    return r.text

def prepare_urls(matches):
    return list({match.replace("\\u0026", "&") for match in matches})

url = input('Enter Instagram URL: ')
response = get_response(url)
vid_matches = re.findall('"video_url":"([^"]+)"', response)
pic_matches = re.findall('"display_url":"([^"]+)"', response)
vid_urls = prepare_urls(vid_matches)
pic_urls = prepare_urls(pic_matches)

if vid_urls:
    print('Detected Videos:\n{0}'.format('\n'.join(vid_urls)))
    print("Can't download video, the provided URL must be of a picture.")

if pic_urls:
    print('Detected Pictures:\n{0}'.format('\n'.join(pic_urls)))
    from urllib.request import urlretrieve
    dst = 'Instagram picture.jpg'
    urlretrieve(url, dst)
    #EDIT ^

if not (vid_urls or pic_urls):
    print('Could not recognize the media in the provided URL.')
I think this might help...
import requests
from bs4 import BeautifulSoup as bs
import json
import os.path

insta_url = 'https://www.instagram.com'
inta_username = input('enter username of instagram : ')
response = requests.get(f"{insta_url}/{inta_username}/")
if response.ok:
    html = response.text
    bs_html = bs(html, features="lxml")
    bs_html = bs_html.text
    index = bs_html.find('profile_pic_url_hd') + 21
    remaining_text = bs_html[index:]
    remaining_text_index = remaining_text.find('requested_by_viewer') - 3
    string_url = remaining_text[:remaining_text_index].replace("\\u0026", "&")
    print(string_url, "\ndownloading...")
    while True:
        filename = 'pic_ins.jpg'
        file_exists = os.path.isfile(filename)
        if not file_exists:
            with open(filename, 'wb+') as handle:
                response = requests.get(string_url, stream=True)
                if not response.ok:
                    print(response)
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
        else:
            continue
        break
print("completed")
You can change the name of the image downloaded by changing the filename variable
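Regarding the EDIT in the question: the saved file most likely can't be opened because urlretrieve is given the page URL (url) rather than one of the extracted picture URLs. A minimal sketch of that download step using requests with stream=True (the pattern from the duplicate linked above), assuming pic_urls[0] is a direct image link and the output file name is just illustrative:
import shutil
import requests

if pic_urls:
    # Download the first detected picture, not the page URL itself
    r = requests.get(pic_urls[0], stream=True)
    r.raise_for_status()
    r.raw.decode_content = True
    with open('Instagram picture.jpg', 'wb') as f:
        shutil.copyfileobj(r.raw, f)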

urllib.request function returns None instead of expected value

def fetch_html(url):
    # ungood idea to assume its UTF-8. Try to read header
    try:
        fp = urllib.request.urlopen(url)
        fpbytes = fp.read()
        html = fpbytes.decode("utf8")
        fp.close()
        print("Success! {} chars found".format(len(html)))
        return html
    except:
        print("Failed to extract html, retrying again in a few seconds")
        time.sleep(3.5)
        fetch_html(url)
url = "https://i.reddit.com/r/AskReddit/top/.compact?sort=top&t=day"
html = fetch_html(url)
print(html)
html is still None despite len(html) reporting 70000 characters. What gives?
I tried switching the order, placing fp.close() after return html, but it still gives the same result.
I have searched for this on Google, but those issues come from not using return on their values, which is different from this question.
SOLVED: https://gist.github.com/carcigenicate/ff1523fa66602a1c47b7c5ae4d6f1e92
def fetch_html(url):
    while True:
        try:
            fp = urllib.request.urlopen(url)
            fpbytes = fp.read()
            html = fpbytes.decode("utf8")
            fp.close()
            print("Success! {} chars found".format(len(html)))
            return html
        except:
            print("Failed to extract html, retrying again in a few seconds")
            time.sleep(3.5)
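The cause of the original None: when the first attempt raises an exception, the recursive call fetch_html(url) does fetch the page (hence the "Success! ... chars found" message), but its return value is discarded, so the outer call falls off the end of the function and implicitly returns None. The loop above fixes that; a minimal alternative sketch that keeps the recursive style simply returns the recursive call (note that repeated failures would eventually hit Python's recursion limit, which is why the loop version is the safer choice):
import time
import urllib.request

def fetch_html(url):
    try:
        fp = urllib.request.urlopen(url)
        fpbytes = fp.read()
        html = fpbytes.decode("utf8")
        fp.close()
        print("Success! {} chars found".format(len(html)))
        return html
    except Exception:
        print("Failed to extract html, retrying again in a few seconds")
        time.sleep(3.5)
        return fetch_html(url)  # return the retried result so the caller doesn't get None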

Can't edit a URL with python

I am new to Python and just wanted to know if this is possible: I have scraped a URL using urllib and want to edit different pages.
Example:
http://test.com/All/0.html
I want the 0.html to become 50.html and then 100.html and so on ...
found_url = 'http://test.com/All/0.html'
base_url = 'http://test.com/All/'
for page_number in range(0, 1050, 50):
    url_to_fetch = "{0}{1}.html".format(base_url, page_number)
That should give you URLs from 0.html to 1000.html
If you want to use urlparse (as suggested in comments to your question):
import urlparse

found_url = 'http://test.com/All/0.html'
parsed_url = urlparse.urlparse(found_url)
path_parts = parsed_url.path.split("/")
for page_number in range(0, 1050, 50):
    new_path = "{0}/{1}.html".format("/".join(path_parts[:-1]), page_number)
    parsed_url = parsed_url._replace(path=new_path)
    print parsed_url.geturl()
Executing this script would give you the following:
http://test.com/All/0.html
http://test.com/All/50.html
http://test.com/All/100.html
http://test.com/All/150.html
http://test.com/All/200.html
http://test.com/All/250.html
http://test.com/All/300.html
http://test.com/All/350.html
http://test.com/All/400.html
http://test.com/All/450.html
http://test.com/All/500.html
http://test.com/All/550.html
http://test.com/All/600.html
http://test.com/All/650.html
http://test.com/All/700.html
http://test.com/All/750.html
http://test.com/All/800.html
http://test.com/All/850.html
http://test.com/All/900.html
http://test.com/All/950.html
http://test.com/All/1000.html
Instead of printing in the for loop you can use the value of parsed_url.geturl() as per your need. As mentioned, if you want to fetch the content of the page you can use python requests module in the following manner:
import requests
import urlparse

found_url = 'http://test.com/All/0.html'
parsed_url = urlparse.urlparse(found_url)
path_parts = parsed_url.path.split("/")
for page_number in range(0, 1050, 50):
    new_path = "{0}/{1}.html".format("/".join(path_parts[:-1]), page_number)
    parsed_url = parsed_url._replace(path=new_path)
    # print parsed_url.geturl()
    url = parsed_url.geturl()
    try:
        r = requests.get(url)
        if r.status_code == 200:
            with open(str(page_number) + '.html', 'w') as f:
                f.write(r.content)
    except Exception as e:
        print "Error scraping - " + url
        print e
This fetches the content from http://test.com/All/0.html up to http://test.com/All/1000.html and saves the content of each URL into its own file. The file name on disk is taken from the URL: 0.html to 1000.html.
Depending on the performance of the site you are trying to scrape, you might experience considerable delays in running the script. If performance is important, you can consider using grequests, for example:
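A minimal sketch of the grequests variant (assuming the same placeholder URLs as above; grequests.map sends the requests concurrently and returns the responses in order, with None for failures):
import grequests

base_url = 'http://test.com/All/'
page_numbers = range(0, 1050, 50)
urls = ["{0}{1}.html".format(base_url, n) for n in page_numbers]

# Build unsent requests, then fire them off concurrently
reqs = (grequests.get(u) for u in urls)
for page_number, r in zip(page_numbers, grequests.map(reqs)):
    if r is not None and r.status_code == 200:
        with open(str(page_number) + '.html', 'wb') as f:
            f.write(r.content)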

Downloading an image using Python Mechanize

I'm trying to write a Python script to download an image and set it as my wallpaper. Unfortunately, the Mechanize documentation is quite poor. My script follows the link correctly, but I'm having a hard time actually saving the image on my computer. From what I researched, the .retrieve() method should do the work, but how do I specify the path the file should be downloaded to? Here is what I have...
def followLink(browser, fixedLink):
    browser.open(fixedLink)
    if browser.find_link(url_regex = r'1600x1200'):
        browser.follow_link(url_regex = r'1600x1200')
    elif browser.find_link(url_regex = r'1400x1050'):
        browser.follow_link(url_regex = r'1400x1050')
    elif browser.find_link(url_regex = r'1280x960'):
        browser.follow_link(url_regex = r'1280x960')
    return
import mechanize, os
from BeautifulSoup import BeautifulSoup
browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
    filename = image['src'].lstrip('http://')
    filename = os.path.join(dir, filename.replace('/', '_'))
    data = browser.open(image['src']).read()
    browser.back()
    save = open(filename, 'wb')
    save.write(data)
    save.close()
This can help you download all the images from a web page. As for parsing HTML, you'd better use BeautifulSoup or lxml. Downloading is just reading the data and then writing it to a local file. You should assign your own value to dir; it is where your images will be saved.
Not sure why this solution hasn't come up, but you can use the mechanize.Browser.retrieve function as well. Perhaps this only works in newer versions of mechanize and has thus not been mentioned?
Anyway, if you wanted to shorten the answer by zhangyangyu, you could do this:
import mechanize, os
from BeautifulSoup import BeautifulSoup
browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
    filename = image['src'].lstrip('http://')
    filename = os.path.join(dir, filename.replace('/', '_'))
    browser.retrieve(image['src'], filename)
    browser.back()
Also keep in mind that you'll likely want to put all of this into a try except block like this one:
import mechanize, os
from BeautifulSoup import BeautifulSoup
browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
    filename = image['src'].lstrip('http://')
    filename = os.path.join(dir, filename.replace('/', '_'))
    try:
        browser.retrieve(image['src'], filename)
        browser.back()
    except (mechanize.HTTPError, mechanize.URLError) as e:
        pass
        # Use e.code and e.read() with HTTPError
        # Use e.reason.args with URLError
Of course you'll want to adjust this to your needs. Perhaps you want it to bomb out if it encounters an issue. It totally depends on what you want to achieve.
You can get/download the image by opening the url of the img src.
image_response = browser.open_novisit(img['src'])
To save the file now, just use open():
with open('image_out.png', 'wb') as f:
    f.write(image_response.read())
It's really crappy, but it "works" for me, building on 0xc0000022l's answer:
import mechanize, os
from BeautifulSoup import BeautifulSoup
import urllib2

def DownloadIMGs(url): # IMPORTANT: URL WITH HTTP OR HTTPS
    print "From", url
    dir = 'F:\Downloadss' # Dir for downloads
    basicImgFileTypes = ['png','bmp','cur','ico','gif','jpg','jpeg','psd','raw','tif']
    browser = mechanize.Browser()
    html = browser.open(url)
    soup = BeautifulSoup(html)
    image_tags = soup.findAll('img')
    print "N Images:", len(image_tags)
    print
    #--------- SAVE PATH
    # check if available
    if not os.path.exists(dir):
        os.makedirs(dir)
    #--------- SAVE PATH
    for image in image_tags:
        #--------- SAVE PATH + FILENAME (where it is downloading)
        filename = image['src']
        fileExt = filename.split('.')[-1]
        fileExt = fileExt[0:3]
        if (fileExt in basicImgFileTypes):
            print 'File Extension:', fileExt
            filename = filename.replace('?', '_')
            filename = os.path.join(dir, filename.split('/')[-1])
            num = filename.find(fileExt) + len(fileExt)
            filename = filename[:num]
        else:
            filename = filename.replace('?', '_')
            filename = os.path.join(dir, filename.split('/')[-1]) + '.' + basicImgFileTypes[0]
        print 'File Saving:', filename
        #--------- SAVE PATH + FILENAME (where it is downloading)
        #--------- FULL URL PATH OF THE IMG
        imageUrl = image['src']
        print 'IMAGE SRC:', imageUrl
        if (imageUrl.find('http://') > -1 or imageUrl.find('https://') > -1):
            pass
        else:
            if (url.find('http://') > -1):
                # strip the scheme so split('/')[0] below yields the host
                imageUrl = url[len('http://'):]
                imageUrl = 'http://' + imageUrl.split('/')[0] + image['src']
            elif (url.find('https://') > -1):
                imageUrl = url[len('https://'):]
                imageUrl = 'https://' + imageUrl.split('/')[0] + image['src']
            else:
                imageUrl = image['src']
        print 'IMAGE URL:', imageUrl
        #--------- FULL URL PATH OF THE IMG
        #--------- TRY DOWNLOAD
        try:
            browser.retrieve(imageUrl, filename)
            print "Downloaded:", image['src'].split('/')[-1]
            print
        except (mechanize.HTTPError, mechanize.URLError) as e:
            print "Can't Download:", image['src'].split('/')[-1]
            print
            pass
        #--------- TRY DOWNLOAD
    browser.close()
DownloadIMGs('https://stackoverflow.com/questions/15593925/downloading-a-image-using-python-mechanize')
