urllib.request.urlopen does not stop - Python

I am trying to get the HTML source by using urllib.request.urlopen.
I don't know why my code does not stop.
from urllib.request import urlopen

url = "https://google.com"
try:
    page = urlopen(url)
    f = open("google.html", "wb")
    f.write(page.read())
except:
    print("Error")
But with a timeout, it received the source:
from urllib.request import urlopen

url = "https://google.com"
try:
    page = urlopen(url, timeout=0.1)
    f = open("google.html", "wb")
    f.write(page.read())
except:
    print("Error")


Why is this URL not opening using Python but can be opened directly from a browser?

This is the code I wrote in Python for opening a URL:
import urllib.request
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import time
import requests
from random import randint
import urllib.parse

class AmazonReviews():
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def open_url(self, url):
        values = {}
        data = urllib.parse.urlencode(values).encode('utf-8')
        req = urllib.request.Request(url, data, self.headers)
        response = urllib.request.urlopen(req)
        html = response.read()
        return html

    def fetch_reviews(self, all_reviews_link):
        try:
            url = "https://www.amazon.in" + all_reviews_link
            print(url)
            html = self.open_url(url)
        except HTTPError as e:
            print(e)

review = AmazonReviews()
review.fetch_reviews('/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8')
I am passing the URL like this because in the main project this URL is scraped from an href attribute, which gives the relative path.
If there is any method to get the absolute URL, please suggest it.
Output:
https://www.amazon.in/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8
HTTP Error 404: NotFound
Link to the code:
https://onlinegdb.com/SyFPXzWVI
Use Selenium instead:
from selenium import webdriver
import os

browser = webdriver.Chrome(executable_path=os.path.abspath(os.getcwd()) + "/chromedriver")
link = "https://www.amazon.in/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8"
browser.get(link)
name = browser.find_element_by_xpath('//*[@id="customer-profile-name-header"]/div[2]/span').text
Output:
Dheeraj Malhotra
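As for the side question about building an absolute URL from a scraped relative href: urllib.parse.urljoin is the standard tool for this. A minimal sketch, with the base URL taken from the question:
from urllib.parse import urljoin

base = "https://www.amazon.in"
href = '/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8'
# urljoin resolves the relative path against the base,
# which is safer than plain string concatenation.
absolute = urljoin(base, href)
print(absolute)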

Invalid URL Python urllib.request.urlopen

I am downloading a PDF from a link. The link is correct, but when I try to access it, an exception is thrown.
Error
raise InvalidURL(f"URL can't contain control characters. {url!r} "
http.client.InvalidURL: URL can't contain control characters. '/pnp/archivos/portal/doc/1305doc_NP 3215 DESTRUYEN POZA DE MACERACI%C3%93N Y GRAN CANTIDADDE INSUMOS QU%C3%8DMICOS.pdf' (found at least ' ')
Code
import unittest
from urlunshort3 import UrlUnshortener
from urllib.request import Request, urlopen
import urllib.request

def download_file2(download_url):
    print(download_url)
    url = download_url
    response = urllib.request.urlopen(url)
    data = response.read()
    with open('C:/Users/usuario/Desktop/files/example.pdf', 'wb') as archivo:
        archivo.write(data)
    with open('C:/Users/usuario/Desktop/files/example.pdf', 'r') as archivo:
        print("True")

download_file2(UrlUnshortener().resolve_short("http://bit" + ".ly/31wMeIN"))
Try this:
from urllib.request import Request, urlopen
import urllib.request
import urllib.parse

def download_file2(download_url):
    print(download_url)
    # safe=':/' keeps the scheme and path separators intact
    # while percent-encoding spaces and other unsafe characters.
    url = urllib.parse.quote(download_url, safe=':/')
    response = urllib.request.urlopen(url)
    data = response.read()
    with open('C:/Users/usuario/Desktop/files/example.pdf', 'wb') as archivo:
        archivo.write(data)
    with open('C:/Users/usuario/Desktop/files/example.pdf', 'r') as archivo:
        print("True")
Also try urllib.parse.quote_plus() if your URL contains spaces and you want them changed to plus signs.
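If you want to keep the scheme and host untouched and percent-encode only the path, one option (a sketch of my own, assuming the URL is already absolute) is to split the URL first:
from urllib.parse import urlsplit, urlunsplit, quote

def encode_path(url):
    # Split into (scheme, netloc, path, query, fragment), quote only
    # the path component, then reassemble the URL.
    parts = urlsplit(url)
    return urlunsplit(parts._replace(path=quote(parts.path)))

print(encode_path("http://example.com/some file with spaces.pdf"))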
I think you can use wget; download it from https://pypi.org/project/wget/:
import wget

wget.download(url)

Why am I getting connection refused exception for this Python script?

I was writing a Python script to grab the lyrics of a song from azlyrics using the requests module. This is the script I wrote:
import requests, re
from bs4 import BeautifulSoup as bs

url = "http://search.azlyrics.com/search.php"
payload = {'q': 'shape of you'}
r = requests.get(url, params=payload)
soup = bs(r.text, "html.parser")

try:
    link = soup.find('a', {'href': re.compile('http://www.azlyrics.com/lyrics/edsheeran/shapeofyou.html')})['href']
    link = link.replace('http', 'https')
    print(link)
    raw_data = requests.get(link)
except Exception as e:
    print(e)
but I got an exception stating:
Max retries exceeded with url: /lyrics/edsheeran/shapeofyou.html (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fbda00b37f0>: Failed to establish a new connection: [Errno 111] Connection refused',))
I read on the internet that I was probably sending too many requests, so I made the script sleep for some time:
import requests, re
from bs4 import BeautifulSoup as bs
from time import sleep

url = "http://search.azlyrics.com/search.php"
payload = {'q': 'shape of you'}
r = requests.get(url, params=payload)
soup = bs(r.text, "html.parser")

try:
    link = soup.find('a', {'href': re.compile('http://www.azlyrics.com/lyrics/edsheeran/shapeofyou.html')})['href']
    link = link.replace('http', 'https')
    sleep(60)
    print(link)
    raw_data = requests.get(link)
except Exception as e:
    print(e)
but no luck!
So I tried the same with urllib.request:
import requests, re
from bs4 import BeautifulSoup as bs
from time import sleep
from urllib.request import urlopen

url = "http://search.azlyrics.com/search.php"
payload = {'q': 'shape of you'}
r = requests.get(url, params=payload)
soup = bs(r.text, "html.parser")

try:
    link = soup.find('a', {'href': re.compile('http://www.azlyrics.com/lyrics/edsheeran/shapeofyou.html')})['href']
    link = link.replace('http', 'https')
    sleep(60)
    print(link)
    raw_data = urlopen(link).read()
except Exception as e:
    print(e)
but then I got a different exception stating:
<urlopen error [Errno 111] Connection refused>
Can anyone tell me what's wrong with it and how I can fix it?
Try it in your web browser: http://www.azlyrics.com/lyrics/edsheeran/shapeofyou.html works fine, but https://www.azlyrics.com/lyrics/edsheeran/shapeofyou.html does not.
So remove your link = link.replace('http', 'https') line and try again.
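A minimal sketch of the corrected script, which is simply the original with the http-to-https replacement removed:
import requests, re
from bs4 import BeautifulSoup as bs

url = "http://search.azlyrics.com/search.php"
payload = {'q': 'shape of you'}
r = requests.get(url, params=payload)
soup = bs(r.text, "html.parser")

try:
    # Keep the link exactly as found; do not rewrite http to https.
    link = soup.find('a', {'href': re.compile('http://www.azlyrics.com/lyrics/edsheeran/shapeofyou.html')})['href']
    print(link)
    raw_data = requests.get(link)
except Exception as e:
    print(e)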

searching google images from python

#!/usr/bin/env python
import urllib
import mechanize
from bs4 import BeautifulSoup
from urlparse import urlparse

def getPic(search):
    search = search.replace(" ", "%20")
    try:
        browser = mechanize.Browser()
        browser.set_handle_robots(False)
        browser.addheaders = [('User-Agent', 'Mozilla')]
        htmltext = browser.open("https://www.google.com/search?site=&tbm=isch&source=hp&biw=1855&bih=990&q=" + search + "&oq=" + search)
        img_url = []
        formatted_images = []
        soup = BeautifulSoup(htmltext)
        results = soup.findAll("a")
        for r in results:
            try:
                if "imgres?imgurl" in r['href']:
                    img_url.append(r['href'])
            except:
                pass
        for im in img_url:
            refer_url = urlparse(str(im))
            return refer_url.query.split("&")[0].replace("imgurl=", "")
        return formatted_images
    except:
        print "error"

print getPic("occupy wall street")
Instead of getting the link of an image as output, I'm getting "[]". Can someone figure out what the problem with my code is?
Google sends "imgres?imgurl" links only to browsers with JavaScript,
but mechanize.Browser() is like a browser without JavaScript.
Turn off JavaScript in your own browser and look at the HTML Google sends.
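To check what a JavaScript-free client actually receives, a quick sketch (my own, using requests instead of mechanize) that fetches the same page and tests for the links the code expects:
import requests

# Fetch the image-search page the way a JavaScript-free client would,
# then check the raw HTML for the "imgres?imgurl" links the code looks for.
html = requests.get(
    "https://www.google.com/search",
    params={"tbm": "isch", "q": "occupy wall street"},
    headers={"User-Agent": "Mozilla"},
).text
print("imgres?imgurl" in html)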

Error using urllib2 to get torrent content from a url

Python code:
import urllib2
import requests

info_hash = '00001BD2C9F364C7DCB759DEC6BE02F913C96F72'
url = 'http://torrage.com/torrent/%s.torrent' % info_hash
print url
data = urllib2.urlopen(url).read()   # this data is wrong
# data = requests.get(url).content   # this data is OK
f = open('%s.torrent' % info_hash, 'wb')
f.write(data)
f.close()
I can't get the right torrent content with this code, but I can download the torrent correctly by opening the URL in a browser.
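For reference, the requests variant that the commented-out line says works, written out in full. The question does not show the root cause; one plausible difference (an assumption, not confirmed here) is that requests transparently decompresses gzip-encoded response bodies, while a compressed body written to disk verbatim would produce a broken .torrent file:
import requests

info_hash = '00001BD2C9F364C7DCB759DEC6BE02F913C96F72'
url = 'http://torrage.com/torrent/%s.torrent' % info_hash

# .content is the decoded response body as bytes; requests also
# follows redirects automatically.
data = requests.get(url).content
with open('%s.torrent' % info_hash, 'wb') as f:
    f.write(data)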