I tried to test the connection to a website in PyCharm. I am using the BeautifulSoup module. When I run the code below, I get the error shown after it:
import urllib.request
from bs4 import BeautifulSoup

def get_html(url):
    response = urllib.request.urlopen(url)
    return response.read()

def parse(html):
    soup = BeautifulSoup(html, 'html.parser')

def main():
    print(get_html('https://tap.az/'))

if __name__ == '__main__':
    main()
C:\Users\Adil\AppData\Local\Programs\Python\Python37-32\python.exe C:/Users/Adil/Desktop/PAbot/pabot.py
Traceback (most recent call last):
File "C:/Users/Adil/Desktop/PAbot/pabot.py", line 21, in <module>
main()
File "C:/Users/Adil/Desktop/PAbot/pabot.py", line 17, in main
print(get_html('https://tap.az/'))
File "C:/Users/Adil/Desktop/PAbot/pabot.py", line 8, in get_html
response = urllib.request.urlopen(url)
File "C:\Users\Adil\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\Adil\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\Users\Adil\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\Adil\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\Users\Adil\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\Adil\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
Try adding a user agent to your request headers. Many servers return 403 Forbidden when they see urllib's default Python-urllib user agent:
from bs4 import BeautifulSoup
import urllib.request
user_agent = 'Mozilla/5.0'
headers = {'User-Agent': user_agent}
request = urllib.request.Request(url='https://tap.az/', headers=headers)
response = urllib.request.urlopen(request)
result = BeautifulSoup(response.read(), 'html.parser')
print(result)
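As a rough sketch, the same header can be folded back into the original get_html helper (keeping the rest of the script as posted):

import urllib.request

def get_html(url):
    # A browser-like User-Agent keeps the server from rejecting the request
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urllib.request.urlopen(request)
    return response.read()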
With this code I am reading a URL and using the data for filtering, but urllib is not working:
import urllib.request
import json

url = "myurl"
response = urllib.request.urlopen(url)
data = json.loads(response.read())
Yesterday it was working well, but now it gives me this error:
Traceback (most recent call last):
File "vaccine_survey.py", line 22, in <module>
response = urllib.request.urlopen(url)
File "/usr/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.6/urllib/request.py", line 532, in open
response = meth(req, response)
File "/usr/lib/python3.6/urllib/request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.6/urllib/request.py", line 570, in error
return self._call_chain(*args)
File "/usr/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/usr/lib/python3.6/urllib/request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
This works for me; here 'myurl' is a URL address:
import json
from urllib.request import Request, urlopen

req = Request('myurl', headers={'User-Agent': 'Mozilla/5.0'})
response = urlopen(req).read()
data = json.loads(response)
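If the endpoint still answers with an error status now and then, you can catch it explicitly and inspect what the server sent back; a minimal sketch ('myurl' is still a placeholder):

import json
from urllib.request import Request, urlopen
from urllib.error import HTTPError

req = Request('myurl', headers={'User-Agent': 'Mozilla/5.0'})
try:
    data = json.loads(urlopen(req).read())
except HTTPError as err:
    # The status code and response headers often hint at why the server refused
    print(err.code, err.headers)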
Problem
I'm using urllib.request.urlopen on the Wall Street Journal and it gives me a 404.
Details
Other sites work fine, and I get the same error if I use https://. I ran this example in the REPL, but the same error happens in calls from my Django server:
>>> from urllib.request import urlopen
>>> urlopen('http://www.wsj.com')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 531, in open
response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 569, in error
return self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
This is how it should work:
>>> urlopen('http://www.cbc.ca')
<http.client.HTTPResponse object at 0x10b0f8c88>
I'm not sure how to debug this. Anyone know what's going on, and how I can fix it?
First import Request like this:
from urllib.request import Request, urlopen
and then pass your URL and headers to Request like below:
url = 'https://www.wsj.com/'
response_obj = urlopen(Request(url, headers={'User-Agent': 'Mozilla/5.0'}))
print(response_obj)
I tested it just now and it's working.
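To go from the response object to the actual page, read and decode the body; a short follow-up sketch (assuming the page is UTF-8 encoded):

html = response_obj.read().decode('utf-8')  # raw HTML of the page
print(html[:200])  # first 200 characters as a sanity check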
I want to crawl a website with 786 pages. My code extracts data and saves it to an Excel file. When I run my program on 10 pages, it works fine, but when I try to crawl all 786 pages in one go, it gives me this error:
Traceback (most recent call last):
File "app.py", line 32, in <module>
crawl_names(i)
File "app.py", line 16, in crawl_names
html = urlopen(rq)
File "/usr/lib/python3.5/urllib/request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.5/urllib/request.py", line 472, in open
response = meth(req, response)
File "/usr/lib/python3.5/urllib/request.py", line 582, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.5/urllib/request.py", line 510, in error
return self._call_chain(*args)
File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
result = func(*args)
File "/usr/lib/python3.5/urllib/request.py", line 590, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 524: Origin Time-out
My code:
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import re
import xlwt

headers = {'User-Agent': 'Mozilla/5.0'}
book = xlwt.Workbook(encoding="utf-8")
sheet1 = book.add_sheet("Sheet 1")

def crawl_names(page):
    site = 'http://nex1music.ir/pages/' + str(page) + '/'
    rq = Request(site, headers=headers)
    html = urlopen(rq)
    bsObj = BeautifulSoup(html, "lxml")
    musics = bsObj.findAll("div", {"class": "pctn"})
    names = []
    for m in musics:
        names.append(m.findChild().findAll("a", {"href": re.compile("http\:\/\/nex1music\.ir\/tag\/.+\/")})[-1].string)
    for i in range(len(names)):
        sheet1.write(i, page-1, names[i])

for i in range(1, 786):
    crawl_names(i)

book.save("data.xls")
How can I change my code in order to crawl all 786 pages? Thank you.
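HTTP 524 is Cloudflare's origin time-out, which usually means the site is being hit with too many back-to-back requests. One common approach (a sketch, not tested against this site; crawl_names_with_retry is a hypothetical wrapper around the existing crawl_names) is to pause between pages and retry a page that times out, replacing the final loop:

import time
from urllib.error import HTTPError

def crawl_names_with_retry(page, retries=3, delay=2):
    for attempt in range(retries):
        try:
            crawl_names(page)
            return
        except HTTPError as err:
            # Back off and retry only on the 524 time-out; re-raise anything else
            if err.code == 524 and attempt < retries - 1:
                time.sleep(delay * (attempt + 1))
            else:
                raise

for i in range(1, 787):  # pages 1 through 786 inclusive
    crawl_names_with_retry(i)
    time.sleep(1)  # a small pause between pages to go easy on the server

book.save("data.xls")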
I'm trying to get Glassdoor data from their API in Python:
import urllib2
id1 = 'x'
key = 'y'
action = 'employers'
company = 'company'
basepath = 'http://api.glassdoor.com/api/api.htm?v=1&format=json&t.p='
url = basepath + id1 + '&t.k=' + key + '&action=' + action + '&q=' + company + '&userip=192.168.43.42&useragent=Mozilla/5.0'
response = urllib2.urlopen(url)
html = response.read()
And I'm getting the following error:
>>> response = urllib2.urlopen(url)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "//anaconda/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "//anaconda/lib/python2.7/urllib2.py", line 437, in open
response = meth(req, response)
File "//anaconda/lib/python2.7/urllib2.py", line 550, in http_response
'http', request, response, code, msg, hdrs)
File "//anaconda/lib/python2.7/urllib2.py", line 475, in error
return self._call_chain(*args)
File "//anaconda/lib/python2.7/urllib2.py", line 409, in _call_chain
result = func(*args)
File "//anaconda/lib/python2.7/urllib2.py", line 558, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
Can anyone help...?
Thanks
Below is the working code, with some improvements: it adds the BeautifulSoup module and sets a User-Agent in the hdr variable.
import urllib2
from BeautifulSoup import BeautifulSoup
url = "http://api.glassdoor.com/api/api.htm?t.p=yourID&t.k=yourkey&userip=8.28.178.133&useragent=Mozilla&format=json&v=1&action=employers&q="
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(url,headers=hdr)
response = urllib2.urlopen(req)
soup = BeautifulSoup(response)
Hope it helps, thanks.
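Since the request asks for format=json, it may be more natural to parse the body with the json module rather than BeautifulSoup; a sketch under the same Python 2 setup (the t.p and t.k values are placeholders):

import json
import urllib2

url = "http://api.glassdoor.com/api/api.htm?t.p=yourID&t.k=yourkey&userip=8.28.178.133&useragent=Mozilla&format=json&v=1&action=employers&q="
req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
data = json.load(urllib2.urlopen(req))  # parse the JSON body directly
print(data)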
I want to write a script in Python which uses Tor/proxy addresses to access the web. As a test I have the following script:
import urllib2
from BeautifulSoup import BeautifulSoup
protocol = 'socks4'
ip = '127.0.0.1:9050'
proxy = urllib2.ProxyHandler({protocol:ip})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
page = urllib2.urlopen("http://www.ifconfig.me/ip").read()
print(page)
The problem is that it shows my own IP address, while running this directly from the terminal:
proxychains curl ifconfig.me/ip
shows a different IP. How can I fix it?
When http is used instead of socks4, it gives the following error:
Traceback (most recent call last):
File "proxy_test.py", line 11, in <module>
page = urllib2.urlopen("http://www.ifconfig.me/ip").read()
File "/usr/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 400, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 513, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 438, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 372, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 521, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 501: Tor is not an HTTP Proxy
I use http (not socks) and it works:
import urllib2
from BeautifulSoup import BeautifulSoup
protocol = 'http'
ip = '127.0.0.1:8118'
proxy = urllib2.ProxyHandler({protocol:ip})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
page = urllib2.urlopen("http://www.ifconfig.me/ip").read()
print(page)
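This works because urllib2's ProxyHandler only speaks to HTTP proxies, and 127.0.0.1:8118 is the default port of Privoxy, an HTTP proxy commonly chained in front of Tor's SOCKS port 9050. If you would rather talk to Tor's SOCKS port directly, one option is to route the socket layer through it with the PySocks package (a sketch, assuming PySocks is installed and Tor is listening on 9050):

import socks
import socket
import urllib2

# Send every new socket through Tor's default SOCKS5 port
socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9050)
socket.socket = socks.socksocket

page = urllib2.urlopen("http://www.ifconfig.me/ip").read()
print(page)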