import urllib2
def GetBrowserHtml_content(url):
    """Download ``url`` with browser-like request headers and return its body.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body exactly as ``response.read()`` delivers it
        (raw, not decoded).

    Raises:
        urllib2.HTTPError: The server answered with an error status code.
    """
    # Header values must be strings. A ``None`` value is stringified and sent
    # as the literal text "None", so no Referer entry is included at all.
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8,gbk;q=0.7,*;q=0.3',
        'Connection': 'close',
    }
    req_timeout = 5  # seconds
    request = urllib2.Request(url, None, req_header)
    response = urllib2.urlopen(request, None, req_timeout)
    try:
        return response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
# Example usage: download one page and keep its raw body.
url = 'http://www.ccdi.gov.cn/jlsc/index_4.html'
html_content = GetBrowserHtml_content(url)
I have a piece of code like the one above.
When I run it, I get the following error:
Traceback (most recent call last):
File "E:/Programming/python/CWSeg/spider/hahahha.py", line 34, in <module>
html_content = GetBrowserHtml_content(url)
File "E:/Programming/python/CWSeg/spider/hahahha.py", line 22, in GetBrowserHtml_content
response = urllib2.urlopen(request,None,req_timeout)
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 406, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 519, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 444, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 378, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 527, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 521:
Can anyone point out what I'm doing wrong? Thanks in advance.
Related
With this code I am reading a URL and using the data for filtering, but urllib does not work:
url = "myurl"
response = urllib.request.urlopen(url)
data = json.loads(response.read())
Yesterday it was working well, but now it gives me this error:
Traceback (most recent call last):
File "vaccine_survey.py", line 22, in <module>
response = urllib.request.urlopen(url)
File "/usr/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.6/urllib/request.py", line 532, in open
response = meth(req, response)
File "/usr/lib/python3.6/urllib/request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.6/urllib/request.py", line 570, in error
return self._call_chain(*args)
File "/usr/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/usr/lib/python3.6/urllib/request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
This works for me; here 'myurl' is a URL address:
import json
from urllib.request import Request, urlopen

# A browser-like User-Agent replaces the default one urllib would send.
req = Request('myurl', headers={'User-Agent': 'Mozilla/5.0'})
# The context manager closes the connection once the body has been read.
with urlopen(req) as connection:
    response = connection.read()
# ``response`` already holds the body as bytes, so it is decoded and parsed
# directly; calling ``.read()`` on it a second time raises AttributeError.
data = json.loads(response.decode('utf-8'))
Why does calling a website with curl work, while calling it from Python always returns 429? I tried setting a lot of different User-Agents, cookies, ...
curl call:
curl "https://query2.finance.yahoo.com/v10/finance/quoteSummary/GLW?formatted=true&crumb=8ldhetOu7RJ&lang=en-US&region=US&modules=summaryDetail&corsDomain=finance.yahoo.com"
response: {"quoteSummary":{"result":[{"summaryDetail":{"maxAge":1,"priceHint":{"raw":2,"fmt":"2","longFmt":"2"},"previousClose":{"raw":37.12,"fmt":"37.12"},"open":{"raw":37.19,"fmt":"37.19"},"dayLow":{"raw":37.12,"fmt":"37.12"},"dayHigh":{"raw":37.95,"fmt":"37.95"},"regularMarketPreviousClose":{"raw":37.12,"fmt":"37.12"},"regularMarketOpen":{"raw":37.19,"fmt":"37.19"},"regularMarketDayLow":{"raw":37.12,"fmt":"37.12"},"regularMarketDayHigh":{"raw":37.95,"fmt":"37.95"},"dividendRate":{"raw":0.88,"fmt":"0.88"},"dividendYield":{"raw":0.0232,"fmt":"2.32%"},"exDividendDate":{"raw":1605139200,"fmt":"2020-11-12"},"payoutRatio":{"raw":3.3077,"fmt":"330.77%"},"fiveYearAvgDividendYield":{"raw":2.43,"fmt":"2.43"},"beta":{"raw":1.173753,"fmt":"1.17"},"trailingPE":{"raw":148.82353,"fmt":"148.82"},"forwardPE":{"raw":20.294119,"fmt":"20.29"},"volume":{"raw":3372416,"fmt":"3.37M","longFmt":"3,372,416"},"regularMarketVolume":{"raw":3372416,"fmt":"3.37M","longFmt":"3,372,416"},"averageVolume":{"raw":4245485,"fmt":"4.25M","longFmt":"4,245,485"},"averageVolume10days":{"raw":3351485,"fmt":"3.35M","longFmt":"3,351,485"},"averageDailyVolume10Day":{"raw":3351485,"fmt":"3.35M","longFmt":"3,351,485"},"bid":{"raw":37.88,"fmt":"37.88"},"ask":{"raw":37.89,"fmt":"37.89"},"bidSize":{"raw":1100,"fmt":"1.1k","longFmt":"1,100"},"askSize":{"raw":800,"fmt":"800","longFmt":"800"},"marketCap":{"raw":28994179072,"fmt":"28.99B","longFmt":"28,994,179,072"},"yield":{},"ytdReturn":{},"totalAssets":{},"expireDate":{},"strikePrice":{},"openInterest":{},"fiftyTwoWeekLow":{"raw":17.44,"fmt":"17.44"},"fiftyTwoWeekHigh":{"raw":37.95,"fmt":"37.95"},"priceToSalesTrailing12Months":{"raw":2.6921244,"fmt":"2.69"},"fiftyDayAverage":{"raw":35.406857,"fmt":"35.41"},"twoHundredDayAverage":{"raw":31.052786,"fmt":"31.05"},"trailingAnnualDividendRate":{"raw":0.86,"fmt":"0.86"},"trailingAnnualDividendYield":{"raw":0.023168104,"fmt":"2.32%"},"navPrice":{},"currency":"USD","fromCurrency":null,"toCurrency":null,"lastMarket":null,"
volume24Hr":{},"volumeAllCurrencies":{},"circulatingSupply":{},"algorithm":null,"maxSupply":{},"startDate":{},"tradeable":false}}],"error":null}}
with python:
import requests
# Send a desktop-browser User-Agent with the request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
# The query string is passed inline; every parameter, including "region",
# must be introduced by a literal "&".
result = requests.get('https://query2.finance.yahoo.com/v10/finance/quoteSummary/GLW?formatted=true&crumb=8ldhetOu7RJ&lang=en-US&region=US&modules=summaryDetail&corsDomain=finance.yahoo.com', headers=headers)
# Parenthesised form prints the same single value on Python 2 and Python 3.
print(result.content)
response:
Traceback (most recent call last):
File "a.py", line 35, in <module>
response = urllib.request.urlopen(req, jsondataasbytes)
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 429: Too Many Requests
Ok, ok, solved by passing get parameters via params:
import requests
payload = {"modules": "summaryDetail"}
response = requests.get("https://query2.finance.yahoo.com/v10/finance/quoteSummary/GLW", params=payload)
print(response.json())
I was trying to get data from the following website, but I get the error shown below. Please find the code below.
from urllib2 import urlopen
import bs4 as bs
response = urlopen('http://www.mec.ac.in/mec/stats2018.php')
# Read the whole body once, then release the connection.
html = response.read()
response.close()
# Parse the bytes that were read: the response object is already exhausted
# after read(), so handing it to BeautifulSoup would give an empty document.
soup = bs.BeautifulSoup(html, 'lxml')
# Parenthesised form prints the same single value on Python 2 and Python 3.
print(soup.title)
Please find the error below:
Traceback (most recent call last):
File "et.py", line 3, in <module>
response = urlopen('http://www.mec.ac.in/mec/stats2018.php')
File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 435, in open
response = meth(req, response)
File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 548, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 473, in error
return self._call_chain(*args)
File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 407, in _call_chain
result = func(*args)
File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 556, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
How can I get past this error and retrieve the data?
The server specifically "blocks" requests with User-Agent header containing Python-urllib string (which urllib2/urllib sends by default):
In [1]: import requests
In [2]: url = "http://www.mec.ac.in/mec/stats2018.php"
In [3]: requests.get(url, headers={'User-Agent': 'Python-urllib/2.6'})
Out[3]: <Response [403]>
In [4]: requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'})
Out[4]: <Response [200]>
The following code:
req = urllib.request.Request(url=r"http://borel.slu.edu/cgi-bin/cc.cgi?foirm_ionchur=im&foirm=Seol&hits=1&format=xml",headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
handler = urllib.request.urlopen(req)
is giving me the following exception:
Traceback (most recent call last):
File "C:/Users/Foo/lang/old/test.py", line 46, in <module>
rip()
File "C:/Users/Foo/lang/old/test.py", line 36, in rip
handler = urllib.request.urlopen(req)
File "C:\Python32\lib\urllib\request.py", line 138, in urlopen
return opener.open(url, data, timeout)
File "C:\Python32\lib\urllib\request.py", line 375, in open
response = meth(req, response)
File "C:\Python32\lib\urllib\request.py", line 487, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python32\lib\urllib\request.py", line 413, in error
return self._call_chain(*args)
File "C:\Python32\lib\urllib\request.py", line 347, in _call_chain
result = func(*args)
File "C:\Python32\lib\urllib\request.py", line 495, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 500: Internal Server Error
But it works fine in my browser; what's the issue?
The server is rather b0rken. It responds with a 500 error in the browser as well.
You can catch the exception and still read the response:
import urllib.request
from urllib.error import HTTPError
# Build the request with an explicit browser User-Agent header.
req = urllib.request.Request(url=r"http://borel.slu.edu/cgi-bin/cc.cgi?foirm_ionchur=im&foirm=Seol&hits=1&format=xml",headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
try:
    handler = urllib.request.urlopen(req)
except HTTPError as e:
    # The HTTPError object can be read like a response, so the body the
    # server sent along with the error status is still available.
    content = e.read()
else:
    # Successful request: read the body so ``content`` is defined either way.
    content = handler.read()
When this happened to me, I reduced the plt.figure size parameter and it worked. It may be that some odd parameter in your code cannot be read.
I have been tasked with creating a script that logs on to a corporate portal goes to a particular page, downloads the page, compares it to an earlier version and then emails a certain person depending on changes that have been made. The last parts are easy enough but it has been the first step that is giving me the most trouble.
After unsuccessfully using urllib2 (I am trying to do this in Python) to connect, and about 4 or 5 hours of googling, I have determined that the reason I can't connect is the NTLM authentication on the web page. I have tried a bunch of different approaches for connecting found on this site and others, to no avail. Based on the NTLM example I have done:
import urllib2
from ntlm import HTTPNtlmAuthHandler
# Placeholder credentials and target URL; substitute real values before running.
# NOTE(review): python-ntlm's examples give the user as 'DOMAIN\\username' -- confirm whether a domain prefix is required here.
user = 'username'
password = "password"
url = "https://portal.whatever.com/"
# Register the credentials for this URL with the realm argument left as None.
passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
passman.add_password(None, url, user, password)
# create the NTLM authentication handler
auth_NTLM = HTTPNtlmAuthHandler.HTTPNtlmAuthHandler(passman)
# create and install the opener
opener = urllib2.build_opener(auth_NTLM)
urllib2.install_opener(opener)
# create a header
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
# NOTE(review): presumably 'Keep-alive' is set so the NTLM handshake can reuse one connection -- confirm.
header = { 'Connection' : 'Keep-alive', 'User-Agent' : user_agent}
# Send the request with the headers above; urlopen uses the opener installed earlier.
response = urllib2.urlopen(urllib2.Request(url, None, header))
When I run this (with a real username, password and url) I get the following:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "ntlm2.py", line 21, in <module>
response = urllib2.urlopen(urllib2.Request(url, None, header))
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 400, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 513, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 432, in error
result = self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 372, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 619, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\Python27\lib\urllib2.py", line 400, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 513, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 432, in error
result = self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 372, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 619, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\Python27\lib\urllib2.py", line 400, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 513, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 438, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 372, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 521, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 401: Unauthorized
The thing that is most interesting about this trace to me is that the final line says a 401 error was sent back. From what I have read, the 401 error is the first message sent back to the client when NTLM is started. I was under the impression that the purpose of python-ntlm was to handle the NTLM process for me. Is that wrong, or am I just using it incorrectly? Also, I'm not bound to using Python for this, so if there is an easier way to do this in another language, let me know (from what I have seen while googling, there isn't).
Thanks!
If the site is using NTLM authentication, the headers attribute of the resulting HTTPError should say so:
>>> try:
...     handle = urllib2.urlopen(req)
... except IOError, e:
...     print e.headers
...
<other headers>
WWW-Authenticate: Negotiate
WWW-Authenticate: NTLM