Why does a request work when I call a website with curl, but always return 429 when I make the same call from Python? I have tried setting many different user-agents, cookies, etc.
curl call:
# Query the quoteSummary endpoint for GLW; parameters are joined with '&' (note '&region=US').
curl "https://query2.finance.yahoo.com/v10/finance/quoteSummary/GLW?formatted=true&crumb=8ldhetOu7RJ&lang=en-US&region=US&modules=summaryDetail&corsDomain=finance.yahoo.com"
response: {"quoteSummary":{"result":[{"summaryDetail":{"maxAge":1,"priceHint":{"raw":2,"fmt":"2","longFmt":"2"},"previousClose":{"raw":37.12,"fmt":"37.12"},"open":{"raw":37.19,"fmt":"37.19"},"dayLow":{"raw":37.12,"fmt":"37.12"},"dayHigh":{"raw":37.95,"fmt":"37.95"},"regularMarketPreviousClose":{"raw":37.12,"fmt":"37.12"},"regularMarketOpen":{"raw":37.19,"fmt":"37.19"},"regularMarketDayLow":{"raw":37.12,"fmt":"37.12"},"regularMarketDayHigh":{"raw":37.95,"fmt":"37.95"},"dividendRate":{"raw":0.88,"fmt":"0.88"},"dividendYield":{"raw":0.0232,"fmt":"2.32%"},"exDividendDate":{"raw":1605139200,"fmt":"2020-11-12"},"payoutRatio":{"raw":3.3077,"fmt":"330.77%"},"fiveYearAvgDividendYield":{"raw":2.43,"fmt":"2.43"},"beta":{"raw":1.173753,"fmt":"1.17"},"trailingPE":{"raw":148.82353,"fmt":"148.82"},"forwardPE":{"raw":20.294119,"fmt":"20.29"},"volume":{"raw":3372416,"fmt":"3.37M","longFmt":"3,372,416"},"regularMarketVolume":{"raw":3372416,"fmt":"3.37M","longFmt":"3,372,416"},"averageVolume":{"raw":4245485,"fmt":"4.25M","longFmt":"4,245,485"},"averageVolume10days":{"raw":3351485,"fmt":"3.35M","longFmt":"3,351,485"},"averageDailyVolume10Day":{"raw":3351485,"fmt":"3.35M","longFmt":"3,351,485"},"bid":{"raw":37.88,"fmt":"37.88"},"ask":{"raw":37.89,"fmt":"37.89"},"bidSize":{"raw":1100,"fmt":"1.1k","longFmt":"1,100"},"askSize":{"raw":800,"fmt":"800","longFmt":"800"},"marketCap":{"raw":28994179072,"fmt":"28.99B","longFmt":"28,994,179,072"},"yield":{},"ytdReturn":{},"totalAssets":{},"expireDate":{},"strikePrice":{},"openInterest":{},"fiftyTwoWeekLow":{"raw":17.44,"fmt":"17.44"},"fiftyTwoWeekHigh":{"raw":37.95,"fmt":"37.95"},"priceToSalesTrailing12Months":{"raw":2.6921244,"fmt":"2.69"},"fiftyDayAverage":{"raw":35.406857,"fmt":"35.41"},"twoHundredDayAverage":{"raw":31.052786,"fmt":"31.05"},"trailingAnnualDividendRate":{"raw":0.86,"fmt":"0.86"},"trailingAnnualDividendYield":{"raw":0.023168104,"fmt":"2.32%"},"navPrice":{},"currency":"USD","fromCurrency":null,"toCurrency":null,"lastMarket":null,"
volume24Hr":{},"volumeAllCurrencies":{},"circulatingSupply":{},"algorithm":null,"maxSupply":{},"startDate":{},"tradeable":false}}],"error":null}}
with python:
import requests
# Send a desktop Chrome User-Agent string instead of the default one used by requests.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
# Same URL as the curl call; every query parameter is separated by '&'
# (including '&region=US').
result = requests.get('https://query2.finance.yahoo.com/v10/finance/quoteSummary/GLW?formatted=true&crumb=8ldhetOu7RJ&lang=en-US&region=US&modules=summaryDetail&corsDomain=finance.yahoo.com', headers=headers)
# The call form of print works on both Python 2 and Python 3.
print(result.content)
response:
Traceback (most recent call last):
File "a.py", line 35, in <module>
response = urllib.request.urlopen(req, jsondataasbytes)
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 429: Too Many Requests
OK, solved by passing the GET parameters via the params argument:
import requests
# Keep the endpoint free of a hand-written query string and let requests
# build and encode it from the mapping passed as `params`.
url = "https://query2.finance.yahoo.com/v10/finance/quoteSummary/GLW"
payload = dict(modules="summaryDetail")
response = requests.get(url, params=payload)
# Decode the JSON body before printing it.
quote_summary = response.json()
print(quote_summary)
Related
Today I tried to make simple requests to a website using custom headers that I defined in a separate file called useragents.txt. I have now wasted a lot of time trying to get it to work. The issue is that Python won't request the site and instead raises a ValueError: Invalid header value b'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36\n'
I'm not sure why it adds a b' and a \n there. If I print the variable, the output does not contain these symbols. Here is a little code that might show better what I mean:
def get_soup(url, header):
    """Fetch *url* and return its body parsed as a BeautifulSoup document.

    Sleeps for a randomly chosen 1, 2 or 3 seconds before sending the request.

    Args:
        url: Address of the page to download.
        header: Mapping of HTTP header names to values, passed to ``Request``.

    Returns:
        A ``BeautifulSoup`` object built with the ``"html.parser"`` parser.
    """
    time.sleep(random.choice([1, 2, 3]))
    # Close the HTTP response once parsing is done instead of leaking the
    # connection; BeautifulSoup reads the stream inside its constructor.
    with urlopen(Request(url, headers=header)) as response:
        return BeautifulSoup(response, "html.parser")
# Load the candidate User-Agent strings, one per line of the file.
with open("useragents.txt", "r") as user_agents_file:
    # Strip surrounding whitespace and drop blank lines so that a stray
    # newline or an empty line can never end up in an HTTP header value.
    user_agents_lines = [
        line.strip() for line in user_agents_file if line.strip()
    ]
print(user_agents_lines)
# Pick one User-Agent at random for this run.
user_agent = random.choice(user_agents_lines)
print(f"USER-AGENT: {user_agent}")
The Full error is:
Traceback (most recent call last):
File "D:\my_python_projects\testingcodes\firstcode\main.py", line 118, in <module>
scraper() # run the function
File "D:\my_python_projects\testingcodes\firstcode\main.py", line 69, in scraper
soup = get_soup(surveyingurl, testheader)
File "D:\my_python_projects\testingcodes\firstcode\main.py", line 43, in get_soup
return BeautifulSoup(urlopen(Request(url, headers=header)), "html.parser")
File "D:\Downloads\Python\python 3.9.6\lib\urllib\request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "D:\Downloads\Python\python 3.9.6\lib\urllib\request.py", line 517, in open
response = self._open(req, data)
File "D:\Downloads\Python\python 3.9.6\lib\urllib\request.py", line 534, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "D:\Downloads\Python\python 3.9.6\lib\urllib\request.py", line 494, in _call_chain
result = func(*args)
File "D:\Downloads\Python\python 3.9.6\lib\urllib\request.py", line 1389, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "D:\Downloads\Python\python 3.9.6\lib\urllib\request.py", line 1346, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "D:\Downloads\Python\python 3.9.6\lib\http\client.py", line 1257, in request
self._send_request(method, url, body, headers, encode_chunked)
File "D:\Downloads\Python\python 3.9.6\lib\http\client.py", line 1298, in _send_request
self.putheader(hdr, value)
File "D:\Downloads\Python\python 3.9.6\lib\http\client.py", line 1235, in putheader
raise ValueError('Invalid header value %r' % (values[i],))
ValueError: Invalid header value b'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36\n'
Process finished with exit code 1
Below a screenshot that shows how my useragents.txt file looks like:
# Load the candidate User-Agent strings, one per line of the file.
with open("useragents.txt", "r") as user_agents_file:
    user_agents_lines = user_agents_file.read().splitlines()
print(user_agents_lines)
user_agent = random.choice(user_agents_lines)
# The file is opened in text mode, so every line is a str; passing bytes
# arguments to str.replace() raises TypeError. strip() removes any leftover
# line-ending characters and surrounding spaces.
user_agent = user_agent.strip()
print(f"USER-AGENT: {user_agent}")
I was trying to get data from the following website but I get the error which is shown below. PFB the code for the same.
from urllib2 import urlopen
import bs4 as bs
# Download the page; urlopen raises HTTPError for error statuses such as 403.
response = urlopen('http://www.mec.ac.in/mec/stats2018.php')
html = response.read()
response.close()
# Parse the bytes that were already read: the response stream is exhausted
# after read(), so passing `response` itself would give an empty document.
soup = bs.BeautifulSoup(html, 'lxml')
# The call form of print works on both Python 2 and Python 3.
print(soup.title)
PFB the error:
Traceback (most recent call last):
File "et.py", line 3, in <module>
response = urlopen('http://www.mec.ac.in/mec/stats2018.php')
File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 435, in open
response = meth(req, response)
File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 548, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 473, in error
return self._call_chain(*args)
File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 407, in _call_chain
result = func(*args)
File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 556, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
How can I get past this error and retrieve the data?
The server specifically "blocks" requests with User-Agent header containing Python-urllib string (which urllib2/urllib sends by default):
In [1]: import requests
In [2]: url = "http://www.mec.ac.in/mec/stats2018.php"
In [3]: requests.get(url, headers={'User-Agent': 'Python-urllib/2.6'})
Out[3]: <Response [403]>
In [4]: requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'})
Out[4]: <Response [200]>
I am following Bokeh's User Guide.
In "Embedding Bokeh Server as a Library" at http://docs.bokeh.org/en/latest/docs/user_guide/server.html#embedding-bokeh-server-as-a-library
it refers to a demo where a Bokeh server is embedded in Flask (https://github.com/bokeh/bokeh/blob/0.12.6/examples/howto/server_embed/flask_embed.py)
It should be straightforward, but I get a Tornado error when I launch it with python flask_embed.py. Does anybody have an idea why?
The page opens correctly in the browser, but there is no plot.
This is the short error message:
ERROR:tornado.application:Uncaught exception GET /bkapp/autoload.js?bokeh-autoload-element=3a711948-3668-4f63-8d0c-8cd1584fb92d&bokeh-app-path=/bkapp&bokeh-absolute-url=http://localhost:5006/bkapp (127.0.0.1)
HTTPServerRequest(protocol='http', host='localhost:5006', method='GET', uri='/bkapp/autoload.js?bokeh-autoload-element=3a711948-3668-4f63-8d0c-8cd1584fb92d&bokeh-app-path=/bkapp&bokeh-absolute-url=http://localhost:5006/bkapp', version='HTTP/1.1', remote_ip='127.0.0.1', headers={'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Host': 'localhost:5006', 'Accept': '*/*', 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0', 'Connection': 'keep-alive', 'Referer': 'http://localhost:8080/', 'Cookie': 'username-localhost-8888="2|1:0|10:1501067928|23:username-localhost-8888|44:Y2EwOTUzN2YzNWRiNGQyMDgxZWEyOGMzZDJkOTI4ZWY=|f4f981dd915dc777c70e605b7135bcbbc076b3fe3482999e5ca557cb4abd518e"; _xsrf=2|c711b8e7|f913ccc5c9cc32532c1e67bbd75b6051|1500889250'})
...
HTTPError: HTTP Error 400: Bad Request
ERROR:tornado.access:500 GET /bkapp/autoload.js?bokeh-autoload-element=3a711948-3668-4f63-8d0c-8cd1584fb92d&bokeh-app-path=/bkapp&bokeh-absolute-url=http://localhost:5006/bkapp (127.0.0.1)
And here the whole traceback:
Opening Flask app with embedded Bokeh application on http://localhost:8080/
ERROR:tornado.application:Uncaught exception GET /bkapp/autoload.js?bokeh-autoload-element=3a711948-3668-4f63-8d0c-8cd1584fb92d&bokeh-app-path=/bkapp&bokeh-absolute-url=http://localhost:5006/bkapp (127.0.0.1)
HTTPServerRequest(protocol='http', host='localhost:5006', method='GET', uri='/bkapp/autoload.js?bokeh-autoload-element=3a711948-3668-4f63-8d0c-8cd1584fb92d&bokeh-app-path=/bkapp&bokeh-absolute-url=http://localhost:5006/bkapp', version='HTTP/1.1', remote_ip='127.0.0.1', headers={'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Host': 'localhost:5006', 'Accept': '*/*', 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0', 'Connection': 'keep-alive', 'Referer': 'http://localhost:8080/', 'Cookie': 'username-localhost-8888="2|1:0|10:1501067928|23:username-localhost-8888|44:Y2EwOTUzN2YzNWRiNGQyMDgxZWEyOGMzZDJkOTI4ZWY=|f4f981dd915dc777c70e605b7135bcbbc076b3fe3482999e5ca557cb4abd518e"; _xsrf=2|c711b8e7|f913ccc5c9cc32532c1e67bbd75b6051|1500889250'})
Traceback (most recent call last):
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/web.py", line 1511, in _execute
result = yield result
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/gen.py", line 1055, in run
value = future.result()
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/concurrent.py", line 238, in result
raise_exc_info(self._exc_info)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/gen.py", line 1063, in run
yielded = self.gen.throw(*exc_info)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/bokeh/server/views/autoload_js_handler.py", line 31, in get
session = yield self.get_session()
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/gen.py", line 1055, in run
value = future.result()
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/concurrent.py", line 238, in result
raise_exc_info(self._exc_info)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/gen.py", line 1063, in run
yielded = self.gen.throw(*exc_info)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/bokeh/server/views/session_handler.py", line 40, in get_session
session = yield self.application_context.create_session_if_needed(session_id, self.request)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/gen.py", line 1055, in run
value = future.result()
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/concurrent.py", line 238, in result
raise_exc_info(self._exc_info)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/gen.py", line 1069, in run
yielded = self.gen.send(value)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/bokeh/server/application_context.py", line 177, in create_session_if_needed
self._application.initialize_document(doc)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/bokeh/application/application.py", line 121, in initialize_document
h.modify_document(doc)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/bokeh/application/handlers/function.py", line 16, in modify_document
self._func(doc)
File "main.py", line 22, in modify_doc
df = pd.read_csv(data_url, parse_dates=True, index_col=0)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/pandas/io/parsers.py", line 655, in parser_f
return _read(filepath_or_buffer, kwds)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/pandas/io/parsers.py", line 392, in _read
filepath_or_buffer, encoding, compression)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/pandas/io/common.py", line 186, in get_filepath_or_buffer
req = _urlopen(url)
File "/usr/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 435, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 548, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 473, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 407, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 556, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
HTTPError: HTTP Error 400: Bad Request
ERROR:tornado.access:500 GET /bkapp/autoload.js?bokeh-autoload-element=3a711948-3668-4f63-8d0c-8cd1584fb92d&bokeh-app-path=/bkapp&bokeh-absolute-url=http://localhost:5006/bkapp (127.0.0.1) 425.75ms
When the page is served, the server tries to load CSV data from an external URL using Pandas. I'm not sure whether this example worked before, but right now it seems that pd.read_csv does not percent-encode the URL query, so the remote server cannot handle the raw > and < characters in it. You can either replace the characters manually (> becomes %3E and < becomes %3C; refer to https://en.wikipedia.org/wiki/Percent-encoding) or use a library for it, such as the quoting helpers in Python's urllib (urllib.quote in Python 2, urllib.parse.quote in Python 3).
import urllib2
def GetBrowserHtml_content(url):
    """Download *url* with browser-like request headers and return the body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body exactly as returned by ``response.read()``.

    Raises:
        urllib2.HTTPError: If the server answers with an error status.
    """
    # No 'Referer' entry: None is not a meaningful header value (Python 2's
    # httplib would send it as the literal text "None").
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8,gbk;q=0.7,*;q=0.3',
        'Connection': 'close',
    }
    req_timeout = 5  # seconds, passed as the urlopen timeout
    request = urllib2.Request(url, None, req_header)
    response = urllib2.urlopen(request, None, req_timeout)
    try:
        return response.read()
    finally:
        # Always release the connection, even if reading the body fails.
        response.close()
# Fetch the page at this URL with GetBrowserHtml_content and keep its body.
url = 'http://www.ccdi.gov.cn/jlsc/index_4.html'
html_content = GetBrowserHtml_content(url)
I have a piece of code like the one above.
When I run it, I get the following error.
Traceback (most recent call last):
File "E:/Programming/python/CWSeg/spider/hahahha.py", line 34, in <module>
html_content = GetBrowserHtml_content(url)
File "E:/Programming/python/CWSeg/spider/hahahha.py", line 22, in GetBrowserHtml_content
response = urllib2.urlopen(request,None,req_timeout)
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 406, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 519, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 444, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 378, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 527, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 521:
Can anyone point out what I'm doing wrong? Thanks in advance.
The following code:
# Build the request in named steps: target URL, Firefox User-Agent header,
# then the Request object that urlopen sends.
target_url = r"http://borel.slu.edu/cgi-bin/cc.cgi?foirm_ionchur=im&foirm=Seol&hits=1&format=xml"
request_headers = {'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'}
req = urllib.request.Request(url=target_url, headers=request_headers)
handler = urllib.request.urlopen(req)
is giving me the following exception:
Traceback (most recent call last):
File "C:/Users/Foo/lang/old/test.py", line 46, in <module>
rip()
File "C:/Users/Foo/lang/old/test.py", line 36, in rip
handler = urllib.request.urlopen(req)
File "C:\Python32\lib\urllib\request.py", line 138, in urlopen
return opener.open(url, data, timeout)
File "C:\Python32\lib\urllib\request.py", line 375, in open
response = meth(req, response)
File "C:\Python32\lib\urllib\request.py", line 487, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python32\lib\urllib\request.py", line 413, in error
return self._call_chain(*args)
File "C:\Python32\lib\urllib\request.py", line 347, in _call_chain
result = func(*args)
File "C:\Python32\lib\urllib\request.py", line 495, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 500: Internal Server Error
but it works fine in my browser; what's the issue?
The server is rather broken: it responds with a 500 error in the browser as well.
You can catch the exception and still read the response:
import urllib.request
from urllib.error import HTTPError
req = urllib.request.Request(url=r"http://borel.slu.edu/cgi-bin/cc.cgi?foirm_ionchur=im&foirm=Seol&hits=1&format=xml",headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
try:
    handler = urllib.request.urlopen(req)
except HTTPError as e:
    # HTTPError doubles as a file-like response, so the body the server sent
    # along with the error status can still be read.
    content = e.read()
else:
    # Success path: read the body here too so `content` is always defined.
    content = handler.read()
When this happened to me, I reduced the plt.figure size parameter and it worked. There may be some odd parameter in your code that cannot be read.