urllib.request- HTTPError: HTTP Error 403: Forbidden - python

I'm trying to automate downloads of specific part prices and quantities from Octopart using Python. I'm able to convert the csv file with the specific part numbers I want to look up into line items and queries, but keep getting an HTTPError message when I try to send the queries to REST API for part matching. I entered in my apikey but since it still doesn't connect, I'm wondering if I wrote the url incorrectly. Any guidance would be appreciated.
Code:
# Send queries to REST API for part matching.
import json
import urllib.parse
import urllib.request

results = []
for i in range(0, len(queries), 20):
    # Batch queries in groups of 20, query limit of
    # parts match endpoint
    batched_queries = queries[i: i + 20]
    url = 'http://octopart.com/api/v3/parts/match?queries=%s' \
        % urllib.parse.quote(json.dumps(batched_queries))
    url += '&apikey=eb49732b'
    # FIX: urlopen() returns an HTTPResponse object; json.loads() accepts
    # only str/bytes, so the response body must be read first.
    data = urllib.request.urlopen(url).read()
    response = json.loads(data)
    # Record results for analysis
    results.extend(response['results'])
Error:
HTTPError Traceback (most recent call last)
<ipython-input-43-cf5776fdc754> in <module>()
14 url = 'http://octopart.com/api/v3/parts/match?queries=%s' % urllib.parse.quote(json.dumps(batched_queries))
15 url += '&apikey=eb49732b'
---> 16 data = urllib.request.urlopen(url)
17 response = json.loads(data)
18
~\Documents\Software\Anaconda\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Documents\Software\Anaconda\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Documents\Software\Anaconda\lib\urllib\request.py in http_response(self, request, response)
639 if not (200 <= code < 300):
640 response = self.parent.error(
--> 641 'http', request, response, code, msg, hdrs)
642
643 return response
~\Documents\Software\Anaconda\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Documents\Software\Anaconda\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
~\Documents\Software\Anaconda\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Forbidden
Thank you for your help!

Check your API key, or contact Octopart and ask about permissions.
When I tried with curl sample using your key, it also fails with 403
$ curl -G https://octopart.com/api/v3/parts/match -d queries="[{\"mpn\":\"SN74S74N\"}]" \
-d apikey=eb49732b \
-d pretty_print=true
{
"__class__": "ClientErrorResponse",
"message": "Forbidden request"
}
However with EXAMPLE_KEY the query above succeeds

Try the following code with your api key... if it doesn't work then your key is probably invalidated.
# Demo request against the Octopart v3 parts/match endpoint, one query
# dict per line item (match by mpn, sku, mpn_or_sku, or brand+mpn).
import json
import urllib
import urllib.parse
import urllib.request

queries = [
    {'mpn': 'SN74S74N', 'reference': 'line1'},
    {'sku': '67K1122', 'reference': 'line2'},
    {'mpn_or_sku': 'SN74S74N', 'reference': 'line3'},
    {'brand': 'Texas Instruments', 'mpn': 'SN74S74N', 'reference': 'line4'},
]

# URL-encode the JSON-serialized query list into the queries parameter.
encoded_queries = urllib.parse.quote(json.dumps(queries))
url = 'http://octopart.com/api/v3/parts/match?queries=%s' % encoded_queries
url += "&include[]=specs"
# NOTE: Use your API key here (https://octopart.com/api/register)
url += '&apikey=<REPLACEME>'

payload = urllib.request.urlopen(url).read()
response = json.loads(payload)
# print request time (in milliseconds)
print("Response time: %s msec\n" % response['msec'])

Related

How to deal with the 'ascii' codec can't encode character '\xe9' error?

I am trying to download Excel files from the website. My code below:
# Scrape the Elections Ontario results page and download every linked
# .xlsx file.
import os
import requests
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin
import urllib

headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
# FIX: actually send the browser User-Agent (it was defined but unused).
resp = requests.get("https://www.elections.on.ca/en/resource-centre/elections-results.html#accordion2022ge",
                    headers=headers)
soup = BeautifulSoup(resp.text, "html.parser")
for link in soup.find_all('a', href=True):
    if 'xlsx' in link['href']:
        print(link['href'])
        # FIX: hrefs are already %-quoted but can still contain non-ASCII
        # chars (e.g. "Orléans"), which makes urlretrieve fail. Unquote
        # first, then re-quote so the accented chars get encoded too;
        # quoting directly would double-encode the %20s and 404.
        path = urllib.parse.quote(urllib.parse.unquote(link['href']))
        url = "https://www.elections.on.ca" + path
        file = url.split("/")[-1].split(".")[0] + ".xlsx"
        urllib.request.urlretrieve(url, file)
However, I get the following error when https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx is trying to be opened
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-9-e1694f5ee458> in <module>
8 file= url.split("/")[-1].split(".")[0]+".xlsx"
9 # print(file)
---> 10 urllib.request.urlretrieve(url, file)
...
UnicodeEncodeError: 'ascii' codec can't encode characters in position 101-102: ordinal not in range(128).
EDIT: I tried the safeStr solution form UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 20: ordinal not in range(128), but it does not work. Please see below:
def safeStr(obj):
    """Return str(obj) with all non-ASCII characters silently dropped.

    Falls back to an empty string if obj cannot be stringified at all.
    """
    try:
        return str(obj).encode('ascii', 'ignore').decode('ascii')
    # FIX: narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        return ""
# Demonstration of the safeStr attempt: stripping the accented char
# produces a URL the server does not recognize (hence the 404 below).
url = "https://www.elections.on.ca/" + '/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx'
print(url)
filename = url.split("/")[-1].split(".")[0] + ".xlsx"
url = safeStr(url)
print(url)
urllib.request.urlretrieve(url, filename)
The error I get is:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orlans%20076.xlsx
HTTPError Traceback (most recent call last)
<ipython-input-33-01070419a054> in <module>
6 print(url)
7 # print(file)
----> 8 urllib.request.urlretrieve(url, file)
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
I tried another solution from problem of urlretrieve cannot get image from url contains unicode string, but it also does not work:
url = "https://www.elections.on.ca/"+urllib.parse.quote('/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx')
#url = safeStr(url)
print(url)
urllib.request.urlretrieve(url, file)
The error I get is:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%2520Totals%2520From%2520Official%2520Tabulation%2520-%2520Orl%C3%A9ans%2520076.xlsx
HTTPError Traceback (most recent call last)
<ipython-input-56-cfce9d1344d0> in <module>
2 #url = safeStr(url)
3 print(url)
----> 4 urllib.request.urlretrieve(url, file)
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
I think this is a solution...
The problem is that the url you start with:
"https://www.elections.on.ca/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx"
is already url-quoted (e.g. spaces replaced by %20), but still contains non-ascii chars here Orléans
So the solution from this question will help us, but just applying urllib.parse.quote(...) results in twice-encoded spaces as %2520. That is why you get a 404 when requesting the processed url.
So first we need to unquote the url (i.e. %20 -> " "), then quote it again — this time the accented char will be quoted too and it should work.
Try this:
path = urllib.parse.quote(urllib.parse.unquote(link['href']))
url = "https://www.elections.on.ca" + path
The result we get is:
https://www.elections.on.ca/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orl%C3%A9ans%20076.xlsx
...should work now!

Python - bad gateway error based on API limit

I am trying to retrieve data from a CKAN API URL:
# Pull records from the CKAN datastore_search endpoint into a DataFrame.
import urllib.request
import json
import pandas as pd

url = 'https://data.gov.il/api/3/action/datastore_search?resource_id=dcf999c1-d394-4b57-a5e0-9d014a62e046&limit=1000000'
with urllib.request.urlopen(url) as response:
    body = response.read()
parsed = json.loads(body)
df = pd.DataFrame(parsed['result']['records'])
But getting the following error:
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
<ipython-input-44-8484123eecdc> in <module>
2 import pandas as pd
3 url = 'https://data.gov.il/api/3/action/datastore_search?resource_id=dcf999c1-d394-4b57-a5e0-9d014a62e046&limit=1000000'
----> 4 with urllib.request.urlopen(url) as response:
5 html = response.read()
6 result = json.loads(html)
~\miniconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\miniconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\miniconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\miniconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\miniconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\miniconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 502: Bad Gateway
Interestingly, if I use lower limit in the URL, e.g.:
url = 'https://...&limit=10000'
everything works fine. If I use no limit at all, it only retrieves the first 100 records.
Can anyone please explain why this is happening? Is this some server-side restriction? How can I go around this, so I can get the whole data set, regardless of how many records are included (there are frequent updates adding more records)?
Also, is this the right way to fetch data from a CKAN API? If not, I'd be glad to see how this should be done.
There are some limitations on the CKAN API: if you need to query more than 100 records, you need to set an offset and query as many times as you need, like pagination.

Python: HTTPError: HTTP Error 403: Bad Behavior

I am trying to read a web page to extract contents from it. Please find below the code.
# Fetch and parse the page; urlopen/BeautifulSoup are assumed imported.
url = "http://www.sanjamar.com/product-categories/bar/bar-tools/"
markup = urlopen(url).read()
soup = BeautifulSoup(markup)
print(soup)
The last time I used with a different website, it worked. This time its throwing the following error.
HTTPError Traceback (most recent call last)
<ipython-input-83-ccdefd422a61> in <module>()
1 url = "http://www.sanjamar.com/product-categories/bar/bar-tools/"
----> 2 html = urlopen(url).read()
3 soup = BeautifulSoup(html)
4 print(soup)
C:\Users\Santosh\Anaconda3\lib\urllib\request.py in urlopen(url, data,
timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):
C:\Users\Santosh\Anaconda3\lib\urllib\request.py in open(self, fullurl,
data, timeout)
530 for processor in self.process_response.get(protocol, []):
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533
534 return response
C:\Users\Santosh\Anaconda3\lib\urllib\request.py in http_response(self,
request, response)
640 if not (200 <= code < 300):
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643
644 return response
C:\Users\Santosh\Anaconda3\lib\urllib\request.py in error(self, proto, *
args)
568 if http_err:
569 args = (dict, 'default', 'http_error_default') +
orig_args
--> 570 return self._call_chain(*args)
571
572 # XXX probably also want an abstract factory that knows when it
makes
C:\Users\Santosh\Anaconda3\lib\urllib\request.py in _call_chain(self,
chain,
kind, meth_name, *args)
502 for handler in handlers:
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
506 return result
C:\Users\Santosh\Anaconda3\lib\urllib\request.py in http_error_default(self,
req, fp, code, msg, hdrs)
648 class HTTPDefaultErrorHandler(BaseHandler):
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
652 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Bad Behavior
I guess the issue is the website is blocking python. If not please let me know a solution.
Thanks

what have I done wrong parsing html with python urllib2 and beautifulsoup

Trying to scrape some links from google, and learn python
import urllib2
from bs4 import BeautifulSoup
# Fetch Google search results for "python" (Python 2 / urllib2 code).
# NOTE(review): Google rejects the default urllib2 User-Agent, which is
# what produces the 403 shown below.
response = urllib2.urlopen('http://www.google.com.au/search?q=python')
html = response.read()
print html
response.close()
What have I done wrong? I get the following error?
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
<ipython-input-4-d990999e71f4> in <module>()
9
10 import urllib2
---> 11 response = urllib2.urlopen('http://www.google.com.au/search?q=python')
12 html = response.read()
13 print html
C:\Python27\lib\urllib2.pyc in urlopen(url, data, timeout)
124 if _opener is None:
125 _opener = build_opener()
--> 126 return _opener.open(url, data, timeout)
127
128 def install_opener(opener):
C:\Python27\lib\urllib2.pyc in open(self, fullurl, data, timeout)
395 for processor in self.process_response.get(protocol, []):
396 meth = getattr(processor, meth_name)
--> 397 response = meth(req, response)
398
399 return response
C:\Python27\lib\urllib2.pyc in http_response(self, request, response)
508 if not (200 <= code < 300):
509 response = self.parent.error(
--> 510 'http', request, response, code, msg, hdrs)
511
512 return response
C:\Python27\lib\urllib2.pyc in error(self, proto, *args)
433 if http_err:
434 args = (dict, 'default', 'http_error_default') + orig_args
--> 435 return self._call_chain(*args)
436
437 # XXX probably also want an abstract factory that knows when it makes
C:\Python27\lib\urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
367 func = getattr(handler, meth_name)
368
--> 369 result = func(*args)
370 if result is not None:
371 return result
C:\Python27\lib\urllib2.pyc in http_error_default(self, req, fp, code, msg, hdrs)
516 class HTTPDefaultErrorHandler(BaseHandler):
517 def http_error_default(self, req, fp, code, msg, hdrs):
--> 518 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
519
520 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Forbidden
It looks like Google does not allow that type of request.
Try Requests or mechanize instead — with either one you can easily
manipulate your request headers (user agent, etc.).
Check which is easier and more appropriate for you.

503 error when trying to access Google Patents using python

Earlier today I was able to pull data from Google Patents using the code below
import urllib2
# Google Patents inventor search (Python 2 / urllib2 code); a custom
# User-Agent is supplied, but repeated automated queries still trigger
# the 503 shown below.
url = 'http://www.google.com/search?tbo=p&q=ininventor:"John-Mudd"&hl=en&tbm=pts&source=lnt&tbs=ptso:us'
req = urllib2.Request(url, headers={'User-Agent' : "foobar"})
response = urllib2.urlopen(req)
Now when I go to run it I get the following 503 error. I had only looped through this code maybe 30 times on it (i'm trying to get all the patents owned by a list of 30 people).
HTTPError Traceback (most recent call last)
<ipython-input-4-01f83e2c218f> in <module>()
----> 1 response = urllib2.urlopen(req)
C:\Python27\lib\urllib2.pyc in urlopen(url, data, timeout)
124 if _opener is None:
125 _opener = build_opener()
--> 126 return _opener.open(url, data, timeout)
127
128 def install_opener(opener):
C:\Python27\lib\urllib2.pyc in open(self, fullurl, data, timeout)
404 for processor in self.process_response.get(protocol, []):
405 meth = getattr(processor, meth_name)
--> 406 response = meth(req, response)
407
408 return response
C:\Python27\lib\urllib2.pyc in http_response(self, request, response)
517 if not (200 <= code < 300):
518 response = self.parent.error(
--> 519 'http', request, response, code, msg, hdrs)
520
521 return response
C:\Python27\lib\urllib2.pyc in error(self, proto, *args)
436 http_err = 0
437 args = (dict, proto, meth_name) + args
--> 438 result = self._call_chain(*args)
439 if result:
440 return result
C:\Python27\lib\urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
376 func = getattr(handler, meth_name)
377
--> 378 result = func(*args)
379 if result is not None:
380 return result
C:\Python27\lib\urllib2.pyc in http_error_302(self, req, fp, code, msg, headers)
623 fp.close()
624
--> 625 return self.parent.open(new, timeout=req.timeout)
626
627 http_error_301 = http_error_303 = http_error_307 = http_error_302
C:\Python27\lib\urllib2.pyc in open(self, fullurl, data, timeout)
404 for processor in self.process_response.get(protocol, []):
405 meth = getattr(processor, meth_name)
--> 406 response = meth(req, response)
407
408 return response
C:\Python27\lib\urllib2.pyc in http_response(self, request, response)
517 if not (200 <= code < 300):
518 response = self.parent.error(
--> 519 'http', request, response, code, msg, hdrs)
520
521 return response
C:\Python27\lib\urllib2.pyc in error(self, proto, *args)
442 if http_err:
443 args = (dict, 'default', 'http_error_default') + orig_args
--> 444 return self._call_chain(*args)
445
446 # XXX probably also want an abstract factory that knows when it makes
C:\Python27\lib\urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
376 func = getattr(handler, meth_name)
377
--> 378 result = func(*args)
379 if result is not None:
380 return result
C:\Python27\lib\urllib2.pyc in http_error_default(self, req, fp, code, msg, hdrs)
525 class HTTPDefaultErrorHandler(BaseHandler):
526 def http_error_default(self, req, fp, code, msg, hdrs):
--> 527 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
528
529 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 503: Service Unavailable
Google's TOS bans automated queries, sadly enough. It almost certainly detected that you were "up to no good."
source: https://support.google.com/websearch/answer/86640?hl=en
Shot in the dark guess:
Did you look to see if there was a "Retry-After header" in the response. It's a real possibility with 503.
From RFC 2616:
14.37 Retry-After
The Retry-After response-header field can be used with a 503 (Service
Unavailable) response to indicate how long the service is expected to
be unavailable to the requesting client. This field MAY also be used
with any 3xx (Redirection) response to indicate the minimum time the
user-agent is asked wait before issuing the redirected request. The
value of this field can be either an HTTP-date or an integer number of
seconds (in decimal) after the time of the response.
Retry-After = "Retry-After" ":" ( HTTP-date | delta-seconds )
Two examples of its use are
Retry-After: Fri, 31 Dec 1999 23:59:59 GMT
Retry-After: 120
In the latter example, the delay is 2 minutes.

Categories