I ran into a UnicodeEncodeError while crawling Wikipedia pages through the API and saving each one as a JSON file.
Here are my code snippet and the error message.
It seems the character 'é' causes this problem, but I do not know how to solve it.
import urllib2
import json

# List of philosopher's name: mergel list
# print mergel
i = 0
for name in mergel:
    # Use the API to get the page content in a format that we like.
    # https://en.wikipedia.org/w/api.php?action=query&titles=Spider-Man&prop=revisions&rvprop=content&format=json
    # set the parameters (https://www.mediawiki.org/wiki/API:Tutorial)
    i = i + 1
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    titlename = name.replace(" ", "_")
    print titlename
    title = "titles=" + titlename
    content = "prop=revisions&rvprop=content"
    dataformat = "format=json"
    # construct the query
    query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
    print query
    wikiresponse = urllib2.urlopen(query)
    wikisource = wikiresponse.read()
    # print wikisource
    wikijson = json.loads(wikisource)
    jsonfilename = './json/' + titlename + '.json'
    with open(jsonfilename, 'w') as outfile:
        json.dump(wikijson, outfile)
Error message:
Tenzin_Gyatso
https://en.wikipedia.org/w/api.php?action=query&titles=Tenzin_Gyatso&prop=revisions&rvprop=content&format=json
Claude_Lévi-Strauss
https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-203-8430fc805550> in <module>()
21 query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
22 print query
---> 23 wikiresponse = urllib2.urlopen(query)
24 wikisource = wikiresponse.read()
25 # print wikisource
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
152 else:
153 opener = _opener
--> 154 return opener.open(url, data, timeout)
155
156 def install_opener(opener):
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
429 req = meth(req)
430
--> 431 response = self._open(req, data)
432
433 # post-process response
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _open(self, req, data)
447 protocol = req.get_type()
448 result = self._call_chain(self.handle_open, protocol, protocol +
--> 449 '_open', req)
450 if result:
451 return result
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
407 func = getattr(handler, meth_name)
408
--> 409 result = func(*args)
410 if result is not None:
411 return result
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in https_open(self, req)
1238 def https_open(self, req):
1239 return self.do_open(httplib.HTTPSConnection, req,
-> 1240 context=self._context)
1241
1242 https_request = AbstractHTTPHandler.do_request_
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in do_open(self, http_class, req, **http_conn_args)
1192
1193 try:
-> 1194 h.request(req.get_method(), req.get_selector(), req.data, headers)
1195 except socket.error, err: # XXX what error?
1196 h.close()
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in request(self, method, url, body, headers)
1051 def request(self, method, url, body=None, headers={}):
1052 """Send a complete request to the server."""
-> 1053 self._send_request(method, url, body, headers)
1054
1055 def _set_content_length(self, body, method):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_request(self, method, url, body, headers)
1091 for hdr, value in headers.iteritems():
1092 self.putheader(hdr, value)
-> 1093 self.endheaders(body)
1094
1095 def getresponse(self, buffering=False):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in endheaders(self, message_body)
1047 else:
1048 raise CannotSendHeader()
-> 1049 self._send_output(message_body)
1050
1051 def request(self, method, url, body=None, headers={}):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_output(self, message_body)
891 msg += message_body
892 message_body = None
--> 893 self.send(msg)
894 if message_body is not None:
895 #message_body was not a string (i.e. it is a file) and
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in send(self, data)
867 datablock = data.read(blocksize)
868 else:
--> 869 self.sock.sendall(data)
870
871 def _output(self, s):
/Users/sundong/anaconda/lib/python2.7/ssl.pyc in sendall(self, data, flags)
719 count = 0
720 while (count < amount):
--> 721 v = self.send(data[count:])
722 count += v
723 return amount
/Users/sundong/anaconda/lib/python2.7/ssl.pyc in send(self, data, flags)
685 self.__class__)
686 try:
--> 687 v = self._sslobj.write(data)
688 except SSLError as x:
689 if x.args[0] == SSL_ERROR_WANT_READ:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 43: ordinal not in range(128)
However, the simple and direct code below, which does not take the title from a list, works without any issues.
import urllib2
import json
query = 'https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json'
wikiresponse = urllib2.urlopen(query)
wikisource = wikiresponse.read()
wikijson = json.loads(wikisource)
jsonfilename = './json/'+'Claude_Lévi-Strauss'+'.json'
with open(jsonfilename, 'w') as outfile:
    json.dump(wikijson, outfile)
Don't mix Unicode and bytestrings: use Unicode strings to work with text in Python.
Don't create URLs by hand; use urllib functions such as quote() and urlencode() (a small urlencode() sketch follows below). Also, consider functions from the urlparse module such as urljoin() and urlunsplit().
You've already requested the json format; there is no need to parse it only to dump it back immediately in the same format. You could use shutil.copyfileobj() to copy the file-like objects directly, and check the result file later to make sure it has been downloaded correctly.
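Here is that small urlencode() sketch (Python 2; the parameter names are the ones from the question's query):
# -*- coding: utf-8 -*-
from urllib import urlencode

name = u"Claude Lévi-Strauss"
params = {
    'action': 'query',
    'prop': 'revisions',
    'rvprop': 'content',
    'format': 'json',
    'titles': name.encode('utf-8'),  # encode Unicode text to bytes before URL-encoding
}
query = "https://en.wikipedia.org/w/api.php?" + urlencode(params)
# urlencode() percent-encodes the bytes and uses '+' for spaces, which the
# server-side query-string parser reads as a space.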
Putting it all together, here's how to save a wiki-page with a given title to a file in JSON format:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
from contextlib import closing
from urllib import quote
from urllib2 import urlopen
from shutil import copyfileobj
def urlretrieve(url, filename, chunksize=8096):
    with closing(urlopen(url)) as response, open(filename, 'wb') as file:
        copyfileobj(response, file, chunksize)

#XXX for name in mergel:
name = u"Claude Lévi-Strauss"  # NOTE: Unicode string
urlretrieve("https://en.wikipedia.org/w/api.php?"
            "action=query&prop=revisions&rvprop=content&format=json&"
            "titles=" + quote(name.encode('utf-8')),
            os.path.join('json', name + '.json'))
Note:
you don't need to .replace(' ', '_') in this case
the os.path.join('json', name + '.json') line mixes bytestrings ('json', '.json') and Unicode (type(name) == unicode). It is OK here because both 'json' and '.json' are ascii-only literals in the source code
the # -*- coding: utf-8 -*- encoding declaration affects only characters that appear literally in your Python source code; it is coincidental that the query string also uses the same encoding in this particular case. The encoding of your source code has no relation to the character encoding that might be used for filenames, to transfer data over HTTP, or to write Unicode text to a terminal (all of these encodings may differ from one another)
in principle, you could have used urllib.urlretrieve(url, filename) here instead of urlopen() + copyfileobj(), but urllib.urlretrieve()'s behavior differs from urllib2.urlopen()'s on Python 2
Here's the same code using requests:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
from urllib import quote
import requests # $ pip install requests
def urlretrieve(url, filename, chunksize=8096):
    r = requests.get(url, stream=True)
    r.raise_for_status()  # raise on http error
    with open(filename, 'wb') as f:
        for chunk in r.iter_content(chunksize):
            f.write(chunk)

#XXX for name in mergel:
name = u"Claude Lévi-Strauss"  # NOTE: Unicode string
urlretrieve("https://en.wikipedia.org/w/api.php?"
            "action=query&prop=revisions&rvprop=content&format=json&"
            "titles=" + quote(name.encode('utf-8')),
            os.path.join('json', name + '.json'))
Regarding "the simple & direct code without getting a title from a list that just works without any issues":
Your code uses non-ascii bytestring literals (which are illegal in Python 3). There is no encoding error because all of the data is already bytes. The issue with using bytestrings is that it breaks when different environments use different character encodings, and they do (you can't expect that everything uses utf-8, however desirable that might be). Also, the query part should be properly percent-encoded, e.g. é should be sent as '%C3%A9'.
Unrelated: to download several web-pages at once, you could use a thread pool:
from multiprocessing.dummy import Pool  # use threads

def download(name):
    urlretrieve("https://en.wikipedia.org/w/api.php?"
                "action=query&prop=revisions&rvprop=content&format=json&"
                "titles=" + quote(name.encode('utf-8')),
                os.path.join('json', name + '.json'))

pool = Pool(4)  # download 4 titles concurrently
for _ in pool.imap_unordered(download, mergel, chunksize=100):
    pass
It is polite to set the maxlag query parameter and to respect the Retry-After HTTP header; a small sketch follows. There are several wrappers around the Wikipedia API that may do this for you.
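A minimal sketch of that, reusing the requests-based approach above; the maxlag value of 5 and the retry cap are arbitrary choices, and the API typically sets a Retry-After header when it rejects a request because of replication lag:
import time
import requests

def polite_get(url, max_retries=5):
    # Ask the API to fail fast if replication lag exceeds 5 seconds, then
    # back off for as long as the Retry-After header suggests.
    for _ in range(max_retries):
        r = requests.get(url + "&maxlag=5", stream=True)
        retry_after = r.headers.get('Retry-After')
        if retry_after is None:
            r.raise_for_status()
            return r
        time.sleep(int(retry_after))
    raise RuntimeError("server still lagged after %d attempts" % max_retries)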
Related
I am trying to get the HTML content of a page with requests, but it results in a UnicodeDecodeError. The reproducible code:
import requests
import urllib
url = "https://www.unique.nl/vacature/coördinator-facilitair-(v2037635)"
Attempt 1:
requests.get(url)
Attempt 2:
requests.get(requests.utils.requote_uri(url))
Both result in UnicodeDecodeError
Attempt 3:
requests.get(urllib.parse.quote(url))
Attempt 4:
requests.get(urllib.parse.quote(url.encode("Latin-1"), ":/"))
What am I missing here? Encoding the URL to utf-8, latin1, or unicode_escape does not work either.
Full error message:
File /usr/local/lib/python3.9/site-packages/requests/api.py:75, in get(url, params, **kwargs)
64 def get(url, params=None, **kwargs):
65 r"""Sends a GET request.
66
67 :param url: URL for the new :class:`Request` object.
(...)
72 :rtype: requests.Response
73 """
---> 75 return request('get', url, params=params, **kwargs)
File /usr/local/lib/python3.9/site-packages/requests/api.py:61, in request(method, url, **kwargs)
57 # By using the 'with' statement we are sure the session is closed, thus we
58 # avoid leaving sockets open which can trigger a ResourceWarning in some
59 # cases, and look like a memory leak in others.
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
File /usr/local/lib/python3.9/site-packages/requests/sessions.py:542, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
537 send_kwargs = {
538 'timeout': timeout,
539 'allow_redirects': allow_redirects,
540 }
541 send_kwargs.update(settings)
--> 542 resp = self.send(prep, **send_kwargs)
544 return resp
File /usr/local/lib/python3.9/site-packages/requests/sessions.py:677, in Session.send(self, request, **kwargs)
674 if allow_redirects:
675 # Redirect resolving generator.
676 gen = self.resolve_redirects(r, request, **kwargs)
--> 677 history = [resp for resp in gen]
678 else:
679 history = []
File /usr/local/lib/python3.9/site-packages/requests/sessions.py:677, in <listcomp>(.0)
674 if allow_redirects:
675 # Redirect resolving generator.
676 gen = self.resolve_redirects(r, request, **kwargs)
--> 677 history = [resp for resp in gen]
678 else:
679 history = []
File /usr/local/lib/python3.9/site-packages/requests/sessions.py:150, in SessionRedirectMixin.resolve_redirects(self, resp, req, stream, timeout, verify, cert, proxies, yield_requests, **adapter_kwargs)
146 """Receives a Response. Returns a generator of Responses or Requests."""
148 hist = [] # keep track of history
--> 150 url = self.get_redirect_target(resp)
151 previous_fragment = urlparse(req.url).fragment
152 while url:
File /usr/local/lib/python3.9/site-packages/requests/sessions.py:116, in SessionRedirectMixin.get_redirect_target(self, resp)
114 if is_py3:
115 location = location.encode('latin1')
--> 116 return to_native_string(location, 'utf8')
117 return None
File /usr/local/lib/python3.9/site-packages/requests/_internal_utils.py:25, in to_native_string(string, encoding)
23 out = string.encode(encoding)
24 else:
---> 25 out = string.decode(encoding)
27 return out
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf6 in position 29: invalid start byte
It's not the request URL that's the problem; it's the response that requests can't parse. Here are the response headers for that URL:
HTTP/2 301
content-type: text/html; charset=utf-8
date: Tue, 27 Dec 2022 07:37:34 GMT
server: Microsoft-IIS/10.0
location: https://unique.nl/vacature/co?rdinator-facilitair-(v2037635)
content-length: 184
arr-disable-session-affinity: true
The location header contains a URL with unencoded non-ASCII characters, and that is the problem: by specification, URLs may not contain non-ASCII characters, so the URL must be percent-encoded. Standards-conforming HTTP clients are within their rights to fail on this malformed response.
Other clients may not crash, because they happen to treat the response in some other way that doesn't cause a problem, but it is still the response that deviates from the standard.
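If you still need the page behind the broken redirect, one possible workaround (a sketch, not an official requests feature) is to stop requests from following the redirect, read the Location header yourself, and percent-encode it before following it; whether the re-encoded URL actually resolves depends on which encoding the server expects:
import requests
from urllib.parse import quote

url = "https://www.unique.nl/vacature/coördinator-facilitair-(v2037635)"

# Don't follow the redirect automatically, so requests never tries to
# re-decode the malformed Location header.
resp = requests.get(url, allow_redirects=False)

if resp.is_redirect:
    # requests exposes headers decoded as latin-1; percent-encode the
    # non-ASCII characters while keeping the URL delimiters intact.
    location = resp.headers["Location"]
    fixed = quote(location, safe=":/?&=%()")
    resp = requests.get(fixed)

print(resp.status_code)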
I am trying to download Excel files from a website. My code is below:
import os
import requests
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin
import urllib
headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
resp = requests.get("https://www.elections.on.ca/en/resource-centre/elections-results.html#accordion2022ge")
soup = BeautifulSoup(resp.text,"html.parser")
for link in soup.find_all('a', href=True):
    # print(link)
    if 'xlsx' in link['href']:
        print(link['href'])
        url = "https://www.elections.on.ca/" + link['href']
        # print(url)
        file = url.split("/")[-1].split(".")[0] + ".xlsx"
        # print(file)
        urllib.request.urlretrieve(url, file)
However, I get the following error when https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx is opened:
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-9-e1694f5ee458> in <module>
8 file= url.split("/")[-1].split(".")[0]+".xlsx"
9 # print(file)
---> 10 urllib.request.urlretrieve(url, file)
...
UnicodeEncodeError: 'ascii' codec can't encode characters in position 101-102: ordinal not in range(128).
EDIT: I tried the safeStr solution from UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 20: ordinal not in range(128), but it does not work. Please see below:
def safeStr(obj):
    try: return str(obj).encode('ascii', 'ignore').decode('ascii')
    except: return ""
url="https://www.elections.on.ca/"+'/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx'
# print(url)
print(url)
file= url.split("/")[-1].split(".")[0]+".xlsx"
url = safeStr(url)
print(url)
# print(file)
urllib.request.urlretrieve(url, file)
The error I get is:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orlans%20076.xlsx
HTTPError Traceback (most recent call last)
<ipython-input-33-01070419a054> in <module>
6 print(url)
7 # print(file)
----> 8 urllib.request.urlretrieve(url, file)
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
I tried another solution, from "problem of urlretrieve cannot get image from url contains unicode string", but it also does not work:
url = "https://www.elections.on.ca/"+urllib.parse.quote('/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx')
#url = safeStr(url)
print(url)
urllib.request.urlretrieve(url, file)
The error I get is:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%2520Totals%2520From%2520Official%2520Tabulation%2520-%2520Orl%C3%A9ans%2520076.xlsx
HTTPError Traceback (most recent call last)
<ipython-input-56-cfce9d1344d0> in <module>
2 #url = safeStr(url)
3 print(url)
----> 4 urllib.request.urlretrieve(url, file)
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
I think this is a solution...
The problem is that the URL you start with:
https://www.elections.on.ca/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx
is already url-quoted (e.g. spaces replaced by %20), but it still contains a non-ascii char, here in Orléans.
So the solution from this question will help us, but just applying urllib.parse.quote(...) encodes the already-encoded spaces a second time, as %2520. That is why you get a 404 when requesting the processed URL.
So first we need to unquote the URL (i.e. %20 -> " "), then quote it again; this time the accented char will be quoted too, and it should work.
Try this:
path = urllib.parse.quote(urllib.parse.unquote(link['href']))
url = "https://www.elections.on.ca" + path
The result we get is:
https://www.elections.on.ca/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orl%C3%A9ans%20076.xlsx
...should work now!
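For context, here is a sketch of how that fix might slot back into the scraping loop from the question (same page and tags as in the question; deriving the local filename from the unquoted href is just a cosmetic choice):
import urllib.parse
import urllib.request
import requests
from bs4 import BeautifulSoup

resp = requests.get("https://www.elections.on.ca/en/resource-centre/elections-results.html#accordion2022ge")
soup = BeautifulSoup(resp.text, "html.parser")

for link in soup.find_all('a', href=True):
    if 'xlsx' in link['href']:
        # Undo the existing %20 encoding first, then re-quote everything,
        # so spaces are not encoded twice and é becomes %C3%A9.
        path = urllib.parse.quote(urllib.parse.unquote(link['href']))
        url = "https://www.elections.on.ca" + path
        filename = urllib.parse.unquote(link['href']).rsplit('/', 1)[-1]
        urllib.request.urlretrieve(url, filename)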
I have recently started my Python journey, and Stack Overflow has helped me a lot in resolving most of the issues I came across. However, this is one that I don't seem to be able to crack, despite trying the different solutions suggested here.
I am collecting URLs from a website into a list. My next step is to go through the URLs and download them if they don't already exist in the folder. However, some of the URLs contain non-ascii characters such as ú, é, and ç, which leads to the Unicode error below.
UnicodeEncodeError: 'ascii' codec can't encode character '\xfa' in position 64: ordinal not in range(128)
I escaped it for now with try/except, but I would need to download those files manually.
When I use .encode('utf-8'), it also results in an error: "TypeError: cannot use a string pattern on a bytes-like object".
This is my code:
import os
import urllib
dict = (this includes a large dictionary scraped from a website)
links = []
for d in dict:
    links.append(d["EncodedAbsUrl"])

# For every line in the file
for url in links:
    # Split on the rightmost / and take everything on the right side of that
    name = url.rsplit('/', 1)[-1]
    # Combine the name and the downloads directory to get the local filename
    filename = os.path.join(r'C:\\PATH', name)
    # Download the file if it does not exist
    if not os.path.isfile(filename):
        try:
            urllib.request.urlretrieve(url, filename)
        except UnicodeEncodeError:
            print(filename + " could not be saved.")
            pass
    else:
        print(filename + " already exists.")
Edit
Based on Ardiya's suggestion in the comments (thanks a million for that), I have switched to the urllib.parse.quote_plus method. This seems to work but then returns an HTTP Error 400. The revised code now reads:
for url in links:
    # Split on the rightmost / and take everything on the right side of that
    name = url.rsplit('/', 1)[-1]
    # Combine the name and the downloads directory to get the local filename
    filename = os.path.join(r'C:\\PATH', name)
    # Download the file if it does not exist
    if not os.path.isfile(filename):
        try:
            urllib.request.urlretrieve(url, filename)
        except UnicodeEncodeError:
            new_url = str(root + url.split('/')[-2] + '/' + urllib.parse.quote_plus(name))
            urllib.request.urlretrieve(new_url, filename)
    else:
        print(filename + " already exists.")
For example, the following link is in the source dictionary: https://www4.unfccc.int/sites/ndcstaging/PublishedDocuments/Peru%20First/iNDC%20Perú%20castellano.pdf. It is translated into https://www4.unfccc.int/sites/ndcstaging/PublishedDocuments/Peru%20First/iNDC%2520Per%C3%BA%2520castellano.pdf, which does not work.
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-6-12f5f676515d> in <module>
25 try:
---> 26 urllib.request.urlretrieve(url, filename)
27 except UnicodeEncodeError:
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
524 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 525 response = self._open(req, data)
526
~\Anaconda3\lib\urllib\request.py in _open(self, req, data)
541 protocol = req.type
--> 542 result = self._call_chain(self.handle_open, protocol, protocol +
543 '_open', req)
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
~\Anaconda3\lib\urllib\request.py in https_open(self, req)
1392 def https_open(self, req):
-> 1393 return self.do_open(http.client.HTTPSConnection, req,
1394 context=self._context, check_hostname=self._check_hostname)
~\Anaconda3\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
1349 try:
-> 1350 h.request(req.get_method(), req.selector, req.data, headers,
1351 encode_chunked=req.has_header('Transfer-encoding'))
~\Anaconda3\lib\http\client.py in request(self, method, url, body, headers, encode_chunked)
1254 """Send a complete request to the server."""
-> 1255 self._send_request(method, url, body, headers, encode_chunked)
1256
~\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked)
1265
-> 1266 self.putrequest(method, url, **skips)
1267
~\Anaconda3\lib\http\client.py in putrequest(self, method, url, skip_host, skip_accept_encoding)
1103
-> 1104 self._output(self._encode_request(request))
1105
~\Anaconda3\lib\http\client.py in _encode_request(self, request)
1183 # ASCII also helps prevent CVE-2019-9740.
-> 1184 return request.encode('ascii')
1185
UnicodeEncodeError: 'ascii' codec can't encode character '\xfa' in position 64: ordinal not in range(128)
During handling of the above exception, another exception occurred:
HTTPError Traceback (most recent call last)
<ipython-input-6-12f5f676515d> in <module>
27 except UnicodeEncodeError:
28 new_url = str(root + url.split('/')[-2] + '/' + urllib.parse.quote_plus(name))
---> 29 urllib.request.urlretrieve(new_url, filename)
30 else:
31 print(filename + " already exists.")
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 400: Bad Request
Your URL is only partially encoded. Try replacing every %20 with a literal space and then URL-encoding the whole thing.
if not os.path.isfile(filename):
    try:
        head, tail = url.rsplit('/', 1)
        url = '%s/%s' % (head, urllib.parse.quote(tail.replace('%20', ' ')))
        urllib.request.urlretrieve(url, filename)
The difference between urllib.parse.quote and urllib.parse.quote_plus is that the latter replaces spaces with +, whereas the former percent-encodes them back to %20, which is what this URL needs.
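A quick illustration of the difference, using the decoded filename from the question:
from urllib.parse import quote, quote_plus

name = "iNDC Perú castellano.pdf"
print(quote(name))       # iNDC%20Per%C3%BA%20castellano.pdf
print(quote_plus(name))  # iNDC+Per%C3%BA+castellano.pdf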
I am trying to retrieve data from a CKAN API URL:
import urllib.request
import json
import pandas as pd
url = 'https://data.gov.il/api/3/action/datastore_search?resource_id=dcf999c1-d394-4b57-a5e0-9d014a62e046&limit=1000000'
with urllib.request.urlopen(url) as response:
    html = response.read()

result = json.loads(html)
df = pd.DataFrame(result['result']['records'])
But I am getting the following error:
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
<ipython-input-44-8484123eecdc> in <module>
2 import pandas as pd
3 url = 'https://data.gov.il/api/3/action/datastore_search?resource_id=dcf999c1-d394-4b57-a5e0-9d014a62e046&limit=1000000'
----> 4 with urllib.request.urlopen(url) as response:
5 html = response.read()
6 result = json.loads(html)
~\miniconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\miniconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\miniconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\miniconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\miniconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\miniconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 502: Bad Gateway
Interestingly, if I use a lower limit in the URL, e.g.:
url = 'https://...&limit=10000'
everything works fine. If I use no limit at all, it only retrieves the first 100 records.
Can anyone please explain why this is happening? Is this some server-side restriction? How can I get around this so that I can retrieve the whole data set, regardless of how many records it contains (there are frequent updates adding more records)?
Also, is this the right way to fetch data from a CKAN API? If not, I'd be glad to see how this should be done.
There are some limitations in the CKAN API: if you need to query more than 100 records, you need to set an offset and query as many times as needed, like pagination.
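A minimal pagination sketch, assuming the same datastore_search endpoint as in the question; the page size of 1000 is an arbitrary choice, and CKAN's limit and offset parameters drive the paging:
import json
import urllib.request
import pandas as pd

base = ('https://data.gov.il/api/3/action/datastore_search'
        '?resource_id=dcf999c1-d394-4b57-a5e0-9d014a62e046')
page_size = 1000
offset = 0
records = []

while True:
    page_url = '%s&limit=%d&offset=%d' % (base, page_size, offset)
    with urllib.request.urlopen(page_url) as response:
        result = json.loads(response.read())['result']
    batch = result['records']
    if not batch:
        break  # no more records to fetch
    records.extend(batch)
    offset += page_size

df = pd.DataFrame(records)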
I'm trying to automate downloads of specific part prices and quantities from Octopart using Python. I'm able to convert the csv file with the specific part numbers I want to look up into line items and queries, but I keep getting an HTTPError when I try to send the queries to the REST API for part matching. I entered my API key, but since it still doesn't connect, I'm wondering if I wrote the URL incorrectly. Any guidance would be appreciated.
Code:
# Send queries to REST API for part matching.
import json
import urllib.parse
import urllib.request
results = []
for i in range(0, len(queries), 20):
    # Batch queries in groups of 20, query limit of
    # parts match endpoint
    batched_queries = queries[i: i + 20]
    url = 'http://octopart.com/api/v3/parts/match?queries=%s' \
        % urllib.parse.quote(json.dumps(batched_queries))
    url += '&apikey=eb49732b'
    data = urllib.request.urlopen(url)
    response = json.loads(data)
    # Record results for analysis
    results.extend(response['results'])
Error:
HTTPError Traceback (most recent call last)
<ipython-input-43-cf5776fdc754> in <module>()
14 url = 'http://octopart.com/api/v3/parts/match?queries=%s' % urllib.parse.quote(json.dumps(batched_queries))
15 url += '&apikey=eb49732b'
---> 16 data = urllib.request.urlopen(url)
17 response = json.loads(data)
18
~\Documents\Software\Anaconda\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Documents\Software\Anaconda\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Documents\Software\Anaconda\lib\urllib\request.py in http_response(self, request, response)
639 if not (200 <= code < 300):
640 response = self.parent.error(
--> 641 'http', request, response, code, msg, hdrs)
642
643 return response
~\Documents\Software\Anaconda\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Documents\Software\Anaconda\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
~\Documents\Software\Anaconda\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Forbidden
Thank you for your help!
Check your API key, or contact them and ask about permissions.
When I tried the curl sample with your key, it also failed with 403:
$ curl -G https://octopart.com/api/v3/parts/match -d queries="[{\"mpn\":\"SN74S74N\"}]" \
    -d apikey=eb49732b \
    -d pretty_print=true
{
  "__class__": "ClientErrorResponse",
  "message": "Forbidden request"
}
However, with EXAMPLE_KEY the query above succeeds.
Try the following code with your API key... if it doesn't work, then your key has probably been invalidated.
import json
import urllib
import urllib.parse
import urllib.request
queries = [
    {'mpn': 'SN74S74N',
     'reference': 'line1'},
    {'sku': '67K1122',
     'reference': 'line2'},
    {'mpn_or_sku': 'SN74S74N',
     'reference': 'line3'},
    {'brand': 'Texas Instruments',
     'mpn': 'SN74S74N',
     'reference': 'line4'}
]

url = 'http://octopart.com/api/v3/parts/match?queries=%s' \
    % urllib.parse.quote(json.dumps(queries))
url += "&include[]=specs"
# NOTE: Use your API key here (https://octopart.com/api/register)
url += '&apikey=<REPLACEME>'

data = urllib.request.urlopen(url).read()
response = json.loads(data)

# print request time (in milliseconds)
print("Response time: %s msec\n" % response['msec'])