I am trying to download and build a dataset from a file hosted on GitHub, based on code from Hands-On ML. However, when I run this code I keep getting an HTTP Error 404. I am not sure what could be causing this.
Here is a detailed traceback of the error message I am receiving:
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
<ipython-input-19-6ada1818e178> in <module>
----> 1 fetch_housing()
<ipython-input-15-981f9394002a> in fetch_housing(housing_url, housing_path)
5 tgz_path = os.path.join(housing_path,'housing.tgz')
6 print(tgz_path)
----> 7 urllib.request.urlretrieve(housing_url,tgz_path)
8 housing_tgz = tarfile.open(tgz_path)
9 housing_tgz.extractall(path=housing_path)
~/anaconda3/lib/python3.7/urllib/request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~/anaconda3/lib/python3.7/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~/anaconda3/lib/python3.7/urllib/request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~/anaconda3/lib/python3.7/urllib/request.py in http_response(self, request, response)
639 if not (200 <= code < 300):
640 response = self.parent.error(
--> 641 'http', request, response, code, msg, hdrs)
642
643 return response
~/anaconda3/lib/python3.7/urllib/request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~/anaconda3/lib/python3.7/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
~/anaconda3/lib/python3.7/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
This is the code I am using
# The book's repository is named "handson-ml"; the misspelled "hanson-ml"
# path is what made raw.githubusercontent.com answer with HTTP 404.
download_root = 'https://raw.githubusercontent.com/ageron/handson-ml/master/'
housing_path = os.path.join('datasets', 'housing')
housing_url = download_root + 'datasets/housing/housing.tgz'


def fetch_housing(housing_url=housing_url, housing_path=housing_path):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives both the downloaded archive
            and its extracted contents; created if it does not exist.

    Raises:
        urllib.error.HTTPError: If the server answers with an error status.
        tarfile.TarError: If the downloaded file is not a readable archive.
    """
    # exist_ok replaces the separate isdir() check followed by makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
Related
In my program, I need to access a website html and extract some French postal codes.
When I do this in my browser it works, but when I try to access it with Python I keep getting an HTTP 500 Internal Server Error.
Do you think it is the website blocking access or is it my program problem?
Thanks
Below is my code:
# Question snippet: download one directory page and print every 5-digit
# postal code found in it.
import requests
import urllib
import re
link = "https://annuaire.118712.fr/magasin/mcdonald-s_1"
# NOTE(review): only `import urllib` appears here; `urllib.request` presumably
# resolves because another import already loaded the submodule — confirm.
# Per the traceback below, this call fails with HTTP Error 500 after a
# redirect was followed.
f = urllib.request.urlopen(link)
myfile = f.read()
# The pattern is matched against str() of whatever read() returned; each
# captured 5-digit group is printed.
for result in re.findall(r"postalCode\": \"(\d{5})", str(myfile)):
print(result)
And below is the error I get:
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
<ipython-input-2-76fbd165889e> in <module>
5
6 link = "https://annuaire.118712.fr/magasin/mcdonald-s_1"
----> 7 f = urllib.request.urlopen(link)
8 myfile = f.read()
9 for result in re.findall(r"postalCode\": \"(\d{5})", str(myfile)):
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
561 http_err = 0
562 args = (dict, proto, meth_name) + args
--> 563 result = self._call_chain(*args)
564 if result:
565 return result
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_302(self, req, fp, code, msg, headers)
753 fp.close()
754
--> 755 return self.parent.open(new, timeout=req.timeout)
756
757 http_error_301 = http_error_303 = http_error_307 = http_error_302
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 500: Internal Server Error
Use the requests library instead of urllib; it is friendlier to work with.
This code works fine:
# Fetch one directory page with requests and print every 5-digit postal code
# that appears in it; exit with status 1 when the server reports an error.
import requests
import re

link = "https://annuaire.118712.fr/magasin/mcdonald-s_1"
response = requests.get(url=link)
if response.ok:
    data = response.text
    # Each match is the 5-digit group that follows `postalCode": "`.
    for result in re.findall(r"postalCode\": \"(\d{5})", data):
        print(result)
else:
    print("Http Error: "+str(response.status_code))
    # `exit()` is a helper the `site` module installs for interactive use;
    # raising SystemExit gives the same exit status without relying on it.
    raise SystemExit(1)
Results:
01170
01700
01300
01800
01210
01200
01170
01700
01330
01000
01440
01000
01000
01500
01100
01300
01300
01300
01170
01710
I am using Tika to read PDFs and my code was working until yesterday. Now when I run the same code I get errors, and apparently Tika can't find the Tika server jar file. I am using the following code to read the PDF:
# Question snippet: initialise Tika and parse a local PDF.
import tika
from tika import parser
tika.initVM()
# Per the traceback below, this call leads to the download of the Tika server
# jar, and that download fails with HTTP Error 504.
parsed = parser.from_file('my_pdf_file.pdf')
The error trace is below
2019-06-22 05:54:08,735 [MainThread ] [INFO ] Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar to /tmp/tika-server.jar.
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tika/tika.py in getRemoteJar(urlOrPath, destPath)
715 try:
--> 716 urlretrieve(urlOrPath, destPath)
717 except IOError:
19 frames
/usr/lib/python3.6/urllib/request.py in urlretrieve(url, filename, reporthook, data)
247
--> 248 with contextlib.closing(urlopen(url, data)) as fp:
249 headers = fp.info()
/usr/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
/usr/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533
/usr/lib/python3.6/urllib/request.py in http_response(self, request, response)
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643
/usr/lib/python3.6/urllib/request.py in error(self, proto, *args)
569 args = (dict, 'default', 'http_error_default') + orig_args
--> 570 return self._call_chain(*args)
571
/usr/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
/usr/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
HTTPError: HTTP Error 504: Gateway Time-out
During handling of the above exception, another exception occurred:
HTTPError Traceback (most recent call last)
<ipython-input-5-f304ccbde1d0> in <module>()
13
14 # Read the ;PDF file
---> 15 parsed = parser.from_file('/content/gdrive/My Drive/Colab Notebooks/data/sample_pdf_for_excel.pdf')
16 # content = parsed["content"].strip()
17
/usr/local/lib/python3.6/dist-packages/tika/parser.py in from_file(filename, serverEndpoint, xmlContent, headers, config_path)
34 '''
35 if not xmlContent:
---> 36 jsonOutput = parse1('all', filename, serverEndpoint, headers=headers, config_path=config_path)
37 else:
38 jsonOutput = parse1('all', filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},
/usr/local/lib/python3.6/dist-packages/tika/tika.py in parse1(option, urlOrPath, serverEndpoint, verbose, tikaServerJar, responseMimeType, services, rawResponse, headers, config_path)
326 if service == '/tika': responseMimeType = 'text/plain'
327 status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
--> 328 headers, verbose, tikaServerJar, config_path=config_path, rawResponse=rawResponse)
329
330 if file_type == 'remote': os.unlink(path)
/usr/local/lib/python3.6/dist-packages/tika/tika.py in callServer(verb, serverEndpoint, service, data, headers, verbose, tikaServerJar, httpVerbs, classpath, rawResponse, config_path)
520 global TikaClientOnly
521 if not TikaClientOnly:
--> 522 serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
523
524 serviceUrl = serverEndpoint + service
/usr/local/lib/python3.6/dist-packages/tika/tika.py in checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
569 if not alreadyRunning:
570 if not os.path.isfile(jarPath) and urlp.scheme != '':
--> 571 getRemoteJar(tikaServerJar, jarPath)
572
573 if not checkJarSig(tikaServerJar, jarPath):
/usr/local/lib/python3.6/dist-packages/tika/tika.py in getRemoteJar(urlOrPath, destPath)
724 if os.path.exists(destPath) and os.path.isfile(destPath):
725 os.remove(destPath)
--> 726 urlretrieve(urlOrPath, destPath)
727
728 return (destPath, 'remote')
/usr/lib/python3.6/urllib/request.py in urlretrieve(url, filename, reporthook, data)
246 url_type, path = splittype(url)
247
--> 248 with contextlib.closing(urlopen(url, data)) as fp:
249 headers = fp.info()
250
/usr/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):
/usr/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
530 for processor in self.process_response.get(protocol, []):
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533
534 return response
/usr/lib/python3.6/urllib/request.py in http_response(self, request, response)
640 if not (200 <= code < 300):
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643
644 return response
/usr/lib/python3.6/urllib/request.py in error(self, proto, *args)
568 if http_err:
569 args = (dict, 'default', 'http_error_default') + orig_args
--> 570 return self._call_chain(*args)
571
572 # XXX probably also want an abstract factory that knows when it makes
/usr/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
502 for handler in handlers:
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
506 return result
/usr/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
648 class HTTPDefaultErrorHandler(BaseHandler):
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
652 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 504: Gateway Time-out
Java version
!java -version
openjdk version "11.0.3" 2019-04-16
OpenJDK Runtime Environment (build 11.0.3+7-Ubuntu-1ubuntu218.04.1)
OpenJDK 64-Bit Server VM (build 11.0.3+7-Ubuntu-1ubuntu218.04.1, mixed mode, sharing)
On another machine where I tried the same code
> java -version
java version "1.8.0_172"
Java(TM) SE Runtime Environment (build 1.8.0_172-b11)
Java HotSpot(TM) Client VM (build 25.172-b11, mixed mode, sharing)
When I click the java link (Maven search) http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar it gives 504 Gateway Time-out
Please suggest.
Change the url to download via the appropriate environment variable:
import os
# Point the TIKA_SERVER_JAR environment variable at the jar on repo1.maven.org
# instead of the search.maven.org URL that returned 504.
# NOTE(review): this is set before `import tika`, presumably because the
# library reads the variable at import time — confirm.
os.environ['TIKA_SERVER_JAR'] = 'https://repo1.maven.org/maven2/org/apache/tika/tika-server/1.19/tika-server-1.19.jar'
import tika
from tika import parser
Reference: https://github.com/chrismattmann/tika-python/issues/230#issuecomment-504704922
I am working with Solr and Python. I use the urllib library to fetch HTTP data.
My code is:
# Python 2 snippet (urllib2): query the local Solr core with a single
# sender_name filter and request a JSON response (wt=json).
from urllib2 import *
connection = urlopen('http://localhost:8983/solr/data/select?indent=on&q=sender_name:*AXI*&wt=json')
It was working fine, but when I add another filter to the query string as follows:
# Python 2 snippet (urllib2): the same Solr query with a second filter joined
# by AND.
from urllib2 import *
# NOTE(review): the query string contains literal spaces around AND; this is
# the request that fails with HTTP Error 400 in the traceback below. Spaces in
# a URL need to be percent-encoded (%20).
connection = urlopen('http://localhost:8983/solr/data/select?indent=on&q=sender_name:*AXI* AND message:*Avbl*&wt=json')
I got error as:
HTTPError Traceback (most recent call last)
<ipython-input-22-6dad7f9847f1> in <module>()
----> 1 connection = urlopen('http://localhost:8983/solr/data/select?indent=on&q=sender_name:*AXI* AND message:*Avbl*&wt=json')
/usr/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
152 else:
153 opener = _opener
--> 154 return opener.open(url, data, timeout)
155
156 def install_opener(opener):
/usr/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
433 for processor in self.process_response.get(protocol, []):
434 meth = getattr(processor, meth_name)
--> 435 response = meth(req, response)
436
437 return response
/usr/lib/python2.7/urllib2.pyc in http_response(self, request, response)
546 if not (200 <= code < 300):
547 response = self.parent.error(
--> 548 'http', request, response, code, msg, hdrs)
549
550 return response
/usr/lib/python2.7/urllib2.pyc in error(self, proto, *args)
471 if http_err:
472 args = (dict, 'default', 'http_error_default') + orig_args
--> 473 return self._call_chain(*args)
474
475 # XXX probably also want an abstract factory that knows when it makes
/usr/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
405 func = getattr(handler, meth_name)
406
--> 407 result = func(*args)
408 if result is not None:
409 return result
/usr/lib/python2.7/urllib2.pyc in http_error_default(self, req, fp, code, msg, hdrs)
554 class HTTPDefaultErrorHandler(BaseHandler):
555 def http_error_default(self, req, fp, code, msg, hdrs):
--> 556 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
557
558 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 400: Unknown Version
How can this error be resolved.
A URL cannot contain literal space characters. Replace each space in the query with "%20" (percent-encoding), for example: q=sender_name:*AXI*%20AND%20message:*Avbl*
I am trying to read a web page to extract contents from it. Please find below the code.
# Question snippet: fetch a product-category page and print its parsed HTML.
url = "http://www.sanjamar.com/product-categories/bar/bar-tools/"
# Per the traceback below, this request is rejected with
# HTTP Error 403: Bad Behavior.
html = urlopen(url).read()
# NOTE(review): no parser name is passed to BeautifulSoup here, so the library
# chooses one itself — consider naming it explicitly.
soup = BeautifulSoup(html)
print(soup)
The last time I used this with a different website, it worked. This time it's throwing the following error.
HTTPError Traceback (most recent call last)
<ipython-input-83-ccdefd422a61> in <module>()
1 url = "http://www.sanjamar.com/product-categories/bar/bar-tools/"
----> 2 html = urlopen(url).read()
3 soup = BeautifulSoup(html)
4 print(soup)
C:\Users\Santosh\Anaconda3\lib\urllib\request.py in urlopen(url, data,
timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):
C:\Users\Santosh\Anaconda3\lib\urllib\request.py in open(self, fullurl,
data, timeout)
530 for processor in self.process_response.get(protocol, []):
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533
534 return response
C:\Users\Santosh\Anaconda3\lib\urllib\request.py in http_response(self,
request, response)
640 if not (200 <= code < 300):
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643
644 return response
C:\Users\Santosh\Anaconda3\lib\urllib\request.py in error(self, proto, *
args)
568 if http_err:
569 args = (dict, 'default', 'http_error_default') +
orig_args
--> 570 return self._call_chain(*args)
571
572 # XXX probably also want an abstract factory that knows when it
makes
C:\Users\Santosh\Anaconda3\lib\urllib\request.py in _call_chain(self,
chain,
kind, meth_name, *args)
502 for handler in handlers:
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
506 return result
C:\Users\Santosh\Anaconda3\lib\urllib\request.py in http_error_default(self,
req, fp, code, msg, hdrs)
648 class HTTPDefaultErrorHandler(BaseHandler):
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
652 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Bad Behavior
I guess the issue is the website is blocking python. If not please let me know a solution.
Thanks
I am trying to run the AdaBoost algorithm on the Caltech101 dataset, and I want to use sklearn in Python. To import a dataset from mldata.org, sklearn provides sklearn.datasets.fetch_mldata(), but I am getting a 404 error even though the dataset is on the mldata site.
I tried following but got 404 error.
# Attempt 1: dataset name written with a hyphen.
from sklearn.datasets import fetch_mldata
dataDict = fetch_mldata('caltech101-30')
# Attempt 2: the same name written with a space; this is the call shown in
# the traceback below.
from sklearn.datasets import fetch_mldata
dataDict = fetch_mldata('caltech101 30')
Error:
HTTPError Traceback (most recent call last)
<ipython-input-46-939c88ab9518> in <module>()
2
3
----> 4 dataDict = fetch_mldata('caltech101 30')
C:\Anaconda\lib\site-packages\sklearn\datasets\mldata.pyc in fetch_mldata(dataname, target_name, data_name, transpose_data, data_home)
140 urlname = MLDATA_BASE_URL % quote(dataname)
141 try:
--> 142 mldata_url = urlopen(urlname)
143 except HTTPError as e:
144 if e.code == 404:
C:\Anaconda\lib\urllib2.pyc in urlopen(url, data, timeout)
125 if _opener is None:
126 _opener = build_opener()
--> 127 return _opener.open(url, data, timeout)
128
129 def install_opener(opener):
C:\Anaconda\lib\urllib2.pyc in open(self, fullurl, data, timeout)
408 for processor in self.process_response.get(protocol, []):
409 meth = getattr(processor, meth_name)
--> 410 response = meth(req, response)
411
412 return response
C:\Anaconda\lib\urllib2.pyc in http_response(self, request, response)
521 if not (200 <= code < 300):
522 response = self.parent.error(
--> 523 'http', request, response, code, msg, hdrs)
524
525 return response
C:\Anaconda\lib\urllib2.pyc in error(self, proto, *args)
440 http_err = 0
441 args = (dict, proto, meth_name) + args
--> 442 result = self._call_chain(*args)
443 if result:
444 return result
C:\Anaconda\lib\urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
380 func = getattr(handler, meth_name)
381
--> 382 result = func(*args)
383 if result is not None:
384 return result
C:\Anaconda\lib\urllib2.pyc in http_error_302(self, req, fp, code, msg, headers)
627 fp.close()
628
--> 629 return self.parent.open(new, timeout=req.timeout)
630
631 http_error_301 = http_error_303 = http_error_307 = http_error_302
C:\Anaconda\lib\urllib2.pyc in open(self, fullurl, data, timeout)
408 for processor in self.process_response.get(protocol, []):
409 meth = getattr(processor, meth_name)
--> 410 response = meth(req, response)
411
412 return response
C:\Anaconda\lib\urllib2.pyc in http_response(self, request, response)
521 if not (200 <= code < 300):
522 response = self.parent.error(
--> 523 'http', request, response, code, msg, hdrs)
524
525 return response
C:\Anaconda\lib\urllib2.pyc in error(self, proto, *args)
446 if http_err:
447 args = (dict, 'default', 'http_error_default') + orig_args
--> 448 return self._call_chain(*args)
449
450 # XXX probably also want an abstract factory that knows when it makes
C:\Anaconda\lib\urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
380 func = getattr(handler, meth_name)
381
--> 382 result = func(*args)
383 if result is not None:
384 return result
C:\Anaconda\lib\urllib2.pyc in http_error_default(self, req, fp, code, msg, hdrs)
529 class HTTPDefaultErrorHandler(BaseHandler):
530 def http_error_default(self, req, fp, code, msg, hdrs):
--> 531 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
532
533 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Dataset 'caltech101-30' not found on mldata.org.