Language Ver: Python 3.6.3
IDE Ver: PyCharm 2017.2.3
I was trying to parse a weather website to print the weather for a place. As I am learning Python, I previously used urllib.request.urlopen(url).read() and it worked. Now I am rewriting the code to use the BeautifulSoup4 and requests modules. Below is my code:
from bs4 import BeautifulSoup
import requests
url = "https://www.accuweather.com/en/in/dhenkanal/189844/weather-forecast/189844"
data = requests.get(url)
soup = BeautifulSoup(data.text, "html.parser")
print(soup.find('div', {'class': 'info'}))
But each time I try to run the code it gives me the following error:
Traceback (most recent call last):
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
chunked=chunked)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
six.raise_from(e, None)
File "", line 2, in raise_from
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1331, in getresponse
response.begin()
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 297, in begin
version, status, reason = self._read_status()
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 258, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\socket.py", line 586, in readinto
return self._sock.recv_into(b)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 1009, in recv_into
return self.read(nbytes, buffer)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 871, in read
return self._sslobj.read(len, buffer)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 631, in read
v = self._sslobj.read(len, buffer)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\adapters.py", line 440, in send
timeout=timeout
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 639, in urlopen
_stacktrace=sys.exc_info()[2])
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\util\retry.py", line 357, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\packages\six.py", line 685, in reraise
raise value.with_traceback(tb)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
chunked=chunked)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
six.raise_from(e, None)
File "", line 2, in raise_from
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1331, in getresponse
response.begin()
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 297, in begin
version, status, reason = self._read_status()
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 258, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\socket.py", line 586, in readinto
return self._sock.recv_into(b)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 1009, in recv_into
return self.read(nbytes, buffer)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 871, in read
return self._sslobj.read(len, buffer)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 631, in read
v = self._sslobj.read(len, buffer)
urllib3.exceptions.ProtocolError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "E:/Projects/Python/Practice/Practice1.py", line 5, in
data = requests.get(url)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\sessions.py", line 508, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\sessions.py", line 618, in send
r = adapter.send(request, **kwargs)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\adapters.py", line 490, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))
Process finished with exit code 1
What is this error, and how do I correct it? And why did it work with urllib but not with requests?
I used your code straight up and got the same error, so I followed how the requests are sent in a browser. Some servers don't respond if expected headers, which they use as part of backend processing, are not sent with the request. It turns out this server was looking for a header called user-agent, usually used to determine what client the request comes from. The amended code below works!
from bs4 import BeautifulSoup
import requests
url = "https://www.accuweather.com/en/in/dhenkanal/189844/weather-forecast/189844"
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
data = requests.get(url, headers=headers)
soup = BeautifulSoup(data.text, "html.parser")
Now you can play with your soup!
You can in fact pass more headers like accept, dnt, pragma, accept-language, cache-control etc. Explanations of these HTTP headers are for another question, another time. Hope it helps :)
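For illustration, a fuller header set might look like the sketch below; the values are examples of mine, not ones taken from the original answer:
import requests

# Illustrative values only; copy what your own browser actually sends.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'accept-language': 'en-US,en;q=0.8',
    'dnt': '1',
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
}
data = requests.get("https://www.accuweather.com/en/in/dhenkanal/189844/weather-forecast/189844", headers=headers)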
Try increasing the timeout parameter of your requests.get call:
requests.get(url, headers=headers, timeout=5)
Your script may also be blocked by the server to prevent scraping attempts. If that is the case, you can try faking a web browser by setting appropriate headers:
{"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 GTB7.1 (.NET CLR 3.5.30729)", "Referer": "http://example.com"}
Your final code:
import requests
url = "https://www.accuweather.com/en/in/dhenkanal/189844/weather-forecast/189844"
headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 GTB7.1 (.NET CLR 3.5.30729)", "Referer": "http://example.com"}
data = requests.get(url, headers=headers, timeout=5)
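If timeouts still occur over long runs, a sketch like the following (my addition, not part of the original answer) makes requests retry transient failures automatically via urllib3's Retry, reusing the url and headers defined above:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry up to 3 times with exponential backoff on common transient statuses.
retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount("http://", HTTPAdapter(max_retries=retry))
session.mount("https://", HTTPAdapter(max_retries=retry))
data = session.get(url, headers=headers, timeout=5)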
My code is supposed to fetch data from a specific JSON-like URL (the output the webpage offers is not pure JSON, which is what I require). When I fetch it over connection A, it returns the following error:
Traceback (most recent call last):
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\threading.py", line 914, in _bootstrap_inner
self.run()
File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 20, in run
main_func(self.counter)
File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 166, in main_func
total=url_to_dict(url)
File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 79, in url_to_dict
data = urllib.request.urlopen(url).read().decode('utf-8')
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 472, in open
response = meth(req, response)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 582, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 510, in error
return self._call_chain(*args)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 444, in _call_chain
result = func(*args)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 590, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
Interestingly, when I try to fetch the data over connection B it works fine; however, I get the following error after 10,000-20,000 iterations:
Exception in thread Thread-9:
Traceback (most recent call last):
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 1106, in request
self._send_request(method, url, body, headers)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 1151, in _send_request
self.endheaders(body)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 1102, in endheaders
self._send_output(message_body)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 934, in _send_output
self.send(msg)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 877, in send
self.connect()
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 849, in connect
(self.host,self.port), self.timeout, self.source_address)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\socket.py", line 711, in create_connection
raise err
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\socket.py", line 702, in create_connection
sock.connect(sa)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
I searched the internet for several hours about the connection B error. It mainly occurs because of a connection problem or a proxy. I tried the following solution with several different proxies; it did not work either and gave the same error after a few thousand iterations:
proxy_support = urllib.request.ProxyHandler({"http": "http://208.83.106.105:9999"})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
The problematic part is the following:
class myThread(threading.Thread):
    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        main_func(self.counter)

def url_to_dict(url):
    hdr = {
        'User-Agent': 'Chrome/60.0.3112.101 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Safari/537.11 Mozilla/55.0.2',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'}
    data2 = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # proxy_support = urllib.request.ProxyHandler({"http": "http://61.233.25.166:80"})
    # opener = urllib2.build_opener(proxy_support)
    # urllib2.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    json_type_string = re.findall('({.*})', data)[0]
    json_data = json.loads(json_type_string)
    total_page = json_data['data']['totalPage']
    return json_data, total_page

def main_func(counter):
    proxy_support = urllib.request.ProxyHandler({"http": "http://208.83.106.105:9999"})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    for x in range(len(url_list)):
        url = url_list[x]
        company_name = company_list[x]
        total = url_to_dict(url)
        total_page = total[1]
        for y in range(int(total_page / 10)):
            index = url.find('config[page]=')
            index2 = url.find('&config[reply')
            k = y * 10
            url = url[:index+13] + str(counter+k) + url[index2:]
            print(url)
            data = url_to_dict(url)
            parsed_data = get_data(data)
            add_to_mongo(parsed_data, company_name)
What can I do to fix this problem? Also, what is the cause of the HTTP Error 404: Not Found?
Thanks in advance
It's not an answer (I still can't comment), but did you try the requests library? I guess it's more powerful and newer, so...
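As a rough sketch of that suggestion (untested against the original site, and not part of the original comment), url_to_dict could look like this with requests; raise_for_status() also surfaces the 404 as an explicit exception at the call site:
import json
import re
import requests

def url_to_dict(url):
    # Same JSON-extraction logic as the original, but over requests.
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    resp.raise_for_status()  # turns an HTTP 404 into an immediate, catchable error
    json_data = json.loads(re.findall('({.*})', resp.text)[0])
    return json_data, json_data['data']['totalPage']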
I am following Bokeh's User Guide.
In "Embedding Bokeh Server as a Library" at http://docs.bokeh.org/en/latest/docs/user_guide/server.html#embedding-bokeh-server-as-a-library
it refers to a demo where a Bokeh server is embedded in Flask (https://github.com/bokeh/bokeh/blob/0.12.6/examples/howto/server_embed/flask_embed.py)
It should be straightforward, but I get a Tornado error if it is launched with python flask_embed.py. Does anybody have an idea why?
The page on the browser is correctly launched but there is no plot.
This is the short error message:
ERROR:tornado.application:Uncaught exception GET /bkapp/autoload.js?bokeh-autoload-element=3a711948-3668-4f63-8d0c-8cd1584fb92d&bokeh-app-path=/bkapp&bokeh-absolute-url=http://localhost:5006/bkapp (127.0.0.1)
HTTPServerRequest(protocol='http', host='localhost:5006', method='GET', uri='/bkapp/autoload.js?bokeh-autoload-element=3a711948-3668-4f63-8d0c-8cd1584fb92d&bokeh-app-path=/bkapp&bokeh-absolute-url=http://localhost:5006/bkapp', version='HTTP/1.1', remote_ip='127.0.0.1', headers={'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Host': 'localhost:5006', 'Accept': '*/*', 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0', 'Connection': 'keep-alive', 'Referer': 'http://localhost:8080/', 'Cookie': 'username-localhost-8888="2|1:0|10:1501067928|23:username-localhost-8888|44:Y2EwOTUzN2YzNWRiNGQyMDgxZWEyOGMzZDJkOTI4ZWY=|f4f981dd915dc777c70e605b7135bcbbc076b3fe3482999e5ca557cb4abd518e"; _xsrf=2|c711b8e7|f913ccc5c9cc32532c1e67bbd75b6051|1500889250'})
...
HTTPError: HTTP Error 400: Bad Request
ERROR:tornado.access:500 GET /bkapp/autoload.js?bokeh-autoload-element=3a711948-3668-4f63-8d0c-8cd1584fb92d&bokeh-app-path=/bkapp&bokeh-absolute-url=http://localhost:5006/bkapp (127.0.0.1)
And here is the whole traceback:
Opening Flask app with embedded Bokeh application on http://localhost:8080/
ERROR:tornado.application:Uncaught exception GET /bkapp/autoload.js?bokeh-autoload-element=3a711948-3668-4f63-8d0c-8cd1584fb92d&bokeh-app-path=/bkapp&bokeh-absolute-url=http://localhost:5006/bkapp (127.0.0.1)
HTTPServerRequest(protocol='http', host='localhost:5006', method='GET', uri='/bkapp/autoload.js?bokeh-autoload-element=3a711948-3668-4f63-8d0c-8cd1584fb92d&bokeh-app-path=/bkapp&bokeh-absolute-url=http://localhost:5006/bkapp', version='HTTP/1.1', remote_ip='127.0.0.1', headers={'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Host': 'localhost:5006', 'Accept': '*/*', 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0', 'Connection': 'keep-alive', 'Referer': 'http://localhost:8080/', 'Cookie': 'username-localhost-8888="2|1:0|10:1501067928|23:username-localhost-8888|44:Y2EwOTUzN2YzNWRiNGQyMDgxZWEyOGMzZDJkOTI4ZWY=|f4f981dd915dc777c70e605b7135bcbbc076b3fe3482999e5ca557cb4abd518e"; _xsrf=2|c711b8e7|f913ccc5c9cc32532c1e67bbd75b6051|1500889250'})
Traceback (most recent call last):
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/web.py", line 1511, in _execute
result = yield result
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/gen.py", line 1055, in run
value = future.result()
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/concurrent.py", line 238, in result
raise_exc_info(self._exc_info)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/gen.py", line 1063, in run
yielded = self.gen.throw(*exc_info)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/bokeh/server/views/autoload_js_handler.py", line 31, in get
session = yield self.get_session()
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/gen.py", line 1055, in run
value = future.result()
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/concurrent.py", line 238, in result
raise_exc_info(self._exc_info)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/gen.py", line 1063, in run
yielded = self.gen.throw(*exc_info)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/bokeh/server/views/session_handler.py", line 40, in get_session
session = yield self.application_context.create_session_if_needed(session_id, self.request)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/gen.py", line 1055, in run
value = future.result()
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/concurrent.py", line 238, in result
raise_exc_info(self._exc_info)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/tornado/gen.py", line 1069, in run
yielded = self.gen.send(value)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/bokeh/server/application_context.py", line 177, in create_session_if_needed
self._application.initialize_document(doc)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/bokeh/application/application.py", line 121, in initialize_document
h.modify_document(doc)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/bokeh/application/handlers/function.py", line 16, in modify_document
self._func(doc)
File "main.py", line 22, in modify_doc
df = pd.read_csv(data_url, parse_dates=True, index_col=0)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/pandas/io/parsers.py", line 655, in parser_f
return _read(filepath_or_buffer, kwds)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/pandas/io/parsers.py", line 392, in _read
filepath_or_buffer, encoding, compression)
File "/home/alessandro/git-files/python/study_graph2/env/local/lib/python2.7/site-packages/pandas/io/common.py", line 186, in get_filepath_or_buffer
req = _urlopen(url)
File "/usr/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 435, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 548, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 473, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 407, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 556, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
HTTPError: HTTP Error 400: Bad Request
ERROR:tornado.access:500 GET /bkapp/autoload.js?bokeh-autoload-element=3a711948-3668-4f63-8d0c-8cd1584fb92d&bokeh-app-path=/bkapp&bokeh-absolute-url=http://localhost:5006/bkapp (127.0.0.1) 425.75ms
When the page is served, the server tries to load CSV data from an external URL using Pandas. I'm not sure whether this example ever worked, but right now it seems that pd.read_csv does not encode the URL query, so the server is unable to handle the characters > and <. You can either replace the characters manually (see https://en.wikipedia.org/wiki/Percent-encoding) or use a library for it, like Python's urllib.
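A minimal sketch of the urllib route (Python 2, matching the traceback; the URL here is a hypothetical stand-in, not the demo's actual one):
import urllib
import pandas as pd

data_url = "http://example.com/data.csv?time>=2016-02-15&time<=2017-03-22"
# Percent-encode '<' and '>' (to %3C/%3E) while keeping the URL structure intact.
safe_url = urllib.quote(data_url, safe=":/?&=")
df = pd.read_csv(safe_url, parse_dates=True, index_col=0)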
import json
import string
import socket
import requests
from bs4 import BeautifulSoup

# Default header to be used first.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"}

# Create a session using requests to log in.
with requests.Session() as s:
    # Grab new headers and cookies from login page
    t = s.get("http://minewind.com/forums/ucp.php?mode=login", headers=headers)
    sid = t.cookies['phpbb3_qpac2_sid']  # Store sid to be used in POST data.
    # POST data to be sent
    payload = {"login": "Login",
               "password": "*********",
               "redirect": "./ucp.php?mode=login",
               "redirect": "index.php",
               "sid": sid,
               "username": "myusername"
               }
    # Send POST data to the login page, including proper headers.
    s1 = s.post("http://minewind.com/forums/ucp.php?mode=login", data=payload, headers=t.headers)
    print(t.headers)
    # Check to see if we are really logged in, WHICH WE ARENT!!!! ;_;
    s2 = s.get("http://minewind.com/forums/index.php", headers=t.headers)
    # Pretty up the code and grab links.
    perty = BeautifulSoup(s2.content)
    perty.prettify()
    for links in perty.find_all('a'):
        print(links.get('href'))
I finally configured the POST data properly, to my knowledge, but now I'm getting some weird connection errors. Any ideas? ERRORS:
Traceback (most recent call last):
File "C:\Python33\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 331, in _make_request
httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python33\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 516, in urlopen
body=body, headers=headers)
File "C:\Python33\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 333, in _make_request
httplib_response = conn.getresponse()
File "C:\Python33\lib\http\client.py", line 1143, in getresponse
response.begin()
File "C:\Python33\lib\http\client.py", line 354, in begin
version, status, reason = self._read_status()
File "C:\Python33\lib\http\client.py", line 316, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Python33\lib\socket.py", line 297, in readinto
return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python33\lib\site-packages\requests\adapters.py", line 362, in send
timeout=timeout
File "C:\Python33\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 559, in urlopen
_pool=self, _stacktrace=stacktrace)
File "C:\Python33\lib\site-packages\requests\packages\urllib3\util\retry.py", line 245, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Python33\lib\site-packages\requests\packages\urllib3\packages\six.py", line 309, in reraise
raise value.with_traceback(tb)
File "C:\Python33\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 516, in urlopen
body=body, headers=headers)
File "C:\Python33\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 333, in _make_request
httplib_response = conn.getresponse()
File "C:\Python33\lib\http\client.py", line 1143, in getresponse
response.begin()
File "C:\Python33\lib\http\client.py", line 354, in begin
version, status, reason = self._read_status()
File "C:\Python33\lib\http\client.py", line 316, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Python33\lib\socket.py", line 297, in readinto
return self._sock.recv_into(b)
requests.packages.urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Anthony\site.py", line 28, in <module>
s2 = s.get("http://minewind.com/forums/index.php", headers=t.headers)
File "C:\Python33\lib\site-packages\requests\sessions.py", line 469, in get
return self.request('GET', url, **kwargs)
File "C:\Python33\lib\site-packages\requests\sessions.py", line 457, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python33\lib\site-packages\requests\sessions.py", line 569, in send
r = adapter.send(request, **kwargs)
File "C:\Python33\lib\site-packages\requests\adapters.py", line 407, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054))
Any thoughts on why I'm getting these 'connection aborted' errors?
You can just use the user-agent header for this instance; I was overcomplicating it by grabbing the login page headers, which wasn't necessary. Also, you don't need to know the sid cookie beforehand like I thought you did; you can just include it with the POST data as empty. Just make sure you inspect which form data is being passed, with Firebug or the similar utilities mentioned above.
import requests
from bs4 import BeautifulSoup
import sys

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"}

with requests.Session() as s:
    payload = {"login": "Login",
               "password": "mypassword",
               "redirect": "./ucp.php?mode=login",
               "redirect": "index.php",
               "sid": "",
               "username": "myusername"}
    url = "http://minewind.com/forums/index.php"
    s1 = s.post("http://minewind.com/forums/ucp.php?mode=login", data=payload, headers=headers)
    s2 = s.get(url, headers=headers)
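To verify the login actually worked, one hypothetical check (my addition, based on typical phpBB markup, not on the original answer) is to look for the logout link in the returned page:
    # phpBB boards typically expose a 'mode=logout' link only when logged in.
    if "mode=logout" in s2.text:
        print("Logged in successfully")
    else:
        print("Login failed")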
I am pulling websites from a list and want to test whether they are up or down. The code below works fine as long as they are up, but as soon as something is wrong with one of these URLs, I get an error message and the whole script stops.
What I want to achieve: error message == website not working, therefore print "down" and move on to the next item in the list.
import urllib2
from urllib2 import Request, urlopen, HTTPError, URLError

def checkurl(z):
    user_agent = 'Mozilla/20.0.1 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    link = "http://" + z
    req = Request(link, headers=headers)
    try:
        page_open = urlopen(req)
    except HTTPError, e:
        print "down"
    else:
        print 'up'
        #print urllib2.urlopen('http://'+z).read()
Traceback (most recent call last):
File "/home/user/Videos/python/onion/qweqweqweq.py", line 48, in <module>
checkurl(x)
File "/home/user/Videos/python/onion/qweqweqweq.py", line 23, in checkurl
page_open = urlopen(req)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 401, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 419, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 379, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1211, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1178, in do_open
h.request(req.get_method(), req.get_selector(), req.data, headers)
File "/usr/lib/python2.7/httplib.py", line 962, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python2.7/httplib.py", line 996, in _send_request
self.endheaders(body)
File "/usr/lib/python2.7/httplib.py", line 958, in endheaders
self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 818, in _send_output
self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 780, in send
self.connect()
File "/usr/lib/python2.7/httplib.py", line 761, in connect
self.timeout, self.source_address)
File "/home/user/Videos/python/onion/qweqweqweq.py", line 5, in create_connection
sock.connect(address)
File "/usr/lib/python2.7/dist-packages/socks.py", line 369, in connect
self.__negotiatesocks5(destpair[0],destpair[1])
File "/usr/lib/python2.7/dist-packages/socks.py", line 236, in __negotiatesocks5
raise Socks5Error(ord(resp[1]),_generalerrors[ord(resp[1])])
TypeError: __init__() takes exactly 2 arguments (3 given)
You are catching HTTPError, but what is thrown is Socks5Error.
You're missing Socks5Error in your except clause. Look at the traceback:
raise Socks5Error(ord(resp[1]),_generalerrors[ord(resp[1])])
Note that this wouldn't have happened if you used requests instead of urllib2. The interface is a lot clearer, the documentation better.
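A sketch of the narrower fix (my addition, assuming Socks5Error can be imported from the SocksiPy socks module shown in the traceback):
from socks import Socks5Error

try:
    page_open = urlopen(req)
except (HTTPError, URLError, Socks5Error):
    print "down"
else:
    print 'up'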
In answer to "would it be possible to assume that the website is down regardless of the error", this will do it:
req = Request(link, headers=headers)
try:
    page_open = urlopen(req)
except:
    print "down"
else:
    print 'up'