SSL errors when sending file from Azure databricks to SharePoint - python

We are working in a Python notebook on Databricks and want to send a file to a SharePoint site.
To achieve this, we obtained a client_id and client_secret from
https://<SP_domain>.sharepoint.com/sites/<my_site_name>/_layouts/15/appregnew.aspx
Locally, I can successfully send a file to SharePoint using these secrets. On DataBricks, I receive SSL Errors.
Normally, an option such as verify=False can be passed with the request to skip SSL certificate checks (if that is the actual issue), but this does not seem to be supported by the Python package I am using: Office365-REST-Python-Client.
This is the error message received without any attempt to circumvent the issue:
SSLError: HTTPSConnectionPool(host='<SP_domain>.sharepoint.com', port=443): Max retries exceeded with url: /sites/<my sites name>(Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))
Reproducible code
sharepoint_url = 'https://....sharepoint.com/sites/...'
client_credentials = ClientCredential(client_id=, client_secret=)
ctx = ClientContext(sharepoint_url).with_credentials(client_credentials)
web = ctx.web
ctx.load(web)
ctx.execute_query() # <<< Crashes here
print(web.properties["Url"])
Results in:
AttributeError: 'NoneType' object has no attribute 'text'
Actual (not the last) error states:
MaxRetryError: HTTPSConnectionPool(host='nsdigitaal.sharepoint.com', port=443): Max retries exceeded with url: /sites/Team-Camerainspectie (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))
Full stack (sorry in advance :P)
---------------------------------------------------------------------------
SSLEOFError Traceback (most recent call last)
/databricks/python/lib/python3.9/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
698 # Make the request on the httplib connection object.
--> 699 httplib_response = self._make_request(
700 conn,
/databricks/python/lib/python3.9/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
381 try:
--> 382 self._validate_conn(conn)
383 except (SocketTimeout, BaseSSLError) as e:
/databricks/python/lib/python3.9/site-packages/urllib3/connectionpool.py in _validate_conn(self, conn)
1009 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
-> 1010 conn.connect()
1011
/databricks/python/lib/python3.9/site-packages/urllib3/connection.py in connect(self)
415
--> 416 self.sock = ssl_wrap_socket(
417 sock=conn,
/databricks/python/lib/python3.9/site-packages/urllib3/util/ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)
448 if send_sni:
--> 449 ssl_sock = _ssl_wrap_socket_impl(
450 sock, context, tls_in_tls, server_hostname=server_hostname
/databricks/python/lib/python3.9/site-packages/urllib3/util/ssl_.py in _ssl_wrap_socket_impl(sock, ssl_context, tls_in_tls, server_hostname)
492 if server_hostname:
--> 493 return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
494 else:
/usr/lib/python3.9/ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
499 # ctx._wrap_socket()
--> 500 return self.sslsocket_class._create(
501 sock=sock,
/usr/lib/python3.9/ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
1039 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
-> 1040 self.do_handshake()
1041 except (OSError, ValueError):
/usr/lib/python3.9/ssl.py in do_handshake(self, block)
1308 self.settimeout(None)
-> 1309 self._sslobj.do_handshake()
1310 finally:
SSLEOFError: EOF occurred in violation of protocol (_ssl.c:1129)
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
/databricks/python/lib/python3.9/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
438 if not chunked:
--> 439 resp = conn.urlopen(
440 method=request.method,
/databricks/python/lib/python3.9/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
754
--> 755 retries = retries.increment(
756 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
/databricks/python/lib/python3.9/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
573 if new_retry.is_exhausted():
--> 574 raise MaxRetryError(_pool, url, error or ResponseError(cause))
575
MaxRetryError: HTTPSConnectionPool(host='<tenant name>.sharepoint.com', port=443): Max retries exceeded with url: /sites/<site name> (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))
During handling of the above exception, another exception occurred:
SSLError Traceback (most recent call last)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/auth/providers/acs_token_provider.py in get_app_only_access_token(self)
40 try:
---> 41 realm = self._get_realm_from_target_url()
42 url_info = urlparse(self.url)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/auth/providers/acs_token_provider.py in _get_realm_from_target_url(self)
69 def _get_realm_from_target_url(self):
---> 70 response = requests.head(url=self.url, headers={'Authorization': 'Bearer'})
71 return self.process_realm_response(response)
/databricks/python/lib/python3.9/site-packages/requests/api.py in head(url, **kwargs)
101 kwargs.setdefault('allow_redirects', False)
--> 102 return request('head', url, **kwargs)
103
/databricks/python/lib/python3.9/site-packages/requests/api.py in request(method, url, **kwargs)
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
62
/databricks/python/lib/python3.9/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
541 send_kwargs.update(settings)
--> 542 resp = self.send(prep, **send_kwargs)
543
/databricks/python/lib/python3.9/site-packages/requests/sessions.py in send(self, request, **kwargs)
654 # Send the request
--> 655 r = adapter.send(request, **kwargs)
656
/databricks/python/lib/python3.9/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
513 # This branch is for urllib3 v1.22 and later.
--> 514 raise SSLError(e, request=request)
515
SSLError: HTTPSConnectionPool(host='<tenant name>.sharepoint.com', port=443): Max retries exceeded with url: /sites/<site name> (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<command-4083654498839573> in <cell line: 14>()
12 web = ctx.web
13 ctx.load(web)
---> 14 ctx.execute_query()
15 print(web.properties["Url"])
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/client_runtime_context.py in execute_query(self)
145 def execute_query(self):
146 """Submit request(s) to the server"""
--> 147 self.pending_request().execute_query()
148
149 def add_query(self, query):
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/client_request.py in execute_query(self)
72 request = self.build_request(qry)
73 self.beforeExecute.notify(request)
---> 74 response = self.execute_request_direct(request)
75 response.raise_for_status()
76 self.process_response(response)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/odata/request.py in execute_request_direct(self, request)
34 """
35 self._build_specific_request(request)
---> 36 return super(ODataRequest, self).execute_request_direct(request)
37
38 def build_request(self, query):
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/client_request.py in execute_request_direct(self, request)
84 :type request: office365.runtime.http.request_options.RequestOptions
85 """
---> 86 self.context.authenticate_request(request)
87 if request.method == HttpMethod.Post:
88 if request.is_bytes or request.is_file:
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/sharepoint/client_context.py in authenticate_request(self, request)
238
239 def authenticate_request(self, request):
--> 240 self.authentication_context.authenticate_request(request)
241
242 def _build_modification_query(self, request):
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/auth/authentication_context.py in authenticate_request(self, request)
95 :type request: office365.runtime.http.request_options.RequestOptions
96 """
---> 97 self._provider.authenticate_request(request)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/auth/providers/acs_token_provider.py in authenticate_request(self, request)
29 :type request: office365.runtime.http.request_options.RequestOptions
30 """
---> 31 self.ensure_app_only_access_token()
32 request.set_header('Authorization', self._get_authorization_header())
33
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/auth/providers/acs_token_provider.py in ensure_app_only_access_token(self)
34 def ensure_app_only_access_token(self):
35 if self._cached_token is None:
---> 36 self._cached_token = self.get_app_only_access_token()
37 return self._cached_token and self._cached_token.is_valid
38
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/auth/providers/acs_token_provider.py in get_app_only_access_token(self)
43 return self._get_app_only_access_token(url_info.hostname, realm)
44 except requests.exceptions.RequestException as e:
---> 45 self.error = e.response.text
46 raise ValueError(e.response.text)
47
AttributeError: 'NoneType' object has no attribute 'text'
Tried solutions:
Attempt 1:
ctx = ClientContext(sharepoint_url).with_credentials(client_credentials)
request = RequestOptions("{0}/_api/web/".format(sharepoint_url))
request.verify = False
response = ctx.execute_request_direct(request) # <<< crashes here... example outdated?
json = json.loads(response.content)
web_title = json['d']['Title']
print("Web title: {0}".format(web_title))
Results in:
TypeError: sequence item 2: expected str instance, RequestOptions found
Attempt 2:
Based on this SO thread.
# If you're using a third-party module and want to disable the checks,
# here's a context manager that monkey patches `requests` and changes
# it so that verify=False is the default and suppresses the warning.
import warnings
import contextlib
import requests
from urllib3.exceptions import InsecureRequestWarning
old_merge_environment_settings = requests.Session.merge_environment_settings
@contextlib.contextmanager
def no_ssl_verification():
    """Temporarily force ``verify=False`` for every ``requests`` session.

    While the ``with`` block is active,
    ``requests.Session.merge_environment_settings`` is replaced by a wrapper
    that overrides the merged ``verify`` setting with ``False``, and
    ``InsecureRequestWarning`` is silenced. On exit the original method is
    restored and every adapter seen by the wrapper is closed.

    Relies on the module-level names ``requests``, ``warnings``,
    ``InsecureRequestWarning`` and ``old_merge_environment_settings``.
    """
    opened_adapters = set()

    def merge_environment_settings(self, url, proxies, stream, verify, cert):
        # Verification happens only once per connection so we need to close
        # all the opened adapters once we're done. Otherwise, the effects of
        # verify=False persist beyond the end of this context manager.
        opened_adapters.add(self.get_adapter(url))
        settings = old_merge_environment_settings(
            self, url, proxies, stream, verify, cert
        )
        settings['verify'] = False
        return settings

    requests.Session.merge_environment_settings = merge_environment_settings
    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', InsecureRequestWarning)
            yield
    finally:
        # Always undo the monkey patch, even if the wrapped code raised.
        requests.Session.merge_environment_settings = old_merge_environment_settings
        for adapter in opened_adapters:
            try:
                adapter.close()
            except Exception:
                # Closing is best-effort cleanup; one failing adapter must
                # not prevent the remaining adapters from being closed.
                pass
And running that like:
with no_ssl_verification():
function_to_send_file_to_sharepoint()
Results in the same Max number of attempts error
Attempt 3:
Based on this github issue.
def disable_ssl(request):
    """Turn off certificate verification on the given request options.

    Sets ``request.verify`` to ``False`` in place and returns ``None``.
    """
    request.verify = False  # Disable certification verification
ctx.get_pending_request().beforeExecute += disable_ssl
web = ctx.web
ctx.load(web)
ctx.execute_query()
print(web.properties["Url"])
This code needs an update, since the thread was outdated. The current api provides pending_request and not get_pending_request(). With the fix applied, it results in the following:

We got it working.
The Databricks network was configured with a firewall that blocked the following two URLs, both of which are required:
https://<tenant name>.sharepoint.com/
https://accounts.accesscontrol.windows.net
Then it worked flawlessly.
I didn't figure out why the error is shown like this:
AttributeError: 'NoneType' object has no attribute 'text'

Related

[WinError 10060] A connection attempt failed for Flickr API

I'm new to Python and am learning about REST APIs from online material, which provided example code for the Flickr API.
The code worked in the online video, however when I tried to run the same code on my own computer environment (Windows, Python IDE installed, browser is Chrome), it gave me a time-out error 10060.
I also checked against the Flickr documentation to make sure all the parameters input are correct.
Does anyone know why this happens and how I can solve it? Thank you.
import json
flickr_key = 'xxxxxxxxx' #my own key was keyed in here
def get_flickr_data(tags_string):
    """Query the Flickr ``flickr.photos.search`` method and return the JSON.

    ``tags_string`` must be a comma separated string of tags; with
    ``tag_mode`` set to ``"all"`` it is sent together with the module-level
    ``flickr_key``. The final request URL is printed so it can be pasted
    into a browser for debugging.
    """
    endpoint = "https://api.flickr.com/services/rest/"
    # Key order is kept so the generated query string is unchanged.
    query = {
        "api_key": flickr_key,  # from the module-level global
        "tags": tags_string,
        "tag_mode": "all",
        "method": "flickr.photos.search",
        "per_page": 5,
        "media": "photos",
        "format": "json",
        "nojsoncallback": 1,
    }
    response = requests.get(endpoint, params=query, timeout=1800)
    # Useful for debugging: paste the printed URL into the browser.
    print(response.url)
    return response.json()
result_river_mts = get_flickr_data("river,mountains")
# Some code to open up a few photos that are tagged with the mountains and river tags...
photos = result_river_mts['photos']['photo']
for photo in photos:
owner = photo['owner']
photo_id = photo['id']
url = 'https://www.flickr.com/photos/{}/{}'.format(owner, photo_id)
print(url)
# webbrowser.open(url)
The error is like this:
---------------------------------------------------------------------------
TimeoutError Traceback (most recent call last)
D:\Python\Python Install\lib\site-packages\urllib3\connection.py in _new_conn(self)
168 try:
--> 169 conn = connection.create_connection(
170 (self._dns_host, self.port), self.timeout, **extra_kw
D:\Python\Python Install\lib\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
95 if err is not None:
---> 96 raise err
97
D:\Python\Python Install\lib\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
85 sock.bind(source_address)
---> 86 sock.connect(sa)
87 return sock
TimeoutError: [WinError 10060]
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
D:\Python\Python Install\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
698 # Make the request on the httplib connection object.
--> 699 httplib_response = self._make_request(
700 conn,
D:\Python\Python Install\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
381 try:
--> 382 self._validate_conn(conn)
383 except (SocketTimeout, BaseSSLError) as e:
D:\Python\Python Install\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
1009 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
-> 1010 conn.connect()
1011
D:\Python\Python Install\lib\site-packages\urllib3\connection.py in connect(self)
352 # Add certificate verification
--> 353 conn = self._new_conn()
354 hostname = self.host
D:\Python\Python Install\lib\site-packages\urllib3\connection.py in _new_conn(self)
180 except SocketError as e:
--> 181 raise NewConnectionError(
182 self, "Failed to establish a new connection: %s" % e
NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x00000000056C9430>: Failed to establish a new connection: [WinError 10060]
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
D:\Python\Python Install\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
438 if not chunked:
--> 439 resp = conn.urlopen(
440 method=request.method,
D:\Python\Python Install\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
754
--> 755 retries = retries.increment(
756 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
D:\Python\Python Install\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
573 if new_retry.is_exhausted():
--> 574 raise MaxRetryError(_pool, url, error or ResponseError(cause))
575
MaxRetryError: HTTPSConnectionPool(host='api.flickr.com', port=443): Max retries exceeded with url: /services/rest/?api_key='xxxxxxx' (#here masked by myself) &tags=river%2Cmountains&tag_mode=all&method=flickr.photos.search&per_page=5&media=photos&format=json&nojsoncallback=1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000000056C9430>: Failed to establish a new connection: [WinError 10060] '))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-40-01e73ed2c8b4> in <module>
25 return flickr_resp.json()
26
---> 27 result_river_mts = get_flickr_data("river,mountains")
28
29 # Some code to open up a few photos that are tagged with the mountains and river tags...
<ipython-input-40-01e73ed2c8b4> in get_flickr_data(tags_string)
20 params_diction["format"] = "json"
21 params_diction["nojsoncallback"] = 1
---> 22 flickr_resp = requests.get(baseurl, params = params_diction, timeout=1800)
23 # Useful for debugging: print the url! Uncomment the below line to do so.
24 print(flickr_resp.url) # Paste the result into the browser to check it out...
D:\Python\Python Install\lib\site-packages\requests\api.py in get(url, params, **kwargs)
74
75 kwargs.setdefault('allow_redirects', True)
---> 76 return request('get', url, params=params, **kwargs)
77
78
D:\Python\Python Install\lib\site-packages\requests\api.py in request(method, url, **kwargs)
59 # cases, and look like a memory leak in others.
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
62
63
D:\Python\Python Install\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
540 }
541 send_kwargs.update(settings)
--> 542 resp = self.send(prep, **send_kwargs)
543
544 return resp
D:\Python\Python Install\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
653
654 # Send the request
--> 655 r = adapter.send(request, **kwargs)
656
657 # Total elapsed time of the request (approximately)
D:\Python\Python Install\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
514 raise SSLError(e, request=request)
515
--> 516 raise ConnectionError(e, request=request)
517
518 except ClosedPoolError as e:
ConnectionError: HTTPSConnectionPool(host='api.flickr.com', port=443): Max retries exceeded with url: /services/rest/?api_key=xxxxxxxx(#here masked by myself)&tags=river%2Cmountains&tag_mode=all&method=flickr.photos.search&per_page=5&media=photos&format=json&nojsoncallback=1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000000056C9430>: Failed to establish a new connection: [WinError 10060] '))

Jupyter SSL Error(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1091)')

I'm trying to understand how python requests work using a jupyter notebook
The commands I'm running are:
> import requests
> requests.get('https://www.ecosia.org/', verify=False)
However, I'm getting the following error:
--------------------------------------------------------------------------- SSLError Traceback (most recent call last) ~\anaconda3\envs\PythonMasterClass\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
695 if is_new_proxy_conn and http_tunnel_required:
--> 696 self._prepare_proxy(conn)
697
~\anaconda3\envs\PythonMasterClass\lib\site-packages\urllib3\connectionpool.py in _prepare_proxy(self, conn)
963
--> 964 conn.connect()
965
~\anaconda3\envs\PythonMasterClass\lib\site-packages\urllib3\connection.py in connect(self)
358 if self.tls_in_tls_required:
--> 359 conn = self._connect_tls_proxy(hostname, conn)
360 tls_in_tls = True
~\anaconda3\envs\PythonMasterClass\lib\site-packages\urllib3\connection.py in _connect_tls_proxy(self, hostname, conn)
501 server_hostname=hostname,
--> 502 ssl_context=ssl_context,
503 )
~\anaconda3\envs\PythonMasterClass\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)
428 ssl_sock = _ssl_wrap_socket_impl(
--> 429 sock, context, tls_in_tls, server_hostname=server_hostname
430 )
~\anaconda3\envs\PythonMasterClass\lib\site-packages\urllib3\util\ssl_.py in _ssl_wrap_socket_impl(sock, ssl_context, tls_in_tls, server_hostname)
471 if server_hostname:
--> 472 return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
473 else:
~\anaconda3\envs\PythonMasterClass\lib\ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
422 context=self,
--> 423 session=session
424 )
~\anaconda3\envs\PythonMasterClass\lib\ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
869 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 870 self.do_handshake()
871 except (OSError, ValueError):
~\anaconda3\envs\PythonMasterClass\lib\ssl.py in do_handshake(self, block) 1138 self.settimeout(None)
-> 1139 self._sslobj.do_handshake() 1140 finally:
SSLError: [SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1091)
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last) ~\anaconda3\envs\PythonMasterClass\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~\anaconda3\envs\PythonMasterClass\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
755 retries = retries.increment(
--> 756 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
757 )
~\anaconda3\envs\PythonMasterClass\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
572 if new_retry.is_exhausted():
--> 573 raise MaxRetryError(_pool, url, error or ResponseError(cause))
574
MaxRetryError: HTTPSConnectionPool(host='www.ecosia.org', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1091)')))
During handling of the above exception, another exception occurred:
SSLError Traceback (most recent call last) <ipython-input-3-fd617f87a929> in <module>
----> 1 requests.get('https://www.ecosia.org/', verify=False)
~\anaconda3\envs\PythonMasterClass\lib\site-packages\requests\api.py in get(url, params, **kwargs)
74
75 kwargs.setdefault('allow_redirects', True)
---> 76 return request('get', url, params=params, **kwargs)
77
78
~\anaconda3\envs\PythonMasterClass\lib\site-packages\requests\api.py in request(method, url, **kwargs)
59 # cases, and look like a memory leak in others.
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
62
63
~\anaconda3\envs\PythonMasterClass\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
540 }
541 send_kwargs.update(settings)
--> 542 resp = self.send(prep, **send_kwargs)
543
544 return resp
~\anaconda3\envs\PythonMasterClass\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
653
654 # Send the request
--> 655 r = adapter.send(request, **kwargs)
656
657 # Total elapsed time of the request (approximately)
~\anaconda3\envs\PythonMasterClass\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
512 if isinstance(e.reason, _SSLError):
513 # This branch is for urllib3 v1.22 and later.
--> 514 raise SSLError(e, request=request)
515
516 raise ConnectionError(e, request=request)
SSLError: HTTPSConnectionPool(host='www.ecosia.org', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1091)')))
Do I need to set up some other port for the Jupyter notebook to access the internet?

BoxOfficeMojo refusing connections from Selenium. Works fine with beautiful soup so it's not the actual connection

Anyone know what's going on? I'm about to throw my computer out the window. Beautiful soup is working fine with the same page, so I know it's not the connection. And I've tried putting WebDriverWait and time.sleep(10) in between every line - after the delay, the same error comes up.
from bs4 import BeautifulSoup
import requests
import time, os
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
chromedriver = "/usr/bin/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
i = 2
driver = webdriver.Chrome(chromedriver)
driver.get('https://www.boxofficemojo.com/year/2020/?ref_=bo_yl_table_1')
wait = WebDriverWait(driver, 10)
wait
while i < 269:
wait
driver.find_element_by_xpath('//*[#id="table"]/div/table[2]/tbody/tr[{}]/td[2]/a'.format(i)).click()
wait
get_movie_dict(driver.current_url)
wait
i += 1
wait
driver.back()
I receive the following errors:
ConnectionRefusedError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in _new_conn(self)
156 conn = connection.create_connection(
--> 157 (self._dns_host, self.port), self.timeout, **extra_kw
158 )
~/anaconda3/lib/python3.7/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
83 if err is not None:
---> 84 raise err
85
~/anaconda3/lib/python3.7/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
73 sock.bind(source_address)
---> 74 sock.connect(sa)
75 return sock
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
671 headers=headers,
--> 672 chunked=chunked,
673 )
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
375 try:
--> 376 self._validate_conn(conn)
377 except (SocketTimeout, BaseSSLError) as e:
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _validate_conn(self, conn)
993 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
--> 994 conn.connect()
995
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in connect(self)
299 # Add certificate verification
--> 300 conn = self._new_conn()
301 hostname = self.host
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in _new_conn(self)
168 raise NewConnectionError(
--> 169 self, "Failed to establish a new connection: %s" % e
170 )
NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x7f42d8fec710>: Failed to establish a new connection: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
719 retries = retries.increment(
--> 720 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
721 )
~/anaconda3/lib/python3.7/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
435 if new_retry.is_exhausted():
--> 436 raise MaxRetryError(_pool, url, error or ResponseError(cause))
437
MaxRetryError: HTTPSConnectionPool(host='www.boxofficemojo.comhttps', port=443): Max retries exceeded with url: //www.boxofficemojo.com/release/rl1182631425/?ref_=bo_yld_table_1 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f42d8fec710>: Failed to establish a new connection: [Errno 111] Connection refused'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-55-0dd26218976b> in <module>
9 driver.find_element_by_xpath('//*[#id="table"]/div/table[2]/tbody/tr[{}]/td[2]/a'.format(i)).click()
10 wait
---> 11 get_movie_dict(driver.current_url)
12 wait
13 i += 1
<ipython-input-45-2533561becb9> in get_movie_dict(link)
19 wait = WebDriverWait(driver, 10)
20 wait
---> 21 response = requests.get(url)
22 wait
23 page = response.text
~/anaconda3/lib/python3.7/site-packages/requests/api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
~/anaconda3/lib/python3.7/site-packages/requests/api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
514 raise SSLError(e, request=request)
515
--> 516 raise ConnectionError(e, request=request)
517
518 except ClosedPoolError as e:
ConnectionError: HTTPSConnectionPool(host='www.boxofficemojo.comhttps', port=443): Max retries exceeded with url: //www.boxofficemojo.com/release/rl1182631425/?ref_=bo_yld_table_1 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f42d8fec710>: Failed to establish a new connection: [Errno 111] Connection refused'))
Edit: Added the function get_movie_dict:
def get_movie_dict(link):
    '''
    From a BoxOfficeMojo link (relative stub or absolute URL), request the
    movie html, parse with BeautifulSoup, and collect
        - title
        - domestic gross
        - runtime
        - MPAA rating
        - full release date
        - budget
    Return information as a dictionary keyed by 'movie_title',
    'domestic_total_gross', 'runtime_minutes', 'rating', 'release_date'
    and 'budget'.
    '''
    # Imported locally so the function does not depend on a file-level import.
    from urllib.parse import urljoin

    base_url = 'https://www.boxofficemojo.com'

    # Create full url to scrape. urljoin accepts both relative stubs and
    # absolute links; plain concatenation turned an absolute link into
    # 'https://www.boxofficemojo.comhttps://www.boxofficemojo.com/...'.
    url = urljoin(base_url, link)

    # Request HTML and parse
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")

    # Get title
    title_string = soup.find('title').text
    title = title_string.split('-')[0].strip()

    # Get domestic gross. A missing summary table makes find() return None
    # (AttributeError on the chained call); a table without a money span
    # gives an empty list (IndexError).
    try:
        raw_domestic_total_gross = (
            soup.find(class_='mojo-performance-summary-table')
                .find_all('span', class_='money')[0]
                .text
        )
    except (AttributeError, IndexError):
        raw_domestic_total_gross = None

    if raw_domestic_total_gross is None:
        print('This is NaN')
        domestic_total_gross = float("NaN")
    else:
        domestic_total_gross = money_to_int(raw_domestic_total_gross)

    # Get runtime.
    # NOTE(review): get_movie_value is defined elsewhere; the original checks
    # suggest it may return a float (NaN) or None when the field is missing.
    # Confirm against its definition.
    raw_runtime = get_movie_value(soup, 'Running')
    if raw_runtime is None or isinstance(raw_runtime, float):
        # Always bind runtime so building the dictionary cannot fail with
        # an UnboundLocalError.
        runtime = float("NaN")
    else:
        runtime = runtime_to_minutes(raw_runtime)

    # Get rating
    rating = get_movie_value(soup, 'MPAA')

    # Get release date: keep the text before the first '-' if there is one,
    # otherwise the text before the first '('.
    release_value = get_movie_value(soup, 'Release Date')
    separator = '-' if '-' in release_value else '('
    raw_release_date = release_value.split(separator)[0]
    release_date = to_date(raw_release_date)

    # Get budget
    raw_budget = get_movie_value(soup, 'Budget')
    budget = money_to_int(raw_budget)

    # Create movie dictionary and return
    movie_dict = {
        'movie_title': title,
        'domestic_total_gross': domestic_total_gross,
        'runtime_minutes': runtime,
        'rating': rating,
        'release_date': release_date,
        'budget': budget,
    }
    return movie_dict
The link that you are extracting from the page is "absolute" (it includes the scheme and hostname). When you append it to base_url, you get a string that looks like https://www.boxofficemojo.comhttps://www.boxofficemojo.com/release/rl1182631425/?ref_=bo_yld_table_1
You should use urljoin to join the base url with the extracted url as it will handle both relative and absolute urls.
from urllib.parse import urljoin
url = urljoin(base_url, link)
I found out what the issue was. After removing this block from get_movie_dict the function worked properly:
#Request HTML and parse
wait = WebDriverWait(driver, 10)
wait
response = requests.get(url)
wait
page = response.text
soup = BeautifulSoup(page,"lxml")
get_movie_dict is a helper function. The line with response = requests.get(url) was attempting to send another, unrelated GET request inside the helper function, which was unnecessary because one had already been sent outside of it - this is what was causing the problem.
This is an example of why it's important to understand what each line of code is doing, before copying and pasting it into your own code.

H2OAutoML throws libgomp exception in train step

I run H2O on a docker image using Python 3.6.3 and H2O 3.26.0.3.
import h2o
from h2o.automl import H2OAutoML
h2o.init()
In this step, initialization is successful and it prints the following information.
H2O cluster uptime: 01 secs
H2O cluster timezone: Europe/Istanbul
H2O data parsing timezone: UTC
H2O cluster version: 3.26.0.3
H2O cluster version age: 9 days
H2O cluster name: H2O_from_python_96273_8m5wyj
H2O cluster total nodes: 1
H2O cluster free memory: 26.67 Gb
H2O cluster total cores: 72
H2O cluster allowed cores: 72
H2O cluster status: accepting new members, healthy
H2O connection url: http://127.0.0.1:54321
H2O connection proxy: None
H2O internal security: False
H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4
Python version: 3.6.3 final
Now, I will run AutoML but it is problematic.
hf = h2o.H2OFrame(x_train)
aml = H2OAutoML(max_runtime_secs=600)
aml.train(x = list(df.columns[:-1]), y = df.columns[-1], training_frame = hf)
I have the following error
ConnectionResetError Traceback (most recent call
last) ~/.local/lib/python3.6/site-packages/urllib3/connectionpool.py
in urlopen(self, method, url, body, headers, retries, redirect,
assert_same_host, timeout, pool_timeout, release_conn, chunked,
body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
~/.local/lib/python3.6/site-packages/urllib3/connectionpool.py in
_make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
383 # otherwise it looks like a programming error was the cause.
--> 384 six.raise_from(e, None)
385 except (SocketTimeout, BaseSSLError, SocketError) as e:
~/.local/lib/python3.6/site-packages/urllib3/packages/six.py in
raise_from(value, from_value)
~/.local/lib/python3.6/site-packages/urllib3/connectionpool.py in
_make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379 try:
--> 380 httplib_response = conn.getresponse()
381 except Exception as e:
/opt/rh/rh-python36/root/usr/lib64/python3.6/http/client.py in
getresponse(self) 1330 try:
-> 1331 response.begin() 1332 except ConnectionError:
/opt/rh/rh-python36/root/usr/lib64/python3.6/http/client.py in
begin(self)
296 while True:
--> 297 version, status, reason = self._read_status()
298 if status != CONTINUE:
/opt/rh/rh-python36/root/usr/lib64/python3.6/http/client.py in
_read_status(self)
257 def _read_status(self):
--> 258 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
259 if len(line) > _MAXLINE:
/opt/rh/rh-python36/root/usr/lib64/python3.6/socket.py in
readinto(self, b)
585 try:
--> 586 return self._sock.recv_into(b)
587 except timeout:
ConnectionResetError: [Errno 104] Connection reset by peer
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call
last) ~/.local/lib/python3.6/site-packages/requests/adapters.py in
send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~/.local/lib/python3.6/site-packages/urllib3/connectionpool.py in
urlopen(self, method, url, body, headers, retries, redirect,
assert_same_host, timeout, pool_timeout, release_conn, chunked,
body_pos, **response_kw)
637 retries = retries.increment(method, url, error=e, _pool=self,
--> 638 _stacktrace=sys.exc_info()[2])
639 retries.sleep()
~/.local/lib/python3.6/site-packages/urllib3/util/retry.py in
increment(self, method, url, response, error, _pool, _stacktrace)
367 if read is False or not self._is_method_retryable(method):
--> 368 raise six.reraise(type(error), error, _stacktrace)
369 elif read is not None:
~/.local/lib/python3.6/site-packages/urllib3/packages/six.py in
reraise(tp, value, tb)
684 if value.__traceback__ is not tb:
--> 685 raise value.with_traceback(tb)
686 raise value
~/.local/lib/python3.6/site-packages/urllib3/connectionpool.py in
urlopen(self, method, url, body, headers, retries, redirect,
assert_same_host, timeout, pool_timeout, release_conn, chunked,
body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
~/.local/lib/python3.6/site-packages/urllib3/connectionpool.py in
_make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
383 # otherwise it looks like a programming error was the cause.
--> 384 six.raise_from(e, None)
385 except (SocketTimeout, BaseSSLError, SocketError) as e:
~/.local/lib/python3.6/site-packages/urllib3/packages/six.py in
raise_from(value, from_value)
~/.local/lib/python3.6/site-packages/urllib3/connectionpool.py in
_make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379 try:
--> 380 httplib_response = conn.getresponse()
381 except Exception as e:
/opt/rh/rh-python36/root/usr/lib64/python3.6/http/client.py in
getresponse(self) 1330 try:
-> 1331 response.begin() 1332 except ConnectionError:
/opt/rh/rh-python36/root/usr/lib64/python3.6/http/client.py in
begin(self)
296 while True:
--> 297 version, status, reason = self._read_status()
298 if status != CONTINUE:
/opt/rh/rh-python36/root/usr/lib64/python3.6/http/client.py in
_read_status(self)
257 def _read_status(self):
--> 258 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
259 if len(line) > _MAXLINE:
/opt/rh/rh-python36/root/usr/lib64/python3.6/socket.py in
readinto(self, b)
585 try:
--> 586 return self._sock.recv_into(b)
587 except timeout:
ProtocolError: ('Connection aborted.', ConnectionResetError(104,
'Connection reset by peer'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call
last) ~/.local/lib/python3.6/site-packages/h2o/backend/connection.py
in request(self, endpoint, data, json, filename, save_to)
404 headers=headers, timeout=self._timeout, stream=stream,
--> 405 auth=self._auth, verify=self._verify_ssl_cert, proxies=self._proxies)
406 self._log_end_transaction(start_time, resp)
~/.local/lib/python3.6/site-packages/requests/api.py in
request(method, url, **kwargs)
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
~/.local/lib/python3.6/site-packages/requests/sessions.py in
request(self, method, url, params, data, headers, cookies, files,
auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert,
json)
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
~/.local/lib/python3.6/site-packages/requests/sessions.py in
send(self, request, **kwargs)
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
~/.local/lib/python3.6/site-packages/requests/adapters.py in
send(self, request, stream, timeout, verify, cert, proxies)
497 except (ProtocolError, socket.error) as err:
--> 498 raise ConnectionError(err, request=request)
499
ConnectionError: ('Connection aborted.', ConnectionResetError(104,
'Connection reset by peer'))
During handling of the above exception, another exception occurred:
H2OConnectionError Traceback (most recent call
last) in
----> 1 aml.train(x = list(df.columns[:-1]), y = df.columns[-1], training_frame = hf)
~/.local/lib/python3.6/site-packages/h2o/automl/autoh2o.py in
train(self, x, y, training_frame, fold_column, weights_column,
validation_frame, leaderboard_frame, blending_frame)
443 poll_updates = ft.partial(self._poll_training_updates, verbosity=self._verbosity, state={})
444 try:
--> 445 self._job.poll(poll_updates=poll_updates)
446 finally:
447 poll_updates(self._job, 1)
~/.local/lib/python3.6/site-packages/h2o/job.py in poll(self,
poll_updates)
55 pb = ProgressBar(title=self._job_type + " progress", hidden=hidden)
56 if poll_updates:
---> 57 pb.execute(self._refresh_job_status, print_verbose_info=ft.partial(poll_updates, self))
58 else:
59 pb.execute(self._refresh_job_status)
~/.local/lib/python3.6/site-packages/h2o/utils/progressbar.py in
execute(self, progress_fn, print_verbose_info)
169 # Query the progress level, but only if it's time already
170 if self._next_poll_time <= now:
--> 171 res = progress_fn() # may raise StopIteration
172 assert_is_type(res, (numeric, numeric), numeric)
173 if not isinstance(res, tuple):
~/.local/lib/python3.6/site-packages/h2o/job.py in
_refresh_job_status(self)
92 def _refresh_job_status(self):
93 if self._poll_count <= 0: raise StopIteration("")
---> 94 jobs = h2o.api("GET /3/Jobs/%s" % self.job_key)
95 self.job = jobs["jobs"][0] if "jobs" in jobs else jobs["job"][0]
96 self.status = self.job["status"]
~/.local/lib/python3.6/site-packages/h2o/h2o.py in api(endpoint, data,
json, filename, save_to)
102 # type checks are performed in H2OConnection class
103 _check_connection()
--> 104 return h2oconn.request(endpoint, data=data, json=json, filename=filename, save_to=save_to)
105
106
~/.local/lib/python3.6/site-packages/h2o/backend/connection.py in
request(self, endpoint, data, json, filename, save_to)
413 else:
414 self._log_end_exception(e)
--> 415 raise H2OConnectionError("Unexpected HTTP error: %s" % e)
416 except requests.exceptions.Timeout as e:
417 self._log_end_exception(e)
H2OConnectionError: Unexpected HTTP error: ('Connection aborted.',
ConnectionResetError(104, 'Connection reset by peer'))
I suspect that a proxy might be the cause of this exception. When I add the proxy information to the environment, the exception message becomes "HTTP 500 INKApi Error".
import os
os.environ['http_proxy']= ...
os.environ['https_proxy']= ...
JVM stdout log file dumps the following exception.
[thread 140335217821440 also had an error][thread 140335320467200 also
had an error] [thread 140335207294720 also had an error]
[thread 140335316256512 also had an error]# A fatal error has been detected by the Java Runtime Environment:
[thread 140335202031360 also had an error]
SIGSEGV (0xb) at pc=0x00007fa3276cdb8d, pid=51986, tid=0x00007fa2575f5700
JRE version: OpenJDK Runtime Environment (8.0_212-b04) (build 1.8.0_212-b04)
Java VM: OpenJDK 64-Bit Server VM (25.212-b04 mixed mode linux-amd64 compressed oops)
Problematic frame:
[thread 140335231506176 also had an error] C [libc.so.6+0x39b8d][thread 140335341520640 also had an error]
JVM stderr log file contains interesting logs
libgomp: Thread creation failed: Resource temporarily unavailable
*** Error in `/usr/bin/java': free(): corrupted unsorted chunks: 0x00007efe342f0240 ***
libgomp: Thread creation failed: Resource temporarily unavailable
Funnily enough, it runs successfully when I run the same code on my local machine. I suspect that it might be because of the Docker configuration.
I spent hours on this problem, but I managed to resolve it right after posting this question. It is a typical case of rubber duck debugging.
It seems that the engine consumed all the resources of the server and exceeded its limits. This is the reason for the "Thread creation failed: Resource temporarily unavailable" message.
Limiting memory and number of threads solves this problem.
h2o.init(ip="127.0.0.1",max_mem_size_GB = 40, nthreads = 2)

How to fix 'Connection aborted.' error in Python with BeautifulSoup

I had been running this code daily for weeks with no error. This morning, it ran the for loop over 100 times properly, then gave a connection issue. Each time I have tried to run it since, it will run anywhere from 5 to 130 times, but always gives the connection error before completing.
I am still getting status codes of 200. I've seen some posts referencing 'memory leak' issues in Python, but I'm not sure how to figure out if that's the problem here. It's also strange because it had been working fine until today.
I have similar code for other pages on the same site that still runs correctly all the way through.
Here is the code:
import requests
from bs4 import BeautifulSoup
# Collect one record per timebank page, pausing between requests.
updates = []
print(f'Getting {total_timebanks} timebank details... ')
for timebank in range(len(timebanks)):
    url = f"http://community.timebanks.org/{timebanks['slug'][timebank]}"
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content, 'lxml')

    # The member count is the text of the <span> inside this div; strip the
    # surrounding whitespace and the thousands separators.
    members_field = soup.find('div', {'class': 'views-field-field-num-users-value'})
    members_text = members_field.span.text.strip()
    update = {'members': members_text.replace(',', '')}

    updates.append(update)
    time.sleep(1)
And here is the full error message:
---------------------------------------------------------------------------
RemoteDisconnected Traceback (most recent call last)
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
383 # otherwise it looks like a programming error was the cause.
--> 384 six.raise_from(e, None)
385 except (SocketTimeout, BaseSSLError, SocketError) as e:
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379 try:
--> 380 httplib_response = conn.getresponse()
381 except Exception as e:
/anaconda3/envs/DSI-6/lib/python3.6/http/client.py in getresponse(self)
1330 try:
-> 1331 response.begin()
1332 except ConnectionError:
/anaconda3/envs/DSI-6/lib/python3.6/http/client.py in begin(self)
296 while True:
--> 297 version, status, reason = self._read_status()
298 if status != CONTINUE:
/anaconda3/envs/DSI-6/lib/python3.6/http/client.py in _read_status(self)
265 # sending a valid response.
--> 266 raise RemoteDisconnected("Remote end closed connection without"
267 " response")
RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
637 retries = retries.increment(method, url, error=e, _pool=self,
--> 638 _stacktrace=sys.exc_info()[2])
639 retries.sleep()
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
366 if read is False or not self._is_method_retryable(method):
--> 367 raise six.reraise(type(error), error, _stacktrace)
368 elif read is not None:
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/packages/six.py in reraise(tp, value, tb)
684 if value.__traceback__ is not tb:
--> 685 raise value.with_traceback(tb)
686 raise value
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
383 # otherwise it looks like a programming error was the cause.
--> 384 six.raise_from(e, None)
385 except (SocketTimeout, BaseSSLError, SocketError) as e:
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379 try:
--> 380 httplib_response = conn.getresponse()
381 except Exception as e:
/anaconda3/envs/DSI-6/lib/python3.6/http/client.py in getresponse(self)
1330 try:
-> 1331 response.begin()
1332 except ConnectionError:
/anaconda3/envs/DSI-6/lib/python3.6/http/client.py in begin(self)
296 while True:
--> 297 version, status, reason = self._read_status()
298 if status != CONTINUE:
/anaconda3/envs/DSI-6/lib/python3.6/http/client.py in _read_status(self)
265 # sending a valid response.
--> 266 raise RemoteDisconnected("Remote end closed connection without"
267 " response")
ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-17-31257fee2c23> in <module>
5 for timebank in range(len(timebanks)):
6 url = f"http://community.timebanks.org/{timebanks['slug'][timebank]}"
----> 7 res = requests.get(url, headers=headers)
8 soup = BeautifulSoup(res.content, 'lxml')
9
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/requests/api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/requests/api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
496
497 except (ProtocolError, socket.error) as err:
--> 498 raise ConnectionError(err, request=request)
499
500 except MaxRetryError as e:
ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
The problem seems to have resolved itself. With no changes to the code, it is back to running as expected this morning.
I don't have much insight as to why I had connection errors yesterday, but it does seem to have been an issue with the site, not the code.
Thanks for the responses! For reference, I had also tried increasing the sleep timer to 30 seconds, but that did not resolve the problem yesterday.

Categories