Related
We are working in a Python notebook on Databricks and want to send a file to a SharePoint site.
To achieve this, we obtained a client_id and client_secret from
https://<SP_domain>.sharepoint.com/sites/<my_site_name>/_layouts/15/appregnew.aspx
Locally, I can successfully send a file to SharePoint using these secrets. On Databricks, I receive SSL errors.
Normally, something like verify=false within the request can be provided, ignoring SSL certificate checks (if that is the actual issue). But this does not seem to be supported in the Python package that I am using: Office365-REST-Python-Client
This is the error message received without any attempt to circumvent the issue:
SSLError: HTTPSConnectionPool(host='<SP_domain>.sharepoint.com', port=443): Max retries exceeded with url: /sites/<my sites name>(Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))
Reproducible code
sharepoint_url = 'https://....sharepoint.com/sites/...'
client_credentials = ClientCredential(client_id=, client_secret=)
ctx = ClientContext(sharepoint_url).with_credentials(client_credentials)
web = ctx.web
ctx.load(web)
ctx.execute_query() # <<< Crashes here
print(web.properties["Url"])
Results in:
AttributeError: 'NoneType' object has no attribute 'text'
Actual (not the last) error states:
MaxRetryError: HTTPSConnectionPool(host='nsdigitaal.sharepoint.com', port=443): Max retries exceeded with url: /sites/Team-Camerainspectie (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))
Full stack (sorry in advance :P)
---------------------------------------------------------------------------
SSLEOFError Traceback (most recent call last)
/databricks/python/lib/python3.9/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
698 # Make the request on the httplib connection object.
--> 699 httplib_response = self._make_request(
700 conn,
/databricks/python/lib/python3.9/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
381 try:
--> 382 self._validate_conn(conn)
383 except (SocketTimeout, BaseSSLError) as e:
/databricks/python/lib/python3.9/site-packages/urllib3/connectionpool.py in _validate_conn(self, conn)
1009 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
-> 1010 conn.connect()
1011
/databricks/python/lib/python3.9/site-packages/urllib3/connection.py in connect(self)
415
--> 416 self.sock = ssl_wrap_socket(
417 sock=conn,
/databricks/python/lib/python3.9/site-packages/urllib3/util/ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)
448 if send_sni:
--> 449 ssl_sock = _ssl_wrap_socket_impl(
450 sock, context, tls_in_tls, server_hostname=server_hostname
/databricks/python/lib/python3.9/site-packages/urllib3/util/ssl_.py in _ssl_wrap_socket_impl(sock, ssl_context, tls_in_tls, server_hostname)
492 if server_hostname:
--> 493 return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
494 else:
/usr/lib/python3.9/ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
499 # ctx._wrap_socket()
--> 500 return self.sslsocket_class._create(
501 sock=sock,
/usr/lib/python3.9/ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
1039 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
-> 1040 self.do_handshake()
1041 except (OSError, ValueError):
/usr/lib/python3.9/ssl.py in do_handshake(self, block)
1308 self.settimeout(None)
-> 1309 self._sslobj.do_handshake()
1310 finally:
SSLEOFError: EOF occurred in violation of protocol (_ssl.c:1129)
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
/databricks/python/lib/python3.9/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
438 if not chunked:
--> 439 resp = conn.urlopen(
440 method=request.method,
/databricks/python/lib/python3.9/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
754
--> 755 retries = retries.increment(
756 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
/databricks/python/lib/python3.9/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
573 if new_retry.is_exhausted():
--> 574 raise MaxRetryError(_pool, url, error or ResponseError(cause))
575
MaxRetryError: HTTPSConnectionPool(host='<tenant name>.sharepoint.com', port=443): Max retries exceeded with url: /sites/<site name> (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))
During handling of the above exception, another exception occurred:
SSLError Traceback (most recent call last)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/auth/providers/acs_token_provider.py in get_app_only_access_token(self)
40 try:
---> 41 realm = self._get_realm_from_target_url()
42 url_info = urlparse(self.url)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/auth/providers/acs_token_provider.py in _get_realm_from_target_url(self)
69 def _get_realm_from_target_url(self):
---> 70 response = requests.head(url=self.url, headers={'Authorization': 'Bearer'})
71 return self.process_realm_response(response)
/databricks/python/lib/python3.9/site-packages/requests/api.py in head(url, **kwargs)
101 kwargs.setdefault('allow_redirects', False)
--> 102 return request('head', url, **kwargs)
103
/databricks/python/lib/python3.9/site-packages/requests/api.py in request(method, url, **kwargs)
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
62
/databricks/python/lib/python3.9/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
541 send_kwargs.update(settings)
--> 542 resp = self.send(prep, **send_kwargs)
543
/databricks/python/lib/python3.9/site-packages/requests/sessions.py in send(self, request, **kwargs)
654 # Send the request
--> 655 r = adapter.send(request, **kwargs)
656
/databricks/python/lib/python3.9/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
513 # This branch is for urllib3 v1.22 and later.
--> 514 raise SSLError(e, request=request)
515
SSLError: HTTPSConnectionPool(host='<tenant name>.sharepoint.com', port=443): Max retries exceeded with url: /sites/<site name> (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<command-4083654498839573> in <cell line: 14>()
12 web = ctx.web
13 ctx.load(web)
---> 14 ctx.execute_query()
15 print(web.properties["Url"])
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/client_runtime_context.py in execute_query(self)
145 def execute_query(self):
146 """Submit request(s) to the server"""
--> 147 self.pending_request().execute_query()
148
149 def add_query(self, query):
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/client_request.py in execute_query(self)
72 request = self.build_request(qry)
73 self.beforeExecute.notify(request)
---> 74 response = self.execute_request_direct(request)
75 response.raise_for_status()
76 self.process_response(response)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/odata/request.py in execute_request_direct(self, request)
34 """
35 self._build_specific_request(request)
---> 36 return super(ODataRequest, self).execute_request_direct(request)
37
38 def build_request(self, query):
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/client_request.py in execute_request_direct(self, request)
84 :type request: office365.runtime.http.request_options.RequestOptions
85 """
---> 86 self.context.authenticate_request(request)
87 if request.method == HttpMethod.Post:
88 if request.is_bytes or request.is_file:
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/sharepoint/client_context.py in authenticate_request(self, request)
238
239 def authenticate_request(self, request):
--> 240 self.authentication_context.authenticate_request(request)
241
242 def _build_modification_query(self, request):
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/auth/authentication_context.py in authenticate_request(self, request)
95 :type request: office365.runtime.http.request_options.RequestOptions
96 """
---> 97 self._provider.authenticate_request(request)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/auth/providers/acs_token_provider.py in authenticate_request(self, request)
29 :type request: office365.runtime.http.request_options.RequestOptions
30 """
---> 31 self.ensure_app_only_access_token()
32 request.set_header('Authorization', self._get_authorization_header())
33
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/auth/providers/acs_token_provider.py in ensure_app_only_access_token(self)
34 def ensure_app_only_access_token(self):
35 if self._cached_token is None:
---> 36 self._cached_token = self.get_app_only_access_token()
37 return self._cached_token and self._cached_token.is_valid
38
/local_disk0/.ephemeral_nfs/envs/pythonEnv-e6edc2d5-a811-4e43-a0ea-d29958d03122/lib/python3.9/site-packages/office365/runtime/auth/providers/acs_token_provider.py in get_app_only_access_token(self)
43 return self._get_app_only_access_token(url_info.hostname, realm)
44 except requests.exceptions.RequestException as e:
---> 45 self.error = e.response.text
46 raise ValueError(e.response.text)
47
AttributeError: 'NoneType' object has no attribute 'text'
Tried solutions:
Attempt 1:
ctx = ClientContext(sharepoint_url).with_credentials(client_credentials)
request = RequestOptions("{0}/_api/web/".format(sharepoint_url))
request.verify = False
response = ctx.execute_request_direct(request) # <<< crashes here... example outdated?
json = json.loads(response.content)
web_title = json['d']['Title']
print("Web title: {0}".format(web_title))
Results in:
TypeError: sequence item 2: expected str instance, RequestOptions found
Attempt 2:
Based on this SO thread.
# If you're using a third-party module and want to disable the checks,
# here's a context manager that monkey patches `requests` and changes
# it so that verify=False is the default and suppresses the warning.
import warnings
import contextlib
import requests
from urllib3.exceptions import InsecureRequestWarning
old_merge_environment_settings = requests.Session.merge_environment_settings
@contextlib.contextmanager
def no_ssl_verification():
    """Context manager that disables TLS certificate verification for every
    ``requests`` call made inside the ``with`` block.

    It monkey-patches ``requests.Session.merge_environment_settings`` so each
    request resolves ``verify=False``, and silences the resulting
    ``InsecureRequestWarning``.  On exit the original method is restored and
    every adapter that was used is closed: verification is decided once per
    pooled connection, so open adapters would otherwise keep ``verify=False``
    alive past the end of the context.

    NOTE: the original code had ``#contextlib.contextmanager`` (a comment,
    almost certainly a mangled ``@``); without the decorator the ``with``
    statement fails because a plain generator has no ``__enter__``.
    """
    opened_adapters = set()

    def merge_environment_settings(self, url, proxies, stream, verify, cert):
        # Remember the adapter so its pooled connections can be closed later.
        opened_adapters.add(self.get_adapter(url))
        settings = old_merge_environment_settings(
            self, url, proxies, stream, verify, cert)
        settings['verify'] = False
        return settings

    requests.Session.merge_environment_settings = merge_environment_settings
    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', InsecureRequestWarning)
            yield
    finally:
        requests.Session.merge_environment_settings = old_merge_environment_settings
        for adapter in opened_adapters:
            try:
                adapter.close()
            except Exception:
                # Best-effort cleanup; a failed close must not mask the
                # caller's exception.
                pass
And running that like:
with no_ssl_verification():
function_to_send_file_to_sharepoint()
Results in the same Max number of attempts error
Attempt 3:
Based on this github issue.
def disable_ssl(request):
request.verify = False # Disable certification verification
ctx.get_pending_request().beforeExecute += disable_ssl
web = ctx.web
ctx.load(web)
ctx.execute_query()
print(web.properties["Url"])
This code needs an update, since the thread was outdated. The current api provides pending_request and not get_pending_request(). With the fix applied, it results in the following:
We got it working.
The network configuration of Databricks included a firewall that blocked these two URLs, both of which are needed:
https://<tenant name>.sharepoint.com/
https://accounts.accesscontrol.windows.net
Then it worked flawlessly.
I didn't figure out why the error is shown like this:
AttributeError: 'NoneType' object has no attribute 'text'
I am trying to translate a text to English, using the googletrans package (version 3.0.0) with Python 3.8.5, but I got this error. I am using a company laptop and I don't have admin rights on it. Can anyone help me with this?
This is the code that I used just for a test:
from googletrans import Translator, constants
translator = Translator()
translation = translator.translate("Hola Mundo")
And this is the error thrown:
---------------------------------------------------------------------------
ConnectError Traceback (most recent call last)
<ipython-input-54-93d54b4e73a5> in <module>
1 from googletrans import Translator, constants
2 translator = Translator()
----> 3 translation = translator.translate("Hola Mundo")
~\Anaconda3\lib\site-packages\googletrans\client.py in translate(self, text, dest, src, **kwargs)
180
181 origin = text
--> 182 data = self._translate(text, dest, src, kwargs)
183
184 # this code will be updated when the format is changed.
~\Anaconda3\lib\site-packages\googletrans\client.py in _translate(self, text, dest, src, override)
76
77 def _translate(self, text, dest, src, override):
---> 78 token = self.token_acquirer.do(text)
79 params = utils.build_params(query=text, src=src, dest=dest,
80 token=token, override=override)
~\Anaconda3\lib\site-packages\googletrans\gtoken.py in do(self, text)
192
193 def do(self, text):
--> 194 self._update()
195 tk = self.acquire(text)
196 return tk
~\Anaconda3\lib\site-packages\googletrans\gtoken.py in _update(self)
52 return
53
---> 54 r = self.client.get(self.host)
55
56 raw_tkk = self.RE_TKK.search(r.text)
~\Anaconda3\lib\site-packages\httpx\_client.py in get(self, url, params, headers, cookies, auth, allow_redirects, timeout)
753 timeout: typing.Union[TimeoutTypes, UnsetType] = UNSET,
754 ) -> Response:
--> 755 return self.request(
756 "GET",
757 url,
~\Anaconda3\lib\site-packages\httpx\_client.py in request(self, method, url, data, files, json, params, headers, cookies, auth, allow_redirects, timeout)
598 cookies=cookies,
599 )
--> 600 return self.send(
601 request, auth=auth, allow_redirects=allow_redirects, timeout=timeout,
602 )
~\Anaconda3\lib\site-packages\httpx\_client.py in send(self, request, stream, auth, allow_redirects, timeout)
618 auth = self.build_auth(request, auth)
619
--> 620 response = self.send_handling_redirects(
621 request, auth=auth, timeout=timeout, allow_redirects=allow_redirects,
622 )
~\Anaconda3\lib\site-packages\httpx\_client.py in send_handling_redirects(self, request, auth, timeout, allow_redirects, history)
645 raise TooManyRedirects()
646
--> 647 response = self.send_handling_auth(
648 request, auth=auth, timeout=timeout, history=history
649 )
~\Anaconda3\lib\site-packages\httpx\_client.py in send_handling_auth(self, request, history, auth, timeout)
682 request = next(auth_flow)
683 while True:
--> 684 response = self.send_single_request(request, timeout)
685 if auth.requires_response_body:
686 response.read()
~\Anaconda3\lib\site-packages\httpx\_client.py in send_single_request(self, request, timeout)
712 headers,
713 stream,
--> 714 ) = transport.request(
715 request.method.encode(),
716 request.url.raw,
~\Anaconda3\lib\site-packages\httpcore\_sync\connection_pool.py in request(self, method, url, headers, stream, timeout)
150
151 try:
--> 152 response = connection.request(
153 method, url, headers=headers, stream=stream, timeout=timeout
154 )
~\Anaconda3\lib\site-packages\httpcore\_sync\connection.py in request(self, method, url, headers, stream, timeout)
63 "open_socket origin=%r timeout=%r", self.origin, timeout
64 )
---> 65 self.socket = self._open_socket(timeout)
66 self._create_connection(self.socket)
67 elif self.state in (ConnectionState.READY, ConnectionState.IDLE):
~\Anaconda3\lib\site-packages\httpcore\_sync\connection.py in _open_socket(self, timeout)
83 ssl_context = self.ssl_context if scheme == b"https" else None
84 try:
---> 85 return self.backend.open_tcp_stream(
86 hostname, port, ssl_context, timeout
87 )
~\Anaconda3\lib\site-packages\httpcore\_backends\sync.py in open_tcp_stream(self, hostname, port, ssl_context, timeout)
137 sock, server_hostname=hostname.decode("ascii")
138 )
--> 139 return SyncSocketStream(sock=sock)
140
141 def create_lock(self) -> SyncLock:
~\Anaconda3\lib\contextlib.py in __exit__(self, type, value, traceback)
129 value = type()
130 try:
--> 131 self.gen.throw(type, value, traceback)
132 except StopIteration as exc:
133 # Suppress StopIteration *unless* it's the same exception that
~\Anaconda3\lib\site-packages\httpcore\_exceptions.py in map_exceptions(map)
10 for from_exc, to_exc in map.items():
11 if isinstance(exc, from_exc):
---> 12 raise to_exc(exc) from None
13 raise
14
ConnectError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1123)
I used the same code on another laptop and I got another Error. I want to translate a text which I don't know in which language is(different files in different languages) to English and I am not sure this is the best package to use for this so even some other packages that you are using and you are sure that works would be great. Thanks
I wouldn't recommend using this library. As stated in the documentation for this library:
Due to limitations of the web version of google translate, this API does not guarantee that the library would work properly at all times. (so please use this library if you don’t care about stability.)
Have you considered using the official Google Python client for the Google Translate API? https://cloud.google.com/translate/docs/reference/libraries/v2/python
Anyone know what's going on? I'm about to throw my computer out the window. Beautiful soup is working fine with the same page, so I know it's not the connection. And I've tried putting WebDriverWait and time.sleep(10) in between every line - after the delay, the same error comes up.
from bs4 import BeautifulSoup
import requests
import time, os
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
# Path to the chromedriver executable.
chromedriver = "/usr/bin/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

driver = webdriver.Chrome(chromedriver)
driver.get('https://www.boxofficemojo.com/year/2020/?ref_=bo_yl_table_1')

# NOTE(review): a bare `wait` statement only evaluates the WebDriverWait
# object and waits for nothing; an explicit wait happens only through
# wait.until(<condition>).  The no-op `wait` lines have been removed.
wait = WebDriverWait(driver, 10)

# Rows 2..268 of the second results table: click through to each film's
# page, scrape it, then navigate back to the index page.
for i in range(2, 269):
    # XPath attribute selectors use '@id'; '#id' is invalid XPath and makes
    # find_element_by_xpath raise an InvalidSelectorException.
    driver.find_element_by_xpath(
        '//*[@id="table"]/div/table[2]/tbody/tr[{}]/td[2]/a'.format(i)).click()
    get_movie_dict(driver.current_url)
    driver.back()
I receive the followed errors:
ConnectionRefusedError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in _new_conn(self)
156 conn = connection.create_connection(
--> 157 (self._dns_host, self.port), self.timeout, **extra_kw
158 )
~/anaconda3/lib/python3.7/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
83 if err is not None:
---> 84 raise err
85
~/anaconda3/lib/python3.7/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
73 sock.bind(source_address)
---> 74 sock.connect(sa)
75 return sock
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
671 headers=headers,
--> 672 chunked=chunked,
673 )
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
375 try:
--> 376 self._validate_conn(conn)
377 except (SocketTimeout, BaseSSLError) as e:
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _validate_conn(self, conn)
993 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
--> 994 conn.connect()
995
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in connect(self)
299 # Add certificate verification
--> 300 conn = self._new_conn()
301 hostname = self.host
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in _new_conn(self)
168 raise NewConnectionError(
--> 169 self, "Failed to establish a new connection: %s" % e
170 )
NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x7f42d8fec710>: Failed to establish a new connection: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
719 retries = retries.increment(
--> 720 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
721 )
~/anaconda3/lib/python3.7/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
435 if new_retry.is_exhausted():
--> 436 raise MaxRetryError(_pool, url, error or ResponseError(cause))
437
MaxRetryError: HTTPSConnectionPool(host='www.boxofficemojo.comhttps', port=443): Max retries exceeded with url: //www.boxofficemojo.com/release/rl1182631425/?ref_=bo_yld_table_1 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f42d8fec710>: Failed to establish a new connection: [Errno 111] Connection refused'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-55-0dd26218976b> in <module>
9 driver.find_element_by_xpath('//*[#id="table"]/div/table[2]/tbody/tr[{}]/td[2]/a'.format(i)).click()
10 wait
---> 11 get_movie_dict(driver.current_url)
12 wait
13 i += 1
<ipython-input-45-2533561becb9> in get_movie_dict(link)
19 wait = WebDriverWait(driver, 10)
20 wait
---> 21 response = requests.get(url)
22 wait
23 page = response.text
~/anaconda3/lib/python3.7/site-packages/requests/api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
~/anaconda3/lib/python3.7/site-packages/requests/api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
514 raise SSLError(e, request=request)
515
--> 516 raise ConnectionError(e, request=request)
517
518 except ClosedPoolError as e:
ConnectionError: HTTPSConnectionPool(host='www.boxofficemojo.comhttps', port=443): Max retries exceeded with url: //www.boxofficemojo.com/release/rl1182631425/?ref_=bo_yld_table_1 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f42d8fec710>: Failed to establish a new connection: [Errno 111] Connection refused'))
Edit: Added the function get_movie_dict:
def get_movie_dict(link):
    '''
    From a BoxOfficeMojo link stub, request the movie HTML, parse it with
    BeautifulSoup, and collect
    - title
    - domestic gross
    - runtime
    - MPAA rating
    - full release date
    - budget
    Return the information as a dictionary keyed by movie_title /
    domestic_total_gross / runtime_minutes / rating / release_date / budget.
    '''
    from urllib.parse import urljoin

    base_url = 'https://www.boxofficemojo.com'
    # `link` may be relative or already absolute; urljoin handles both,
    # whereas plain concatenation produced broken hosts like
    # "www.boxofficemojo.comhttps" (the error seen in the traceback).
    url = urljoin(base_url, link)

    # Request HTML and parse.
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    headers = ['movie_title', 'domestic_total_gross',
               'runtime_minutes', 'rating', 'release_date', 'budget']

    # Title: "<Movie> - Box Office Mojo" -> "<Movie>".
    title_string = soup.find('title').text
    title = title_string.split('-')[0].strip()

    # Domestic gross.
    try:
        raw_domestic_total_gross = (soup.find(class_='mojo-performance-summary-table')
                                    .find_all('span', class_='money')[0]
                                    .text
                                    )
    except (AttributeError, IndexError):
        # Summary table or money span missing from the page.
        raw_domestic_total_gross = float("NaN")
    # NOTE: the original tested type(...) == 'NoneType' -- comparing a type
    # object to a string is always False; check for None directly instead.
    if isinstance(raw_domestic_total_gross, float) or raw_domestic_total_gross is None:
        print('This is NaN')
        domestic_total_gross = float("NaN")
    else:
        domestic_total_gross = money_to_int(raw_domestic_total_gross)

    # Runtime: default to NaN so the dict build below never raises
    # UnboundLocalError when the value is missing (original left `runtime`
    # unassigned in that case).
    runtime = float("NaN")
    raw_runtime = get_movie_value(soup, 'Running')
    if raw_runtime is not None and not isinstance(raw_runtime, float):
        runtime = runtime_to_minutes(raw_runtime)

    # MPAA rating.
    rating = get_movie_value(soup, 'MPAA')

    # Release date: strip either a "-..." or a "(...)" suffix.  The original
    # elif/else branches were identical, so they are merged.
    # NOTE(review): assumes get_movie_value(soup, 'Release Date') returns a
    # string; a None here raises TypeError -- confirm against the helper.
    raw_release_date = get_movie_value(soup, 'Release Date')
    if '-' in raw_release_date:
        raw_release_date = raw_release_date.split('-')[0]
    else:
        raw_release_date = raw_release_date.split('(')[0]
    release_date = to_date(raw_release_date)

    # Budget.
    raw_budget = get_movie_value(soup, 'Budget')
    budget = money_to_int(raw_budget)

    # Create movie dictionary and return.
    movie_dict = dict(zip(headers, [title,
                                    domestic_total_gross,
                                    runtime,
                                    rating,
                                    release_date,
                                    budget]))
    return movie_dict
The link that you are extracting from the page is "absolute" (it includes the scheme and hostname), when you add this to base_url you are getting a string that looks like https://www.boxofficemojo.comhttps://www.boxofficemojo.com/release/rl1182631425/?ref_=bo_yld_table_1
You should use urljoin to join the base url with the extracted url as it will handle both relative and absolute urls.
from urllib.parse import urljoin
url = urljoin(base_url, link)
I found out what the issue was. After removing this block from get_movie_dict the function worked properly:
#Request HTML and parse
wait = WebDriverWait(driver, 10)
wait
response = requests.get(url)
wait
page = response.text
soup = BeautifulSoup(page,"lxml")
get_movie_dict is a helper function. The line with response = requests.get(url) was attempting to send another, unrelated GET request inside the helper function, which was unnecessary because one had already been sent outside of it - this is what was causing the problem.
This is an example of why it's important to understand what each line of code is doing, before copying and pasting it into your own code.
I'm trying to automate downloads of specific part prices and quantities from Octopart using Python. I'm able to convert the csv file with the specific part numbers I want to look up into line items and queries, but keep getting an HTTPError message when I try to send the queries to REST API for part matching. I entered in my apikey but since it still doesn't connect, I'm wondering if I wrote the url incorrectly. Any guidance would be appreciated.
Code:
# Send queries to REST API for part matching.
import json
import urllib.parse
import urllib.request
# Send queries to the REST API for part matching.
results = []
for i in range(0, len(queries), 20):
    # Batch queries in groups of 20 -- the query limit of the
    # parts/match endpoint.
    batched_queries = queries[i: i + 20]
    url = 'http://octopart.com/api/v3/parts/match?queries=%s' \
        % urllib.parse.quote(json.dumps(batched_queries))
    url += '&apikey=eb49732b'
    # urlopen returns an HTTPResponse object; json.loads needs the body
    # (bytes/str), so read() it first.  The original passed the response
    # object straight to json.loads, which raises TypeError.
    data = urllib.request.urlopen(url).read()
    response = json.loads(data)
    # Record results for analysis.
    results.extend(response['results'])
Error:
HTTPError Traceback (most recent call last)
<ipython-input-43-cf5776fdc754> in <module>()
14 url = 'http://octopart.com/api/v3/parts/match?queries=%s' % urllib.parse.quote(json.dumps(batched_queries))
15 url += '&apikey=eb49732b'
---> 16 data = urllib.request.urlopen(url)
17 response = json.loads(data)
18
~\Documents\Software\Anaconda\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Documents\Software\Anaconda\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Documents\Software\Anaconda\lib\urllib\request.py in http_response(self, request, response)
639 if not (200 <= code < 300):
640 response = self.parent.error(
--> 641 'http', request, response, code, msg, hdrs)
642
643 return response
~\Documents\Software\Anaconda\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Documents\Software\Anaconda\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
~\Documents\Software\Anaconda\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Forbidden
Thank you for your help!
Check your API key, or contact them and ask about permissions.
When I tried with curl sample using your key, it also fails with 403
$ curl -G https://octopart.com/api/v3/parts/match -d queries="[{\"mpn\":\"SN74S74N\"}]" \
-d apikey=eb49732b \
-d pretty_print=true
{
"__class__": "ClientErrorResponse",
"message": "Forbidden request"
}
However with EXAMPLE_KEY the query above succeeds
Try the following code with your api key... if it doesn't work then your key is probably invalidated.
import json
import urllib
import urllib.parse
import urllib.request
# Example line items exercising each supported match field:
# mpn, sku, mpn_or_sku, and brand+mpn.  Each 'reference' tags a query so
# it can be matched back to its entry in the results.
queries = [
{'mpn': 'SN74S74N',
'reference': 'line1'},
{'sku': '67K1122',
'reference': 'line2'},
{'mpn_or_sku': 'SN74S74N',
'reference': 'line3'},
{'brand': 'Texas Instruments',
'mpn': 'SN74S74N',
'reference': 'line4'}
]
# Build the parts/match request URL with the queries JSON-encoded and
# percent-escaped into the query string.
url = 'http://octopart.com/api/v3/parts/match?queries=%s' \
% urllib.parse.quote(json.dumps(queries))
# Ask the API to include part specs in each result.
url += "&include[]=specs"
# NOTE: Use your API key here (https://octopart.com/api/register)
url += '&apikey=<REPLACEME>'
# read() the HTTP response body so json.loads receives bytes, not the
# HTTPResponse object itself.
data = urllib.request.urlopen(url).read()
response = json.loads(data)
# print request time (in milliseconds)
print("Response time: %s msec\n" % response['msec'])
I am still new to python and can't figure out how to handle this error and what to do with it to avoid it even after trying to understand the different methods of the Requests module and reading out there.
Here's the simple request I use where line loops through a text file with the different URL I'm trying to access and d a list of dictionary containing the many URLs I'm using as proxies.
import requests
import collections
# [...]
# NOTE(review): `proxies` is assumed to be a list of requests-style proxy
# dicts ({'http': ..., 'https': ...}) -- confirm against where it is built.
d = collections.deque(proxies)
with requests.session() as r:
    # Advance the rotation, then issue the request through whichever proxy
    # is now at the head of the deque.  `line` is one URL read from the
    # input file; rstrip() drops the trailing newline.
    d.rotate(-1)
    page = r.get(line.rstrip(), proxies=d[0])
It works perfectly until one of the proxies from the list timeout for some reason and force the script to raise this error:
ProxyError Traceback (most recent call last)
C:\Python27\lib\site-packages\IPython\utils\py3compat.pyc in execfile(fname, glob, loc)
195 else:
196 filename = fname
--> 197 exec compile(scripttext, filename, 'exec') in glob, loc
198 else:
199 def execfile(fname, *where):
C:\Users\Christopher Fargere\desktop\python\quick_scraper.py in <module>()
72 with requests.session() as r:
73 d.rotate(-1)
---> 74 page = r.get(line.rstrip(), proxies=d[0])
75 print d[0]
76 print page.status_code
C:\Python27\lib\site-packages\requests\sessions.pyc in get(self, url, **kwargs)
393
394 kwargs.setdefault('allow_redirects', True)
--> 395 return self.request('GET', url, **kwargs)
396
397 def options(self, url, **kwargs):
C:\Python27\lib\site-packages\requests\sessions.pyc in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert)
381 'allow_redirects': allow_redirects,
382 }
--> 383 resp = self.send(prep, **send_kwargs)
384
385 return resp
C:\Python27\lib\site-packages\requests\sessions.pyc in send(self, request, **kwargs)
484 start = datetime.utcnow()
485 # Send the request
--> 486 r = adapter.send(request, **kwargs)
487 # Total elapsed time of the request (approximately)
488 r.elapsed = datetime.utcnow() - start
C:\Python27\lib\site-packages\requests\adapters.pyc in send(self, request, stream, timeout, verify, cert, proxies)
379
380 except _ProxyError as e:
--> 381 raise ProxyError(e)
382
383 except (_SSLError, _HTTPError) as e:
ProxyError: Cannot connect to proxy. Socket error: [Errno 11001] getaddrinfo failed.
I would love to implement an IF condition so that, when an error is raised, the failing proxy is popped out of the d list and the same URL is retried. I'm sure it's very simple, but I can't understand how errors are raised and handled in Python. :(
To catch an exception, use exception handling; catch the ProxyError thrown:
from requests.exceptions import ProxyError
# Try each proxy in the pool at most once; keep the first response that
# arrives without a ProxyError.
page = None
with requests.session() as r:
    attempts_left = len(d)
    while attempts_left > 0:
        attempts_left -= 1
        # Rotate so a fresh proxy sits at the head of the deque.
        d.rotate(-1)
        try:
            page = r.get(line.rstrip(), proxies=d[0])
            # Success -- stop trying further proxies.
            break
        except ProxyError:
            # This proxy failed; fall through to the next one.
            continue
if page is None:
    # Every proxy in the pool failed for this URL; surface that.
    raise ProxyError
This tries, at most, all your proxies in d, one by one. If none of them worked, we raise the ProxyError again, as you probably want to know that all your proxies failed at that time.