Web scraping challenge - python

I'm learning how to use BeautifulSoup on a random Challonge bracket (as an exercise, because I would like to start scraping Challonge brackets).
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
r=requests.get('https://smashchile.challonge.com/ss1')
webpage= bs(r.content)
But I got an error (see at the bottom).
This is my first time web scraping and I would like to know more about the legal restrictions about it.
The Terms of Service say the following: (Link)
Use any robot, spider, scraper, or other automated means to access this Website or services for any purpose without our express written permission; however, this provision shall not apply to the indexing or updating of search engines.
Thanks in advance ;)
RemoteDisconnected Traceback (most recent call last)
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
705 headers=headers,
--> 706 chunked=chunked,
707 )
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
444 # Otherwise it looks like a bug in the code.
--> 445 six.raise_from(e, None)
446 except (SocketTimeout, BaseSSLError, SocketError) as e:
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
439 try:
--> 440 httplib_response = conn.getresponse()
441 except BaseException as e:
/usr/lib/python3.7/http/client.py in getresponse(self)
1335 try:
-> 1336 response.begin()
1337 except ConnectionError:
/usr/lib/python3.7/http/client.py in begin(self)
305 while True:
--> 306 version, status, reason = self._read_status()
307 if status != CONTINUE:
/usr/lib/python3.7/http/client.py in _read_status(self)
274 # sending a valid response.
--> 275 raise RemoteDisconnected("Remote end closed connection without"
276 " response")
RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
755 retries = retries.increment(
--> 756 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
757 )
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
530 if read is False or not self._is_method_retryable(method):
--> 531 raise six.reraise(type(error), error, _stacktrace)
532 elif read is not None:
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/packages/six.py in reraise(tp, value, tb)
733 if value.__traceback__ is not tb:
--> 734 raise value.with_traceback(tb)
735 raise value
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
705 headers=headers,
--> 706 chunked=chunked,
707 )
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
444 # Otherwise it looks like a bug in the code.
--> 445 six.raise_from(e, None)
446 except (SocketTimeout, BaseSSLError, SocketError) as e:
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
439 try:
--> 440 httplib_response = conn.getresponse()
441 except BaseException as e:
/usr/lib/python3.7/http/client.py in getresponse(self)
1335 try:
-> 1336 response.begin()
1337 except ConnectionError:
/usr/lib/python3.7/http/client.py in begin(self)
305 while True:
--> 306 version, status, reason = self._read_status()
307 if status != CONTINUE:
/usr/lib/python3.7/http/client.py in _read_status(self)
274 # sending a valid response.
--> 275 raise RemoteDisconnected("Remote end closed connection without"
276 " response")
ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-1-49ffef2d4435> in <module>
3 import pandas as pd
4
----> 5 r=requests.get('https://smashchile.challonge.com/ss1')
6 webpage= bs(r.content)
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/requests/api.py in get(url, params, **kwargs)
74
75 kwargs.setdefault('allow_redirects', True)
---> 76 return request('get', url, params=params, **kwargs)
77
78
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/requests/api.py in request(method, url, **kwargs)
59 # cases, and look like a memory leak in others.
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
62
63
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
540 }
541 send_kwargs.update(settings)
--> 542 resp = self.send(prep, **send_kwargs)
543
544 return resp
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
653
654 # Send the request
--> 655 r = adapter.send(request, **kwargs)
656
657 # Total elapsed time of the request (approximately)
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
496
497 except (ProtocolError, socket.error) as err:
--> 498 raise ConnectionError(err, request=request)
499
500 except MaxRetryError as e:
ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

To get that request to work, you need to send headers (at minimum a browser-like User-Agent). Otherwise, the server (correctly) thinks you're a bot and refuses the connection.
For example:
import requests
from bs4 import BeautifulSoup
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36",
}
webpage = BeautifulSoup(
requests.get('https://smashchile.challonge.com/ss1', headers=headers).content,
"html.parser",
).select_one(".user-profile-block>.details>*.-nopad")
print(webpage.getText(strip=True))
Output:
KLG | DDC | Keen
A side note: The website you're scraping is mostly dynamic, so you won't get much out of the content, as bs4 (BeautifulSoup) simply won't see anything that is rendered by JavaScript.

Related

Web Scraping / Zomato Web Scraping with BeautifulSoup

I tried web scraping referencing https://datascienceplus.com/zomato-web-scraping-with-beautifulsoup-in-python/
Just copied & pasted code on the site, but getting an error on the 2nd step.
import requests
from bs4 import BeautifulSoup
#Used headers/agent because the request was timed out and asking for an agent.
#Using following code we can fake the agent.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
response = requests.get("https://www.zomato.com/bangalore/top-restaurants",headers=headers)
Below is the error code.
I used Jupyter Notebook. Do you know why I'm getting this?
I'm totally new to this, and don't even fully understand what these variables (headers, response) do.
---------------------------------------------------------------------------
RemoteDisconnected Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
383 # otherwise it looks like a programming error was the cause.
--> 384 six.raise_from(e, None)
385 except (SocketTimeout, BaseSSLError, SocketError) as e:
~/anaconda3/lib/python3.7/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379 try:
--> 380 httplib_response = conn.getresponse()
381 except Exception as e:
~/anaconda3/lib/python3.7/http/client.py in getresponse(self)
1320 try:
-> 1321 response.begin()
1322 except ConnectionError:
~/anaconda3/lib/python3.7/http/client.py in begin(self)
295 while True:
--> 296 version, status, reason = self._read_status()
297 if status != CONTINUE:
~/anaconda3/lib/python3.7/http/client.py in _read_status(self)
264 # sending a valid response.
--> 265 raise RemoteDisconnected("Remote end closed connection without"
266 " response")
RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
637 retries = retries.increment(method, url, error=e, _pool=self,
--> 638 _stacktrace=sys.exc_info()[2])
639 retries.sleep()
~/anaconda3/lib/python3.7/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
367 if read is False or not self._is_method_retryable(method):
--> 368 raise six.reraise(type(error), error, _stacktrace)
369 elif read is not None:
~/anaconda3/lib/python3.7/site-packages/urllib3/packages/six.py in reraise(tp, value, tb)
684 if value.__traceback__ is not tb:
--> 685 raise value.with_traceback(tb)
686 raise value
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
383 # otherwise it looks like a programming error was the cause.
--> 384 six.raise_from(e, None)
385 except (SocketTimeout, BaseSSLError, SocketError) as e:
~/anaconda3/lib/python3.7/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379 try:
--> 380 httplib_response = conn.getresponse()
381 except Exception as e:
~/anaconda3/lib/python3.7/http/client.py in getresponse(self)
1320 try:
-> 1321 response.begin()
1322 except ConnectionError:
~/anaconda3/lib/python3.7/http/client.py in begin(self)
295 while True:
--> 296 version, status, reason = self._read_status()
297 if status != CONTINUE:
~/anaconda3/lib/python3.7/http/client.py in _read_status(self)
264 # sending a valid response.
--> 265 raise RemoteDisconnected("Remote end closed connection without"
266 " response")
ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-3-5f0caa95c89a> in <module>
2 #Using following code we can fake the agent.
3 headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
----> 4 response = requests.get("https://www.zomato.com/bangalore/top-restaurants",headers=headers)
~/anaconda3/lib/python3.7/site-packages/requests/api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
~/anaconda3/lib/python3.7/site-packages/requests/api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
496
497 except (ProtocolError, socket.error) as err:
--> 498 raise ConnectionError(err, request=request)
499
500 except MaxRetryError as e:
ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Server doesn't seem to like the user-agent you provided. You can shorten as follows:
import requests
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get("https://www.zomato.com/bangalore/top-restaurants",headers=headers)

API requests aborted

First of all, I can't expose my API keys due to privacy reasons, sorry about that. Let me know how I could better explain the situation.
On high level, here is my Python code:
for x in range(0, len(zone_ids)):
time.sleep(2)
response = requests.post(f'https://vemcount.app/api/v3/report?source=zones&data=
{zone_ids[x]}&data_output=count_out&period=date&form_date_from=2020-01-04&form_date_to=2020-01-
04&period_step=30min&show_hours_from=00:00&show_hours_to=23:45', headers=headers)
The problem is that this code sometimes runs successfully and sometimes fails. When it fails, some of the IDs/requests have actually been processed; it just stops in the middle of the for loop at some seemingly random request. I'm pretty sure it hasn't exceeded the API rate limit of 60 requests per minute, as I have embedded time.sleep(2) in my for loop. Here is the message:
---------------------------------------------------------------------------
RemoteDisconnected Traceback (most recent call last)
~\Anaconda3\envs\sa\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
698 # Make the request on the httplib connection object.
--> 699 httplib_response = self._make_request(
700 conn,
~\Anaconda3\envs\sa\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
444 # Otherwise it looks like a bug in the code.
--> 445 six.raise_from(e, None)
446 except (SocketTimeout, BaseSSLError, SocketError) as e:
~\Anaconda3\envs\sa\lib\site-packages\urllib3\packages\six.py in raise_from(value, from_value)
~\Anaconda3\envs\sa\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
439 try:
--> 440 httplib_response = conn.getresponse()
441 except BaseException as e:
~\Anaconda3\envs\sa\lib\http\client.py in getresponse(self)
1346 try:
-> 1347 response.begin()
1348 except ConnectionError:
~\Anaconda3\envs\sa\lib\http\client.py in begin(self)
306 while True:
--> 307 version, status, reason = self._read_status()
308 if status != CONTINUE:
~\Anaconda3\envs\sa\lib\http\client.py in _read_status(self)
275 # sending a valid response.
--> 276 raise RemoteDisconnected("Remote end closed connection without"
277 " response")
RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
~\Anaconda3\envs\sa\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
438 if not chunked:
--> 439 resp = conn.urlopen(
440 method=request.method,
~\Anaconda3\envs\sa\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
754
--> 755 retries = retries.increment(
756 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
~\Anaconda3\envs\sa\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
530 if read is False or not self._is_method_retryable(method):
--> 531 raise six.reraise(type(error), error, _stacktrace)
532 elif read is not None:
~\Anaconda3\envs\sa\lib\site-packages\urllib3\packages\six.py in reraise(tp, value, tb)
733 if value.__traceback__ is not tb:
--> 734 raise value.with_traceback(tb)
735 raise value
~\Anaconda3\envs\sa\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
698 # Make the request on the httplib connection object.
--> 699 httplib_response = self._make_request(
700 conn,
~\Anaconda3\envs\sa\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
444 # Otherwise it looks like a bug in the code.
--> 445 six.raise_from(e, None)
446 except (SocketTimeout, BaseSSLError, SocketError) as e:
~\Anaconda3\envs\sa\lib\site-packages\urllib3\packages\six.py in raise_from(value, from_value)
~\Anaconda3\envs\sa\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
439 try:
--> 440 httplib_response = conn.getresponse()
441 except BaseException as e:
~\Anaconda3\envs\sa\lib\http\client.py in getresponse(self)
1346 try:
-> 1347 response.begin()
1348 except ConnectionError:
~\Anaconda3\envs\sa\lib\http\client.py in begin(self)
306 while True:
--> 307 version, status, reason = self._read_status()
308 if status != CONTINUE:
~\Anaconda3\envs\sa\lib\http\client.py in _read_status(self)
275 # sending a valid response.
--> 276 raise RemoteDisconnected("Remote end closed connection without"
277 " response")
ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
c:\Users\Vzhao\repos\SocialMediaAnalytics\2_etl_scripts\Vemcount_Historical_Data.py in
74 s = requests.Session()
75 s.mount('https://', MyAdapter())
---> 76 response = s.post(f'https://vemcount.app/api/v3/report?source=zones&data={zone_ids[x]}&data_output=count_out&period=date&form_date_from=2020-01-04&form_date_to=2020-01-04&period_step=30min&show_hours_from=00:00&show_hours_to=23:45', headers=headers)
77 # response = requests.post(f'https://vemcount.app/api/v3/report?source=zones&data={zone_ids[x]}&data_output=count_out&period=date&form_date_from=2020-01-02&form_date_to=2020-01-02&period_step=30min&show_hours_from=00:00&show_hours_to=23:45', headers=headers)
78 vemcount = json.loads(response.text)
~\Anaconda3\envs\sa\lib\site-packages\requests\sessions.py in post(self, url, data, json, **kwargs)
588 """
589
--> 590 return self.request('POST', url, data=data, json=json, **kwargs)
591
592 def put(self, url, data=None, **kwargs):
~\Anaconda3\envs\sa\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
540 }
541 send_kwargs.update(settings)
--> 542 resp = self.send(prep, **send_kwargs)
543
544 return resp
~\Anaconda3\envs\sa\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
653
654 # Send the request
--> 655 r = adapter.send(request, **kwargs)
656
657 # Total elapsed time of the request (approximately)
~\Anaconda3\envs\sa\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
496
497 except (ProtocolError, socket.error) as err:
--> 498 raise ConnectionError(err, request=request)
499
500 except MaxRetryError as e:
ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

Python can't request with proxies to most servers

I got the following code, proxy is a public proxy I found on this site.
import requests
proxies = {
"https": "http://27.203.242.127:8060",
"http": "http://27.203.242.127:8060"}
url = "http://httpbin.org/ip"
url2 = "https://www.google.com/"
r1 = requests.get(url, proxies=proxies, headers={'User-Agent': 'Mozilla'})
r2 = requests.get(url2, proxies=proxies, headers={'User-Agent': 'Mozilla'})
r1 runs just fine but I get a long error message when I try to run r2.
First error I get is "BadStatusLine" and from there a few "During handling of the above exception, another exception occurred:" spawning ProtocolError and ConnectionError.
Full traceback:
---------------------------------------------------------------------------
BadStatusLine Traceback (most recent call last)
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
661 if is_new_proxy_conn:
--> 662 self._prepare_proxy(conn)
663
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _prepare_proxy(self, conn)
947 conn.set_tunnel(self._proxy_host, self.port, self.proxy_headers)
--> 948 conn.connect()
949
~\anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
307 # self._tunnel_host below.
--> 308 self._tunnel()
309 # Mark this connection as not reusable
~\anaconda3\lib\http\client.py in _tunnel(self)
915 response = self.response_class(self.sock, method=self._method)
--> 916 (version, code, message) = response._read_status()
917
~\anaconda3\lib\http\client.py in _read_status(self)
287 self._close_conn()
--> 288 raise BadStatusLine(line)
289
BadStatusLine: <html>
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
~\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
719 retries = retries.increment(
--> 720 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
721 )
~\anaconda3\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
399 if read is False or not self._is_method_retryable(method):
--> 400 raise six.reraise(type(error), error, _stacktrace)
401 elif read is not None:
~\anaconda3\lib\site-packages\urllib3\packages\six.py in reraise(tp, value, tb)
733 if value.__traceback__ is not tb:
--> 734 raise value.with_traceback(tb)
735 raise value
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
661 if is_new_proxy_conn:
--> 662 self._prepare_proxy(conn)
663
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _prepare_proxy(self, conn)
947 conn.set_tunnel(self._proxy_host, self.port, self.proxy_headers)
--> 948 conn.connect()
949
~\anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
307 # self._tunnel_host below.
--> 308 self._tunnel()
309 # Mark this connection as not reusable
~\anaconda3\lib\http\client.py in _tunnel(self)
915 response = self.response_class(self.sock, method=self._method)
--> 916 (version, code, message) = response._read_status()
917
~\anaconda3\lib\http\client.py in _read_status(self)
287 self._close_conn()
--> 288 raise BadStatusLine(line)
289
ProtocolError: ('Connection aborted.', BadStatusLine('<html>\r\n'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-77-db3266a345e8> in <module>
7 url = "http://httpbin.org/ip"
8 url2 = "https://www.google.com/"
----> 9 r = requests.get(url2, proxies=proxies, headers={'User-Agent': 'Mozilla'})
~\anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
~\anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
~\anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
~\anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
~\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
496
497 except (ProtocolError, socket.error) as err:
--> 498 raise ConnectionError(err, request=request)
499
500 except MaxRetryError as e:
ConnectionError: ('Connection aborted.', BadStatusLine('<html>\r\n'))
Sometimes you may need to initiate a session for certain requests. I have no idea if this is one of those cases, but I'd give it a go:
s = requests.Session()
s.proxies = {
"http": "http://10.10.10.10:8000",
"https": "http://10.10.10.10:8000",
}
r = s.get("http://toscrape.com")
This is referenced from here. Hope that helps!

Remote end closed connection without response (Python 3- Bugzilla)

I am currently working on a project that involves python-bugzilla module.
When I try to collect some bug data, I get the following error :
RemoteDisconnected : Remote end closed connection without response
api = Bugzilla(url)
product = ...
request = api.build_query(product=product, include_fields=["id"])
data = api.query(request)
ids = np.array([bug.id for bug in data]).reshape(-1)
n = ids.shape[0]
q = 500 #size of bug package
if q < n :
m = n%q
k = (n+q-m)/q
else:
k = n
ids_splitted = np.array_split(ids, k)
bugs = []
for ids_ in ids_splitted:
bugs = bugs + api.getbugs(ids_)
The complete error is :
---------------------------------------------------------------------------
RemoteDisconnected Traceback (most recent call last)
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
383 # otherwise it looks like a programming error was the cause.
--> 384 six.raise_from(e, None)
385 except (SocketTimeout, BaseSSLError, SocketError) as e:
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379 try:
--> 380 httplib_response = conn.getresponse()
381 except Exception as e:
~/anaconda3/envs/DataScience/lib/python3.7/http/client.py in getresponse(self)
1320 try:
-> 1321 response.begin()
1322 except ConnectionError:
~/anaconda3/envs/DataScience/lib/python3.7/http/client.py in begin(self)
295 while True:
--> 296 version, status, reason = self._read_status()
297 if status != CONTINUE:
~/anaconda3/envs/DataScience/lib/python3.7/http/client.py in _read_status(self)
264 # sending a valid response.
--> 265 raise RemoteDisconnected("Remote end closed connection without"
266 " response")
RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
637 retries = retries.increment(method, url, error=e, _pool=self,
--> 638 _stacktrace=sys.exc_info()[2])
639 retries.sleep()
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
366 if read is False or not self._is_method_retryable(method):
--> 367 raise six.reraise(type(error), error, _stacktrace)
368 elif read is not None:
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/urllib3/packages/six.py in reraise(tp, value, tb)
684 if value.__traceback__ is not tb:
--> 685 raise value.with_traceback(tb)
686 raise value
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
383 # otherwise it looks like a programming error was the cause.
--> 384 six.raise_from(e, None)
385 except (SocketTimeout, BaseSSLError, SocketError) as e:
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379 try:
--> 380 httplib_response = conn.getresponse()
381 except Exception as e:
~/anaconda3/envs/DataScience/lib/python3.7/http/client.py in getresponse(self)
1320 try:
-> 1321 response.begin()
1322 except ConnectionError:
~/anaconda3/envs/DataScience/lib/python3.7/http/client.py in begin(self)
295 while True:
--> 296 version, status, reason = self._read_status()
297 if status != CONTINUE:
~/anaconda3/envs/DataScience/lib/python3.7/http/client.py in _read_status(self)
264 # sending a valid response.
--> 265 raise RemoteDisconnected("Remote end closed connection without"
266 " response")
ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-9-820dd90f4151> in <module>
1 t0 = time()
2 request = api.build_query(product=product, include_fields=["id"])
----> 3 data = api.query(request)
4 ids = np.array([bug.id for bug in data]).reshape(-1)
5 n = ids.shape[0]
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/bugzilla/base.py in query(self, query)
1263 """
1264 try:
-> 1265 r = self._proxy.Bug.search(query)
1266 except Fault as e:
1267
~/anaconda3/envs/DataScience/lib/python3.7/xmlrpc/client.py in __call__(self, *args)
1110 return _Method(self.__send, "%s.%s" % (self.__name, name))
1111 def __call__(self, *args):
-> 1112 return self.__send(self.__name, args)
1113
1114 ##
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/bugzilla/transport.py in _ServerProxy__request(self, methodname, params)
102 # pylint: disable=no-member
103 ret = super(_BugzillaServerProxy,
--> 104 self)._ServerProxy__request(methodname, params)
105 # pylint: enable=no-member
106
~/anaconda3/envs/DataScience/lib/python3.7/xmlrpc/client.py in __request(self, methodname, params)
1450 self.__handler,
1451 request,
-> 1452 verbose=self.__verbose
1453 )
1454
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/bugzilla/transport.py in request(self, host, handler, request_body, verbose)
199 request_body = request_body.replace(b'\r', b'&#xd;')
200
--> 201 return self._request_helper(url, request_body)
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/bugzilla/transport.py in _request_helper(self, url, request_body)
162 try:
163 response = self.session.post(
--> 164 url, data=request_body, **self.request_defaults)
165
166 # We expect utf-8 from the server
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/requests/sessions.py in post(self, url, data, json, **kwargs)
579 """
580
--> 581 return self.request('POST', url, data=data, json=json, **kwargs)
582
583 def put(self, url, data=None, **kwargs):
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
~/anaconda3/envs/DataScience/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
496
497 except (ProtocolError, socket.error) as err:
--> 498 raise ConnectionError(err, request=request)
499
500 except MaxRetryError as e:
ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Can someone help me fix this? It is really strange, because my code worked about 2 weeks ago...
EDIT: I ran the code several times and noticed that it sometimes runs perfectly and sometimes stops with this error, so I assume the cause is not my code but something else. Since I am not an expert, I do not know what could be causing this. If someone can explain it to me, that would be great.
EDIT2: If someone has the same issue: it is due to server activity. Some servers limit how frequently they can be called. As a workaround, try making the code sleep between requests for, let's say, 1 second; that should do the job, but the runtime will increase.

How to fix 'Connection aborted.' error in Python with BeautifulSoup

I had been running this code daily for weeks with no error. This morning, it ran the for loop over 100 times properly, then gave a connection issue. Each time I have tried to run it since, it will run anywhere from 5 to 130 times, but always gives the connection error before completing.
I am still getting status codes of 200. I've seen some posts referencing 'memory leak' issues in Python, but I'm not sure how to figure out if that's the problem here. It's also strange because it had been working fine until today.
I have similar code for other pages on the same site that still runs correctly all the way through.
Here is the code:
import time

import requests
from bs4 import BeautifulSoup

# Scrape the member count from every timebank page.
# NOTE: `timebanks`, `total_timebanks` and `headers` are not defined in this
# snippet; they are expected to exist already (e.g. from an earlier cell).

MAX_ATTEMPTS = 3      # tries per page before the error is allowed to propagate
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the loop

updates = []
print(f'Getting {total_timebanks} timebank details... ')
for timebank in range(len(timebanks)):
    url = f"http://community.timebanks.org/{timebanks['slug'][timebank]}"

    # The server occasionally drops the connection without answering
    # ("Remote end closed connection without response"). Retry a few times
    # with a growing pause instead of losing everything collected so far.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            break
        except requests.exceptions.ConnectionError:
            if attempt == MAX_ATTEMPTS:
                raise
            time.sleep(2 ** attempt)

    soup = BeautifulSoup(res.content, 'lxml')
    update = {}
    # The member count is rendered like "1,234"; strip whitespace and the
    # thousands separator so only the digits remain.
    update['members'] = soup.find('div', {'class': 'views-field-field-num-users-value'}).span.text.strip().replace(',', '')
    updates.append(update)
    # Be polite to the server: pause between consecutive page requests.
    time.sleep(1)
And here is the full error message:
---------------------------------------------------------------------------
RemoteDisconnected Traceback (most recent call last)
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
383 # otherwise it looks like a programming error was the cause.
--> 384 six.raise_from(e, None)
385 except (SocketTimeout, BaseSSLError, SocketError) as e:
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379 try:
--> 380 httplib_response = conn.getresponse()
381 except Exception as e:
/anaconda3/envs/DSI-6/lib/python3.6/http/client.py in getresponse(self)
1330 try:
-> 1331 response.begin()
1332 except ConnectionError:
/anaconda3/envs/DSI-6/lib/python3.6/http/client.py in begin(self)
296 while True:
--> 297 version, status, reason = self._read_status()
298 if status != CONTINUE:
/anaconda3/envs/DSI-6/lib/python3.6/http/client.py in _read_status(self)
265 # sending a valid response.
--> 266 raise RemoteDisconnected("Remote end closed connection without"
267 " response")
RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
637 retries = retries.increment(method, url, error=e, _pool=self,
--> 638 _stacktrace=sys.exc_info()[2])
639 retries.sleep()
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
366 if read is False or not self._is_method_retryable(method):
--> 367 raise six.reraise(type(error), error, _stacktrace)
368 elif read is not None:
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/packages/six.py in reraise(tp, value, tb)
684 if value.__traceback__ is not tb:
--> 685 raise value.with_traceback(tb)
686 raise value
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
383 # otherwise it looks like a programming error was the cause.
--> 384 six.raise_from(e, None)
385 except (SocketTimeout, BaseSSLError, SocketError) as e:
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379 try:
--> 380 httplib_response = conn.getresponse()
381 except Exception as e:
/anaconda3/envs/DSI-6/lib/python3.6/http/client.py in getresponse(self)
1330 try:
-> 1331 response.begin()
1332 except ConnectionError:
/anaconda3/envs/DSI-6/lib/python3.6/http/client.py in begin(self)
296 while True:
--> 297 version, status, reason = self._read_status()
298 if status != CONTINUE:
/anaconda3/envs/DSI-6/lib/python3.6/http/client.py in _read_status(self)
265 # sending a valid response.
--> 266 raise RemoteDisconnected("Remote end closed connection without"
267 " response")
ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-17-31257fee2c23> in <module>
5 for timebank in range(len(timebanks)):
6 url = f"http://community.timebanks.org/{timebanks['slug'][timebank]}"
----> 7 res = requests.get(url, headers=headers)
8 soup = BeautifulSoup(res.content, 'lxml')
9
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/requests/api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/requests/api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
/anaconda3/envs/DSI-6/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
496
497 except (ProtocolError, socket.error) as err:
--> 498 raise ConnectionError(err, request=request)
499
500 except MaxRetryError as e:
ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
The problem seems to have resolved itself. With no changes to the code, it is back to running as expected this morning.
I don't have much insight as to why I had connection errors yesterday, but it does seem to have been an issue with the site, not the code.
Thanks for the responses! For reference, I had also tried increasing the sleep timer to 30 seconds, but that did not resolve the problem yesterday.

Categories