Popup data scraping on a given page range - Python

I have code that parses popup info (emails), but I am unable to go through every page; it's only scraping one page. So how could I add a range of pages to this code?
import re
import requests
from bs4 import BeautifulSoup

base_url = 'https://tenders.procurement.gov.ge/public/?lang=en'
url = 'https://tenders.procurement.gov.ge/public/library/controller.php?action=org_list'
profile_url = 'https://tenders.procurement.gov.ge/public/library/controller.php?action=profile&org_id='

num = re.compile(r'(\d+)')

with requests.session() as s:
    # load cookies:
    s.get(base_url)

    soup = BeautifulSoup(s.get(url).content, 'html.parser')
    for tr in soup.select('tr[onclick]'):
        n = num.search(tr['onclick']).group(1)
        soup2 = BeautifulSoup(s.get(profile_url + n).content, 'html.parser')
        email = soup2.select_one('td:contains("E-Mail") + td')
        print(email.text)
This code was offered by Andrej Kesely, and many thanks to him.
So what should be added to this code to scrape pages in a given range?

You can try this code to load the next pages:
import re
import requests
from bs4 import BeautifulSoup

base_url = 'https://tenders.procurement.gov.ge/public/?lang=en'
url = 'https://tenders.procurement.gov.ge/public/library/controller.php?action=org_list'
profile_url = 'https://tenders.procurement.gov.ge/public/library/controller.php?action=profile&org_id='
next_url = 'https://tenders.procurement.gov.ge/public/library/controller.php?action=org_list&search_org_type=0&page=next&blacklisted=0'

num = re.compile(r'(\d+)')

with requests.session() as s:
    # load cookies:
    s.get(base_url)

    soup = BeautifulSoup(s.get(url).content, 'html.parser')
    while True:
        if not soup.select('tr[onclick]'):
            break
        for tr in soup.select('tr[onclick]'):
            n = num.search(tr['onclick']).group(1)
            soup2 = BeautifulSoup(s.get(profile_url + n).content, 'html.parser')
            email = soup2.select_one('td:contains("E-Mail") + td')
            print(email.text)
        soup = BeautifulSoup(s.get(next_url).content, 'html.parser')

Thanks Andrej, your code is working as always! ;) But after about 9000 emails I get an error, so how could I add some variable for a range of pages? For example, I need emails 9000 to 12000 (or the same thing as a range of pages).
Traceback (most recent call last):
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\urllib3\connection.py", line 141, in _new_conn
(self.host, self.port), self.timeout, **extra_kw)
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\urllib3\util\connection.py", line 83, in create_connection
raise err
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\urllib3\util\connection.py", line 73, in create_connection
sock.connect(sa)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
chunked=chunked)
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\urllib3\connectionpool.py", line 345, in _make_request
self._validate_conn(conn)
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\urllib3\connectionpool.py", line 844, in _validate_conn
conn.connect()
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\urllib3\connection.py", line 284, in connect
conn = self._new_conn()
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\urllib3\connection.py", line 150, in _new_conn
self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x00000074893451D0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\requests\adapters.py", line 440, in send
timeout=timeout
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\urllib3\connectionpool.py", line 649, in urlopen
_stacktrace=sys.exc_info()[2])
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\urllib3\util\retry.py", line 388, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='tenders.procurement.gov.ge', port=443): Max retries exceeded with url: /public/library/controller.php?action=profile&org_id=67358 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000074893451D0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/Dell/PycharmProjects/Scrap/main.py", line 25, in
soup2 = BeautifulSoup(s.get(profile_url + n).content, 'html.parser')
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\requests\sessions.py", line 536, in get
return self.request('GET', url, **kwargs)
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\requests\sessions.py", line 523, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\requests\sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "C:\Users\Dell\PycharmProjects\Scrap\venv\lib\site-packages\requests\adapters.py", line 504, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='tenders.procurement.gov.ge', port=443): Max retries exceeded with url: /public/library/controller.php?action=profile&org_id=67358 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000074893451D0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
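One way to limit the run to a given range (a sketch of my own on top of the code above, not something from the thread): count pages as you follow the "next" link, skip pages before the start of the range, stop after the end, and retry each profile request a few times so a dropped connection like the one above doesn't kill the whole run. The start_page/end_page values, the 5-attempt budget, and the timeouts are placeholder assumptions:

import re
import time

import requests
from bs4 import BeautifulSoup

base_url = 'https://tenders.procurement.gov.ge/public/?lang=en'
url = 'https://tenders.procurement.gov.ge/public/library/controller.php?action=org_list'
profile_url = 'https://tenders.procurement.gov.ge/public/library/controller.php?action=profile&org_id='
next_url = 'https://tenders.procurement.gov.ge/public/library/controller.php?action=org_list&search_org_type=0&page=next&blacklisted=0'

start_page, end_page = 300, 400  # placeholder page range
num = re.compile(r'(\d+)')

with requests.Session() as s:
    s.get(base_url)  # load cookies
    soup = BeautifulSoup(s.get(url).content, 'html.parser')
    page = 1
    while soup.select('tr[onclick]') and page <= end_page:
        if page >= start_page:
            for tr in soup.select('tr[onclick]'):
                n = num.search(tr['onclick']).group(1)
                soup2 = None
                for attempt in range(5):  # retry transient connection errors
                    try:
                        soup2 = BeautifulSoup(s.get(profile_url + n, timeout=30).content, 'html.parser')
                        break
                    except requests.exceptions.RequestException:
                        time.sleep(5)
                if soup2 is None:
                    continue  # skip this organisation after 5 failed attempts
                email = soup2.select_one('td:contains("E-Mail") + td')
                if email:
                    print(email.text)
        soup = BeautifulSoup(s.get(next_url).content, 'html.parser')
        page += 1

If you would rather think in emails than pages, divide the email counts by the number of rows per listing page to derive start_page and end_page.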

Related

TimeoutError: Large amount of data in requests python

I am trying to make a script that scrapes a presentation from a SlideShare link and downloads it as a PDF.
The script works fine as long as the total number of slides is under 20. Is there any alternative to requests in Python that can do the job?
Here is the script:
import requests
from bs4 import BeautifulSoup
from PIL import Image
import io

URL_LESS = "https://www.slideshare.net/angelucmex/global-warming-2373190?qid=8f04572c-48df-4f53-b2b0-0eb71021931c&v=&b=&from_search=1"
URL = "https://www.slideshare.net/tusharpanda88/python-basics-59573634?qid=03cb80ee-36f0-4241-a516-454ad64808a8&v=&b=&from_search=5"

r = requests.get(URL_LESS)
soup = BeautifulSoup(r.content, "html5lib")

imgs = soup.find_all('img', class_="slide-image")
imgSRC = [x.get("srcset").split(',')[0].strip().split(' ')[0].split('?')[0] for x in imgs]

imagesJPG = []
for img in imgSRC:
    im = requests.get(img)
    f = io.BytesIO(im.content)
    imgJPG = Image.open(f)
    imagesJPG.append(imgJPG)

imagesJPG[0].save(f"{soup.title.string}.pdf", save_all=True, append_images=imagesJPG[1:])
Try changing URL_LESS to URL; you will get the idea.
Here is the traceback:
Traceback (most recent call last):
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\urllib3\connection.py", line 174, in _new_conn
conn = connection.create_connection(
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\urllib3\util\connection.py", line 95, in create_connection
raise err
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\urllib3\util\connection.py", line 85, in create_connection
sock.connect(sa)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\urllib3\connectionpool.py", line 703, in urlopen
httplib_response = self._make_request(
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\urllib3\connectionpool.py", line 386, in _make_request
self._validate_conn(conn)
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\urllib3\connectionpool.py", line 1040, in _validate_conn
conn.connect()
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\urllib3\connection.py", line 358, in connect
conn = self._new_conn()
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\urllib3\connection.py", line 186, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x00000259643FF820>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\requests\adapters.py", line 440, in send
resp = conn.urlopen(
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\urllib3\connectionpool.py", line 785, in urlopen
retries = retries.increment(
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\urllib3\util\retry.py", line 592, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='image.slidesharecdn.com', port=443): Max retries exceeded with url: /pythonbasics-160315100530/85/python-basics-8-320.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000259643FF820>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "d:\Work\py\scrapingScripts\slideshare\main.py", line 16, in <module>
im = requests.get(img)
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\requests\sessions.py", line 529, in request
resp = self.send(prep, **send_kwargs)
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\requests\sessions.py", line 645, in send
r = adapter.send(request, **kwargs)
File "D:\Work\py\scrapingScripts\tkinter\env\lib\site-packages\requests\adapters.py", line 519, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='image.slidesharecdn.com', port=443): Max retries exceeded with url: /pythonbasics-160315100530/85/python-basics-8-320.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000259643FF820>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
The script worked perfectly for me with both URL and URL_LESS, so your internet connection might be the culprit here.
My guesses are:
You have a slow or inconsistent internet connection.
SlideShare is blacklisting your IP or user agent, maybe as DDoS protection (unlikely).
You're using IPv6, which has been the culprit in these kinds of cases for me; try switching your network to IPv4 only.
As for requests, I have personally used it to scrape a fairly large amount of data over a fairly long time, so I can say it's an excellent library to use.
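If the connection really is slow or inconsistent, you can also stay with requests and let it retry each image download automatically; a minimal sketch, assuming imgSRC has been built exactly as in your script (the retry counts and timeout are placeholder values):

import io

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from PIL import Image

imgSRC = []  # the image URLs, collected as in the question's script

# one session whose adapter retries transient failures with backoff
session = requests.Session()
retries = Retry(total=5, backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))

imagesJPG = []
for img in imgSRC:
    im = session.get(img, timeout=10)
    imagesJPG.append(Image.open(io.BytesIO(im.content)))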

'Failed to establish a new connection: [Errno 11001] getaddrinfo failed' in Python 3.9

This script is used to send data to a 3rd party. It seems to run fine when connected to any network other than my work network. When connected to my work network, it will run for a few minutes, PUT/POSTing as it should, and then suddenly the following error will appear:
File "C:\Users\user\AppData\Local\RPackages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connection.py", line 169, in _new_conn
conn = connection.create_connection(
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\util\connection.py", line 73, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.2032.0_x64__qbz5n2kfra8p0\lib\socket.py", line 954, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connectionpool.py", line 382, in _make_request
self._validate_conn(conn)
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connectionpool.py", line 1010, in _validate_conn
conn.connect()
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connection.py", line 353, in connect
conn = self._new_conn()
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connection.py", line 181, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x0000022FD9B44FA0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\adapters.py", line 439, in send
resp = conn.urlopen(
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connectionpool.py", line 755, in urlopen
retries = retries.increment(
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\util\retry.py", line 574, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='vaccine-verify-v1.services.school.edu', port=443): Max retries exceeded with url: /api/verify (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000022FD9B44FA0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\scripture\bali_vax.py", line 71, in <module>
post_vax_record(row)
File "C:\scripture\bali_vax.py", line 38, in post_vax_record
response = requests.post(url,json=vaxrecord, headers=headers, timeout=5)
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\api.py", line 117, in post
return request('post', url, data=data, json=json, **kwargs)
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='vaccine-verify-v1.services.school.edu', port=443): Max retries exceeded with url: /api/verify (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000022FD9B44FA0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Script
import timeit
start = timeit.default_timer()

import requests

url = 'https://vaccine-verify-v1.services.school.edu/api/verify'
client_id = "abc"
client_secret = "123"

def post_vax_record(record):
    manuallyVerified = record[7].lower() == "true"
    vaxrecord = {
        "firstName": record[0],
        "lastName": record[1],
        "dob": record[2],
        "phoneNumber": (record[3] or '4444444444'),
        "state": record[4],
        "zip": record[5][:5],
        "campusName": "Baltimore",
        "instituitionId": 0,
        "campusSuCd": 28,
        "sunyLanId": record[6],
        "sunyStudId": 0,
        "ssoId": "BEL-" + record[6],
        "manuallyVerified": manuallyVerified
    }
    headers = {'Client_Id': client_id, 'Client_Secret': client_secret, 'Content-Type': 'application/json'}
    response = requests.post(url, json=vaxrecord, headers=headers, timeout=5)
    if response:
        json_response = response.json()
        if ('message' in json_response and json_response['message'] == 'The Student record already exists. Please use put operation to insert the record.'):  # changed from update to insert
            print('Post failed. Trying put instead')
            response = requests.put(url, json=vaxrecord, headers=headers)
            if response:
                print('Success on put for ' + record[6])
            else:
                print('Put failed for ' + record[6])
        else:
            print('Success on post for ' + record[6])
            print(response.json())
    else:
        print('Post failed for ' + record[6] + ' with error code: ')
        print(response.status_code)

import csv

with open('data_file.csv') as csvDataFile:
    csvReader = csv.reader(csvDataFile)
    for row in csvReader:
        if row[6].upper().split('@')[1] in ("LIVE.LODI.EDU", "LODI.EDU"):
            if row[0].upper() != "FIRSTNAME":
                row[6] = row[6].upper().split('@')[0]
                post_vax_record(row)
        else:
            print('NON-LODI EMAIL FOUND FOR ' + row[1] + ', ' + row[0] + '. WILL NOT PROCESS RECORD!')

stop = timeit.default_timer()
print('Run Time: ', stop - start)
While testing this script with smaller CSVs (fewer records to read in), I never had an issue; this has only started occurring recently. Any help is greatly appreciated.
Thank you.
Please wrap the response = requests.post(url, json=vaxrecord, headers=headers, timeout=5) line in a try-except block:

import socket
import time

retry_counter = 0
while retry_counter < 5:
    try:
        response = requests.post(url, json=vaxrecord, headers=headers, timeout=5)
        break  # success, stop retrying
    except socket.error as error:
        print("Connection failed due to socket - {}".format(error))
        print("Attempt {} of 5".format(retry_counter + 1))
        time.sleep(3)
        retry_counter += 1

NOTE: When doing requests.post, if a socket.error occurs, retry the call 3-5 times with a delay of 1-3 seconds. Generally, retrying after a short wait resolves the issue in socket-error cases (it did in my case), hence the solution above.
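Alternatively, requests can do the retrying for you through urllib3's Retry class mounted on a Session; a sketch, assuming urllib3 >= 1.26 (where allowed_methods replaced method_whitelist), with url, vaxrecord, and headers as defined in the script above:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# POST and PUT must be listed explicitly because they are not idempotent
retry = Retry(total=5, backoff_factor=1,
              status_forcelist=[429, 500, 502, 503, 504],
              allowed_methods=["POST", "PUT"])
session.mount('https://', HTTPAdapter(max_retries=retry))

# url, vaxrecord, headers: as defined in post_vax_record above
response = session.post(url, json=vaxrecord, headers=headers, timeout=5)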

Connection problem and script breaks when scraping

I have a problem with the connection; maybe it's my internet's fault, but what can I do?
I created a module for scraping pages, like:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup

def scrape(page):
    session = requests.Session()
    retry = Retry(connect=5, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    url = page
    result = session.get(url)
    # result = requests.get(url, stream=True)
    if result.status_code == 200:
        # soup for the whole page content, without the description; the description
        # is loaded separately into soup2 because it is rendered by JS
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
and in a second script I send the URL addresses from a CSV file, about 8000 links,
and sometimes my script breaks with a message like this:
Traceback (most recent call last):
File "C:\Python\Python36\lib\site-packages\urllib3\connection.py", line 157, in _new_conn
(self._dns_host, self.port), self.timeout, **extra_kw
File "C:\Python\Python36\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
raise err
File "C:\Python\Python36\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
sock.connect(sa)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python\Python36\lib\site-packages\urllib3\connectionpool.py", line 672, in urlopen
chunked=chunked,
File "C:\Python\Python36\lib\site-packages\urllib3\connectionpool.py", line 376, in _make_request
self._validate_conn(conn)
File "C:\Python\Python36\lib\site-packages\urllib3\connectionpool.py", line 994, in _validate_conn
conn.connect()
File "C:\Python\Python36\lib\site-packages\urllib3\connection.py", line 334, in connect
conn = self._new_conn()
File "C:\Python\Python36\lib\site-packages\urllib3\connection.py", line 169, in _new_conn
self, "Failed to establish a new connection: %s" % e
urllib3.exceptions.NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x00000242E2F1CE80>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python\Python36\lib\site-packages\requests\adapters.py", line 449, in send
timeout=timeout
File "C:\Python\Python36\lib\site-packages\urllib3\connectionpool.py", line 720, in urlopen
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
File "C:\Python\Python36\lib\site-packages\urllib3\util\retry.py", line 436, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='gorabbit.pl', port=443): Max retries exceeded with url: /dep-551-1153lr-ld-em-001.html (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000242E2F1CE80>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "E:/Nomax/Rabit/GorabitToCSV.py", line 217, in <module>
map = read_csv(plik_wsadowy)
File "E:/Nomax/Rabit/GorabitToCSV.py", line 176, in read_csv
object_rabit = scrape(row[1])
File "E:\Nomax\Rabit\scraping_module.py", line 24, in scrape
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
File "C:\Python\Python36\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "C:\Python\Python36\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python\Python36\lib\site-packages\requests\sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python\Python36\lib\site-packages\requests\sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "C:\Python\Python36\lib\site-packages\requests\adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='gorabbit.pl', port=443): Max retries exceeded with url: /dep-551-1153lr-ld-em-001.html (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000242E2F1CE80>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
How can I avoid this? I changed plain requests to a session, but it sometimes fails too.
Sometimes the error shows up around URL 6000, sometimes when it's only 400 pages in.
It is unclear to me from your description just how reliable your connection really is, so let's just go for a very general solution:
def scrape(page):
    try:
        session = requests.Session()
        retry = Retry(connect=5, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        url = page
        result = session.get(url)
        # result = requests.get(url, stream=True)
        if result.status_code == 200:
            # reuse the response already fetched through the retrying session
            # instead of calling requests.get(url) a second time
            soup = BeautifulSoup(result.content, 'html.parser')
    except Exception:
        print("probably connection error...")
If you want to try again and again, just do:
def scrape_env(page):
    session = requests.Session()
    retry = Retry(connect=5, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    url = page
    result = session.get(url)
    # result = requests.get(url, stream=True)
    if result.status_code == 200:
        soup = BeautifulSoup(result.content, 'html.parser')

def scrape(page, num_of_tries):
    try:
        scrape_env(page)
    except Exception:
        if num_of_tries != 0:
            print("probably connection error... try number ", num_of_tries)
            num_of_tries -= 1
            scrape(page, num_of_tries)
        else:
            print("could not scrape, sorry")

How to check if there is internet connection

I have to check whether my computer is connected to the internet (i.e., if I open the browser, whether I can visit a URL or not). To do this task I tried some code:
The first is:
import socket

def internet_on():
    try:
        print("checking internet connection..")
        socket.setdefaulttimeout(5)
        host = socket.gethostbyname("www.google.com")
        s = socket.create_connection((host, 80), 2)
        s.close()
        print('internet on.')
        return True
    except Exception as e:
        print(e)
        print("internet off.")
        return False

internet_on()
code that I took from this answer: Checking internet connection with Python
After that I tried this:
from urllib.request import urlopen

def internet_on():
    try:
        urlopen("https://www.instagram.com/", timeout=5)
        return True
    except Exception as err:
        print(str(err))
        return False

internet_on()
code that I took from this answer: Checking network connection
And this:
import socket

REMOTE_SERVER = "www.google.com"

def internet_on(hostname):
    try:
        # see if we can resolve the host name -- tells us if there is
        # a DNS listening
        host = socket.gethostbyname(hostname)
        # connect to the host -- tells us if the host is actually
        # reachable
        s = socket.create_connection((host, 80), 2)
        return True
    except Exception:
        return False

internet_on(REMOTE_SERVER)
code that I took from this answer: Test if an internet connection is present in python
If the connection is active, all of these work fine,
but when there is no connection they all raise the same errors:
Traceback (most recent call last):
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
(self._dns_host, self.port), self.timeout, **extra_kw)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\util\connection.py", line 57, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "C:\Program Files\Python37-32\lib\socket.py", line 748, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
chunked=chunked)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connectionpool.py", line 343, in _make_request
self._validate_conn(conn)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connectionpool.py", line 839, in _validate_conn
conn.connect()
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connection.py", line 301, in connect
conn = self._new_conn()
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connection.py", line 168, in _new_conn
self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x03B39E90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\requests\adapters.py", line 449, in send
timeout=timeout
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connectionpool.py", line 638, in urlopen
_stacktrace=sys.exc_info()[2])
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\util\retry.py", line 398, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.instagram.com', port=443): Max retries exceeded with url: /_exploreurself (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x03B39E90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/mcara/OneDrive/Desktop/file InstaBot da editare/v0.4/bot.py", line 835, in <module>
bot.unfollow_process(RANDOM, sense=UP_DOWN, following_target=0, sleep_time=60)
File "C:/Users/mcara/OneDrive/Desktop/file InstaBot da editare/v0.4/bot.py", line 651, in unfollow_process
current_following = self.user_following_num(self._username)
File "C:/Users/mcara/OneDrive/Desktop/file InstaBot da editare/v0.4/bot.py", line 387, in user_following_num
r = requests.get(url).text
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\requests\sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\requests\sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\requests\adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='www.instagram.com', port=443): Max retries exceeded with url: /_exploreurself (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x03B39E90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Try this one. The IP address 216.58.192.142 is one of Google's IP addresses. Because of the hard-coded IP address, this code is not robust and may not always work, so replace the IP address with that of some other website that you believe responds faster.
The reason the code uses a fixed IP address instead of a fully qualified domain name (FQDN) is that an FQDN would require a DNS lookup. When the machine does not have a working internet connection, the DNS lookup itself may block the call to urlopen for more than a second.
import urllib2

def internet_on():
    try:
        urllib2.urlopen('http://216.58.192.142', timeout=1)
        return True
    except urllib2.URLError as err:
        return False
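Note that urllib2 is Python 2 only; since your traceback shows Python 3.7, a rough equivalent sketch with the Python 3 standard library would be:

from urllib.request import urlopen

def internet_on():
    try:
        # a fixed IP over plain HTTP avoids the DNS lookup entirely
        urlopen('http://216.58.192.142', timeout=1)
        return True
    except OSError:  # URLError is a subclass of OSError
        return False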
Try running a ping to www.google.com with the 'subprocess' library; an example of the code is below. This solution is an indirect one, but sometimes it is better to delegate to system commands to do certain jobs like this one.
import subprocess

def my_dir(my_path):
    output = ''
    p = None  # initialized so the finally block is safe if Popen itself fails
    try:
        p = subprocess.Popen('ping ' + my_path,
                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                             # close_fds=True,  # not valid on Windows platforms
                             shell=True)
        output, err = p.communicate()
        # print(output)
    finally:
        if p is not None:
            try:
                p.kill()
            except Exception:
                pass
    return output

print(my_dir('www.google.com'))
Then you can parse its output to learn whether you have reached Google's servers or not.
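As a simpler variant of the same idea, you could rely on ping's exit code instead of parsing its text output; a sketch for Windows, where the count flag is -n (on Linux it is -c):

import subprocess

def host_reachable(hostname):
    # exit code 0 means at least one reply came back
    completed = subprocess.run(['ping', '-n', '1', hostname],
                               stdout=subprocess.DEVNULL,
                               stderr=subprocess.DEVNULL)
    return completed.returncode == 0

print(host_reachable('www.google.com'))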

Webscraping with Python3.7: ConnectionError: HTTPSConnectionPool(host='www.google.com', port=443):

I want to scrape web results from google.com. I followed the first answer from this question: Google Search Web Scraping with Python. Unfortunately I am getting a connection error. I happened to check with other websites too; it's not connecting. Is it because of the corporate proxy settings?
Please note that I am using a virtual env, "Webscraping".
from urllib.parse import urlencode, urlparse, parse_qs
from lxml.html import fromstring
from requests import get

raw = get("https://www.google.com/search?q=StackOverflow").text
page = fromstring(raw)
for result in page.cssselect(".r a"):
    url = result.get("href")
    if url.startswith("/url?"):
        url = parse_qs(urlparse(url).query)['q']
        print(url[0])
raw = get("https://www.google.com/search?q=StackOverflow").text
Traceback (most recent call last):
  File "", line 1, in <module>
    raw = get("https://www.google.com/search?q=StackOverflow").text
  File "c:\users\appdata\local\programs\python\python37\webscraping\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "c:\users\appdata\local\programs\python\python37\webscraping\lib\site-packages\requests\api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "c:\users\appdata\local\programs\python\python37\webscraping\lib\site-packages\requests\sessions.py", line 524, in request
    resp = self.send(prep, **send_kwargs)
  File "c:\users\appdata\local\programs\python\python37\webscraping\lib\site-packages\requests\sessions.py", line 637, in send
    r = adapter.send(request, **kwargs)
  File "c:\users\appdata\local\programs\python\python37\webscraping\lib\site-packages\requests\adapters.py", line 516, in send
    raise ConnectionError(e, request=request)
ConnectionError: HTTPSConnectionPool(host='www.google.com', port=443): Max retries exceeded with url: /search?q=StackOverflow (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x0000021B79768748>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
Please advise. Thanks.
EDIT: I tried pinging google.com; it fails.
import os

hostname = "https://www.google.com"  # example (note: ping expects a bare hostname, and on Windows the count flag is -n, not -c)
response = os.system("ping -c 1 " + hostname)

# and then check the response...
if response == 0:
    print(hostname, 'is up!')
else:
    print(hostname, 'is down!')
https://www.google.com is down!
I think you are getting this error because of your proxy settings.
Try running one of the following commands in the command prompt:
set http_proxy=http://proxy_address:port
set http_proxy=http://user:password@proxy_address:port
set https_proxy=https://proxy_address:port
set https_proxy=https://user:password@proxy_address:port
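Equivalently, you can hand the proxy to requests directly instead of setting environment variables; a sketch, with user, password, proxy_address, and port as placeholders for your corporate proxy details:

import requests

# placeholder values -- substitute your corporate proxy details
proxies = {
    "http": "http://user:password@proxy_address:port",
    "https": "http://user:password@proxy_address:port",
}

raw = requests.get("https://www.google.com/search?q=StackOverflow",
                   proxies=proxies, timeout=10).text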
