I want to scrape the linkedin profiles based on specific keywords but got the error here is my code:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
from tkinter import *
class Linkedin():
def getData(self):
driver = webdriver.Chrome('/home/danish-khan/scrapers/researchgate/chromedriver')
driver.get('https://www.linkedin.com/login')
driver.find_element_by_id('username').send_keys('danishkhankd237#gmail.com') #Enter username of linkedin account here
driver.find_element_by_id('password').send_keys('dankhanish446') #Enter Password of linkedin account here
driver.find_element_by_xpath("//button[#type='submit']").click()
#*********** Search Result ***************#
search_key = "data analyst" # Enter your Search key here to find people
key = search_key.split()
print('\nkeyword:', key)
keyword = ""
for key1 in key:
keyword = keyword + str(key1).capitalize() +"%20"
keyword = keyword.rstrip("%20")
print('\nkeyword2 :', keyword)
#global data
data = []
profile_links = []
for no in range(1,3):
start = "&page={}".format(no)
search_url = "https://www.linkedin.com/search/results/people/?keywords={}&origin=SUGGESTION{}".format(keyword,start)
driver.get(search_url)
# driver.maximize_window()
for scroll in range(2):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
search = BeautifulSoup(driver.page_source,'lxml')
for people in search.findAll('span', class_ = 't-16'):
profiles = people.find_all('a', attrs = {'class' : 'app-aware-link'})
count = 0
for i in profiles:
profiles2 = i['href']
print(profiles2)
profile_links.append(profiles2)
print("Going to scrape Page {} data".format(no))
print('\nprofile_links :', profile_links)
lent = 0
for people in profile_links:
#count = 0
# if count%2==0:
# lent+=1
print('Profile :', people)
driver.get(people)
print('\ngetting\n')
# #********** Profile Details **************#
card = BeautifulSoup(driver.page_source,'lxml')
try:
Name = card.find('h1', attrs = {'class' : 'text-heading-xlarge inline t-24 v-align-middle break-words'}).text
except:
Name = 'None'
try:
Work_at = (card.find('div', attrs = {'class' : 'text-body-medium break-words'}).text).strip()
except:
Work_at = "None"
try:
Image = card.find("img", attrs = {'loading' : 'lazy'})['src']
except:
Image = 'None'
try:
Education = card.find('h3', attrs = {'class' : 'pv-entity__school-name t-16 t-black t-bold'}).text
except:
Education = 'None'
try:
Location = soup.find('span', attrs = {'class' : 'text-body-small inline t-black--light break-words'}).text.strip()
except:
Location = 'None'
details = {
'Name' : 'hgf', #card.find('h1', attrs = {'class' : 'text-heading-xlarge inline t-24 v-align-middle break-words'}).text,
'Location' : '',
'Work_at' : '',
'Education' : '',
'Profile_image' : '',
'Website' : '',
'Email' : ''
}
details['Name'] = Name
print(details)
time.sleep(15)
driver.quit()
driver.quit()
def start(self):
self.getData()
if __name__ == "__main__":
obJH = Linkedin()
obJH.start()
firstly i want to collect all the url of the user profiles of specific kewyords like here data analyst and then go through all the profile urls to scrape specific data from these profiles but it only scrape two urls and not all the profiles urls and seconds when going through the list of urls i got the error:
python linkdn2.py
keyword: ['data', 'analyst']
keyword2 : Data%20Analyst
https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck
Going to scrape Page 1 data
profile_links : ['https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck']
Profile : https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck
getting
{'Name': 'Roshaan Khan', 'Location': '', 'Work_at': '', 'Education': '', 'Profile_image': '', 'Website': '', 'Email': ''}
https://www.linkedin.com/in/sabanasimbutt?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAB7iVNAB_l8blfjWUwqgsV-bkjV3X_3ODdk
Going to scrape Page 1 data
profile_links : ['https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck', 'https://www.linkedin.com/in/sabanasimbutt?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAB7iVNAB_l8blfjWUwqgsV-bkjV3X_3ODdk']
Profile : https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck
Traceback (most recent call last):
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connection.py", line 159, in _new_conn
conn = connection.create_connection(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/util/connection.py", line 84, in create_connection
raise err
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/util/connection.py", line 74, in create_connection
sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 387, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/usr/lib/python3.8/http/client.py", line 1255, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/lib/python3.8/http/client.py", line 1301, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/usr/lib/python3.8/http/client.py", line 1250, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/usr/lib/python3.8/http/client.py", line 1010, in _send_output
self.send(msg)
File "/usr/lib/python3.8/http/client.py", line 950, in send
self.connect()
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connection.py", line 187, in connect
conn = self._new_conn()
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connection.py", line 171, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7f431515f610>: Failed to establish a new connection: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "linkdn2.py", line 108, in <module>
obJH.start()
File "linkdn2.py", line 104, in start
self.getData()
File "linkdn2.py", line 55, in getData
driver.get(people)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
self.execute(Command.GET, {'url': url})
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 319, in execute
response = self.command_executor.execute(driver_command, params)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/remote_connection.py", line 374, in execute
return self._request(command_info[0], url, body=data)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/remote_connection.py", line 397, in _request
resp = self._conn.request(method, url, body=body, headers=headers)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/request.py", line 79, in request
return self.request_encode_body(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/request.py", line 171, in request_encode_body
return self.urlopen(method, url, **extra_kw)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/poolmanager.py", line 330, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 747, in urlopen
return self.urlopen(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 747, in urlopen
return self.urlopen(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 747, in urlopen
return self.urlopen(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 719, in urlopen
retries = retries.increment(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/util/retry.py", line 436, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=56707): Max retries exceeded with url: /session/b7431e8051979e6a9a308bdfd59bf60a/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f431515f610>: Failed to establish a new connection: [Errno 111] Connection refused'))
I have tried many ways to solve this but can't find solution.
Related
Here is a class I wrote to get billboard hot 100 songs by date.
The class uses requests to get website html text
It then uses beautifulsoup to parse the html
The parsing works well
the problem is the intermittent connection errors
import json
import time
from bs4 import BeautifulSoup
import requests
import datetime as DT
class BillBoardScraper():
def __init__(self) -> None:
self.top_100 = None
self.scraped_chart = None
def _scrape_chart(self, date):
url = 'https://www.billboard.com/charts/hot-100'
headers = {
"User-Agent": "Mozilla/5.0"
}
r = requests.get(
f'{url}/{date}', headers=headers)
bill_board_100_soup = BeautifulSoup(r.text, 'html.parser')
r = None
bill_board_100_results_soup = bill_board_100_soup.find_all(
"div", "o-chart-results-list-row-container")
return bill_board_100_results_soup
def _get_song_and_artist(self, idx):
for result_item in self.scraped_chart[idx].find_all('li'):
segment_struct = [tag.name for tag in result_item if tag.name]
if segment_struct == ['h3', 'span']:
song_and_artist = []
for tag in result_item:
if tag.string.strip():
song_and_artist.append(tag.string.strip())
return song_and_artist
def run_parser_and_archive_data(self, date):
self.top_100 = {}
self.scraped_chart = self._scrape_chart(date)
for i in range(0, 100):
song, artist = self._get_song_and_artist(i)
self.top_100[i+1] = {"track": song, "artist": artist, "date": date}
json_string = json.dumps(self.top_100)
with open(f'data_billboard/billboard_hot100_{date}.json', 'w') as outfile:
json.dump(json_string, outfile)
date = DT.date(2010, 3, 19)
n_weeks = 520
c_week = 1
while c_week <= n_weeks:
print(str(date))
top100 = BillBoardScraper()
top100.run_parser_and_archive_data(str(date))
date = date - DT.timedelta(days=7)
time.sleep(10)
Sporadically I receive the following error. Why does this happen? What can I do to mitigate this? Any feedback is appreciated
Traceback (most recent call last):
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 672, in urlopen
chunked=chunked,
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 1344, in getresponse
response.begin()
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 306, in begin
version, status, reason = self._read_status()
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 275, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\adapters.py", line 449, in send
timeout=timeout
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 720, in urlopen
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\util\retry.py", line 400, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\packages\six.py", line 734, in reraise
raise value.with_traceback(tb)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 672, in urlopen
chunked=chunked,
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 1344, in getresponse
response.begin()
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 306, in begin
version, status, reason = self._read_status()
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 275, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
urllib3.exceptions.ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "d:\airflow\plugins\api_billboard100.py", line 55, in <module>
top100.run_parser_and_archive_data(str(date))
File "d:\airflow\plugins\api_billboard100.py", line 39, in run_parser_and_archive_data
self.scraped_chart = self._scrape_chart(date)
File "d:\airflow\plugins\api_billboard100.py", line 20, in _scrape_chart
f'{url}/{date}', headers=headers)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
I am trying to get screenshots of a website for a project I am doing. However, when I run my code, it works the first time and gives me the screenshot. However, when the code loops to take another screenshot, a very long error message comes up saying that the connection has been refused. I am using python three and selenium on a macbook air
Here is my code
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument(f'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36')
driver = webdriver.Chrome('/Library/Frameworks/Python.framework/Versions/3.8/bin/chromedriver', options=options)
URL1 = 'https://www.accuweather.com/en/ie/moroe/1079356/current-weather/1079356'
URL2 = 'https://www.accuweather.com/en/ie/moroe/1079356/hourly-weather-forecast/1079356'
URL3 = 'https://weather.com/en-IE/weather/today/l/d71e95387799a552a061ec1550ac876dcc19b5d139adc6f51ba3b8bf7a6b96ed'
URL4 = 'https://weather.com/en-IE/weather/hourbyhour/l/d71e95387799a552a061ec1550ac876dcc19b5d139adc6f51ba3b8bf7a6b96ed#detailIndex4'
URL5 = 'https://www.met.ie/weather-forecast/moroe-limerick#forecasts'
while True:
current_time = datetime.now()
timenow = datetime.now()
timenow = str(timenow)
current_time = str(current_time)
new_str = ""
x = 0
for i in range(0, len(current_time)):
if i != 4 and i != 7 and i != 10 and i != 13 and i != 16:
new_str = new_str + current_time[i]
new_str = float(new_str)
new_str = new_str / 100
new_str = round(new_str, 0)
if new_str % 2:
x = x + 1
else:
driver.get(URL1)
S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
driver.set_window_size(S('Width'),S('Height')) # May need manual adjustment
driver.find_element_by_tag_name('body').screenshot('accu1' + timenow + '.png')
driver.quit()
time.sleep(61)
'''
driver.get(URL2)
S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
driver.set_window_size(S('Width'),S('Height')) # May need manual adjustment
driver.find_element_by_tag_name('body').screenshot('accu2' + timenow + '.png')
driver.quit()
driver.get(URL3)
S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
driver.set_window_size(S('Width'),S('Height')) # May need manual adjustment
driver.find_element_by_tag_name('body').screenshot('weatherchannel1' + timenow + '.png')
driver.quit()
driver.get(URL4)
S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
driver.set_window_size(S('Width'),S('Height')) # May need manual adjustment
driver.find_element_by_tag_name('body').screenshot('weatherchannel2' + timenow + '.png')
driver.quit()
driver.get(URL5)
S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
driver.set_window_size(S('Width'),S('Height')) # May need manual adjustment
driver.find_element_by_tag_name('body').screenshot('meteireann' + timenow + '.png')
driver.quit()
'''
and here is the error message
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connection.py", line 169, in _new_conn
conn = connection.create_connection(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/util/connection.py", line 96, in create_connection
raise err
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/util/connection.py", line 86, in create_connection
sock.connect(sa)
ConnectionRefusedError: [Errno 61] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connectionpool.py", line 394, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connection.py", line 234, in request
super(HTTPConnection, self).request(method, url, body=body, headers=headers)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1240, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1286, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1235, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1006, in _send_output
self.send(msg)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 946, in send
self.connect()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connection.py", line 200, in connect
conn = self._new_conn()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connection.py", line 181, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7fb4e73fd490>: Failed to establish a new connection: [Errno 61] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/hugophelan/Desktop/WeatherPiTest.py", line 39, in <module>
driver.get(URL1)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
self.execute(Command.GET, {'url': url})
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 319, in execute
response = self.command_executor.execute(driver_command, params)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/selenium/webdriver/remote/remote_connection.py", line 374, in execute
return self._request(command_info[0], url, body=data)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/selenium/webdriver/remote/remote_connection.py", line 397, in _request
resp = self._conn.request(method, url, body=body, headers=headers)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/request.py", line 78, in request
return self.request_encode_body(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/request.py", line 170, in request_encode_body
return self.urlopen(method, url, **extra_kw)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/poolmanager.py", line 375, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connectionpool.py", line 783, in urlopen
return self.urlopen(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connectionpool.py", line 783, in urlopen
return self.urlopen(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connectionpool.py", line 783, in urlopen
return self.urlopen(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connectionpool.py", line 755, in urlopen
retries = retries.increment(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/util/retry.py", line 573, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=54000): Max retries exceeded with url: /session/3bdcabee5f314f620196394cfedd7079/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fb4e73fd490>: Failed to establish a new connection: [Errno 61] Connection refused'))
I am trying to scrape content from a website but I am getting the below mentioned error
The method:
def scrape_newtimes():
"""Scrapes content from the NewTimes"""
url = 'https://www.newtimes.co.rw/'
r = requests.get(url, headers=HEADERS)
tree = fromstring(r.content)
links = tree.xpath('//div[#class="x-small-push clearfix"]/a/#href')
for link in links:
r = requests.get(link, headers=HEADERS)
blog_tree = fromstring(r.content)
paras = blog_tree.xpath('//div[#class="article-content"]/p')
para = extract_paratext(paras)
text = extract_text(para)
if not text:
continue
yield '"%s" %s' % (text, link)
The error I am getting:
>>> sc = scrape_newtimes()
>>> string_1 = next(sc)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "D:\Projects\bird\bird-env\bot.py", line 58, in scrape_newtimes
r = requests.get(link, headers=HEADERS)
File "D:\Projects\bird\venv\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "D:\Projects\bird\venv\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "D:\Projects\bird\venv\lib\site-packages\requests\sessions.py", line 519, in request
prep = self.prepare_request(req)
File "D:\Projects\bird\venv\lib\site-packages\requests\sessions.py", line 462, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "D:\Projects\bird\venv\lib\site-packages\requests\models.py", line 313, in prepare
self.prepare_url(url, params)
File "D:\Projects\bird\venv\lib\site-packages\requests\models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/news/londons-kings-college-launch-civil-service-programme-rwanda': No schema supplied. Perhaps you meant http:///news/londons-kings-college-launch-civil-service-programme-rwanda?
>>>
The exception basically tells you what is wrong:
requests.exceptions.MissingSchema: Invalid URL '/news/londons-kings-college-launch-civil-service-programme-rwanda': No schema supplied. Perhaps you meant http:///news/londons-kings-college-launch-civil-service-programme-rwanda?
Or with line wrapping the line:
Invalid URL '/news/londons-kings-college-launch-civil-service-programme-rwanda':
No schema supplied. Perhaps you meant
http:///news/londons-kings-college-launch-civil-service-programme-rwanda?
You link does not contain a complete URL
So basically this all stems from a previous question I had, so I'll post that question & my edit in its entirely below:
So I have a script I've been working with for a few days trying to get a list of emails from a csv I have, but now I've run into this roadblock. Here is the code:
import sys
try:
import urllib.request as urllib2
except ImportError:
import urllib2
import re
import csv
list1 = []
list2 = []
list3 = []
def addList():
with open('file.csv', 'rt') as f:
reader = csv.reader(f)
for row in reader:
for s in row:
list2.append(s)
def getAddress(url):
http = "http://"
https = "https://"
if http in url:
return url
elif https in url:
return url
else:
url = "http://" + url
return url
def parseAddress(url):
global list3
try:
website = urllib2.urlopen(getAddress(url))
html = website.read()
addys = re.findall('''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*#(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''', html, flags=re.IGNORECASE)
global list1
list1.append(addys)
except urllib2.HTTPError as err:
print ("Cannot retrieve URL: HTTP Error Code: "), err.code
list3.append(url)
except urllib2.URLError as err:
print ("Cannot retrive URL: ") + err.reason[1]
list3.append(url)
def execute():
global list2
addList()
totalNum = len(list2)
atNum = 1
for s in list2:
parseAddress(s)
print ("Processing ") + str(atNum) + (" out of ") + str(totalNum)
atNum = atNum + 1
print ("Completed. Emails parsed: ") + str(len(list1)) + "."
### MAIN
def main():
global list2
execute()
global list1
myFile = open("finishedFile.csv", "w+")
wr = csv.writer(myFile, quoting=csv.QUOTE_ALL)
for s in list1:
wr.writerow(s)
myFile.close
global list3
failFile = open("failedSites.csv", "w+")
write = csv.writer(failFile, quoting=csv.QUOTE_ALL)
for j in list3:
write.writerow(j)
failFile.close
main()
and when I run it I get this error:
Traceback (most recent call last):
File "pagescanner.py", line 85, in <module>
main()
File "pagescanner.py", line 71, in main
execute()
File "pagescanner.py", line 60, in execute
parseAddress(s)
File "pagescanner.py", line 42, in parseAddress
addys = re.findall('''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*#(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''', html, flags=re.IGNORECASE)
File "/usr/lib/python3.5/re.py", line 213, in findall
return _compile(pattern, flags).findall(string)
TypeError: cannot use a string pattern on a bytes-like object
So I've figured out that I need to figure out how to encode the html string into bytes for the encoding, and Tyler's answer below helped me do so but now I'm getting this error:
Traceback (most recent call last):
File "/usr/lib/python3.5/urllib/request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.5/http/client.py", line 1107, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.5/http/client.py", line 1152, in _send_request
self.endheaders(body)
File "/usr/lib/python3.5/http/client.py", line 1103, in endheaders
self._send_output(message_body)
File "/usr/lib/python3.5/http/client.py", line 934, in _send_output
self.send(msg)
File "/usr/lib/python3.5/http/client.py", line 877, in send
self.connect()
File "/usr/lib/python3.5/http/client.py", line 849, in connect
(self.host,self.port), self.timeout, self.source_address)
File "/usr/lib/python3.5/socket.py", line 712, in create_connection
raise err
File "/usr/lib/python3.5/socket.py", line 703, in create_connection
sock.connect(sa)
OSError: [Errno 22] Invalid argument
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pagescanner.py", line 39, in parseAddress
website = urllib2.urlopen(getAddress(url))
File "/usr/lib/python3.5/urllib/request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.5/urllib/request.py", line 466, in open
response = self._open(req, data)
File "/usr/lib/python3.5/urllib/request.py", line 484, in _open
'_open', req)
File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
result = func(*args)
File "/usr/lib/python3.5/urllib/request.py", line 1282, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.5/urllib/request.py", line 1256, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 22] Invalid argument>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pagescanner.py", line 85, in <module>
main()
File "pagescanner.py", line 71, in main
execute()
File "pagescanner.py", line 60, in execute
parseAddress(s)
File "pagescanner.py", line 51, in parseAddress
print ("Cannot retrive URL: ") + err.reason[1]
TypeError: 'OSError' object is not subscriptable
Does this mean that one of the urls from the list isn't a valid url? I thought I had finally removed all fo the bad urls from my csv file but I may need to take another look
I want to extract the covers for different journals on the cambridge university press website. The I want to save it as it's online ISSN. The following code works but after one or two journals, it gives me this error:
Traceback (most recent call last):
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connection
.py", line 141, in _new_conn
(self.host, self.port), self.timeout, **extra_kw)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\util\conne
ction.py", line 60, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\socket.py", line 745, in getaddr
info
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11004] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connection
pool.py", line 601, in urlopen
chunked=chunked)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connection
pool.py", line 357, in _make_request
conn.request(method, url, **httplib_request_kw)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1239, in r
equest
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1285, in _
send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1234, in e
ndheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1026, in _
send_output
self.send(msg)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 964, in se
nd
self.connect()
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connection
.py", line 166, in connect
conn = self._new_conn()
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connection
.py", line 150, in _new_conn
self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x030DB770>: Fai
led to establish a new connection: [Errno 11004] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\adapters.
py", line 440, in send
timeout=timeout
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connection
pool.py", line 639, in urlopen
_stacktrace=sys.exc_info()[2])
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\util\retry
.py", line 388, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='ore', port=80): Max retries exceeded with
url: /services/aop-file-manager/file/57f386d3efeebb2f18eac486 (Caused by NewConnectionError('<urlli
b3.connection.HTTPConnection object at 0x030DB770>: Failed to establish a new connection: [Errno 110
04] getaddrinfo failed',))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Boys\Documents\Python\python_work\Kudos\CUPgetcovers.py", line 19, in <module>
f.write(requests.get("http://" + imagefound).content)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\api.py",
line 72, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\api.py",
line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\sessions.
py", line 508, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\sessions.
py", line 618, in send
r = adapter.send(request, **kwargs)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\adapters.
py", line 508, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='ore', port=80): Max retries exceeded w
ith url: /services/aop-file-manager/file/57f386d3efeebb2f18eac486 (Caused by NewConnectionError('<ur
llib3.connection.HTTPConnection object at 0x030DB770>: Failed to establish a new connection: [Errno
11004] getaddrinfo failed',))
Process returned 1 (0x1) execution time : 4.373 s
Press any key to continue . . .
What am I doing wrong? I could not find any answers on google. It was working fine before.
Thank you in advance.
Edit:
launch.py:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
import csv
import requests
from time import sleep
with open('listoflinks.csv', encoding="utf8") as csvfile:
readCSV = csv.reader(csvfile, delimiter=',')
for row in readCSV:
http = httplib2.Http()
status, response = http.request(("https://www.cambridge.org" + row[0]))
soup = BeautifulSoup(response, "html.parser")
txt = (t.text for t in soup.find_all("span", class_="value"))
issn = next(t[:9] for t in txt if t.endswith("(Online)"))
for a in soup.find_all('a', attrs={'class' : 'image'}):
if a.img:
imagefound = (a.img['src'])
imagefound = imagefound[2:]
f = open((issn + ".jpg"),'wb')
f.write(requests.get("http://" + imagefound).content)
f.close()
listoflinks.csv:
/core/journals/journal-of-materials-research
/core/journals/journal-of-mechanics
/core/journals/journal-of-modern-african-studies
/core/journals/journal-of-navigation
/core/journals/journal-of-nutritional-science
/core/journals/journal-of-pacific-rim-psychology
/core/journals/journal-of-paleontology
/core/journals/journal-of-pension-economics-and-finance
/core/journals/journal-of-plasma-physics
/core/journals/journal-of-policy-history
/core/journals/journal-of-psychologists-and-counsellors-in-schools
/core/journals/journal-of-public-policy
/core/journals/journal-of-race-ethnicity-and-politics
/core/journals/journal-of-radiotherapy-in-practice
/core/journals/journal-of-relationships-research
/core/journals/journal-of-roman-archaeology
/core/journals/journal-of-roman-studies
/core/journals/journal-of-smoking-cessation
/core/journals/journal-of-social-policy
/core/journals/journal-of-southeast-asian-studies
/core/journals/journal-of-symbolic-logic
/core/journals/journal-of-the-american-philosophical-association
/core/journals/journal-of-the-australian-mathematical-society
/core/journals/journal-of-the-gilded-age-and-progressive-era
/core/journals/journal-of-the-history-of-economic-thought
/core/journals/journal-of-the-institute-of-mathematics-of-jussieu
/core/journals/journal-of-the-international-neuropsychological-society
/core/journals/journal-of-the-international-phonetic-association
/core/journals/journal-of-the-marine-biological-association-of-the-united-kingdom
/core/journals/journal-of-the-royal-asiatic-society
/core/journals/journal-of-the-society-for-american-music
/core/journals/journal-of-tropical-ecology
/core/journals/journal-of-tropical-psychology
/core/journals/journal-of-wine-economics
/core/journals/kantian-review
/core/journals/knowledge-engineering-review
/core/journals/language-and-cognition
/core/journals/language-in-society
/core/journals/language-teaching
/core/journals/language-variation-and-change
/core/journals/laser-and-particle-beams
/core/journals/latin-american-antiquity
/core/journals/latin-american-politics-and-society
/core/journals/law-and-history-review
/core/journals/legal-information-management
/core/journals/legal-studies
/core/journals/legal-theory
/core/journals/leiden-journal-of-international-law
/core/journals/libyan-studies
/core/journals/lichenologist
/core/journals/lms-journal-of-computation-and-mathematics
/core/journals/macroeconomic-dynamics
/core/journals/management-and-organization-review
/core/journals/mathematical-gazette
/core/journals/mathematical-proceedings-of-the-cambridge-philosophical-society
/core/journals/mathematical-structures-in-computer-science
/core/journals/mathematika
/core/journals/medical-history
/core/journals/medical-history-supplements
/core/journals/melanges-d-histoire-sociale
/core/journals/microscopy-and-microanalysis
/core/journals/microscopy-today
/core/journals/mineralogical-magazine
/core/journals/modern-american-history
/core/journals/modern-asian-studies
/core/journals/modern-intellectual-history
/core/journals/modern-italy
/core/journals/mrs-advances
/core/journals/mrs-bulletin
/core/journals/mrs-communications
/core/journals/mrs-energy-and-sustainability
/core/journals/mrs-online-proceedings-library-archive
/core/journals/nagoya-mathematical-journal
/core/journals/natural-language-engineering
/core/journals/netherlands-journal-of-geosciences
/core/journals/network-science
/core/journals/new-perspectives-on-turkey
/core/journals/new-surveys-in-the-classics
/core/journals/new-testament-studies
/core/journals/new-theatre-quarterly
/core/journals/nineteenth-century-music-review
/core/journals/nordic-journal-of-linguistics
/core/journals/numerical-mathematics-theory-methods-and-applications
/core/journals/nutrition-research-reviews
/core/journals/organised-sound
/core/journals/oryx
/core/journals/paleobiology
/core/journals/the-paleontological-society-papers
/core/journals/palliative-and-supportive-care
/core/journals/papers-of-the-british-school-at-rome
/core/journals/parasitology
/core/journals/parasitology-open
/core/journals/personality-neuroscience
/core/journals/perspectives-on-politics
/core/journals/philosophy
/core/journals/phonology
/core/journals/plainsong-and-medieval-music
/core/journals/plant-genetic-resources
/core/journals/polar-record
/core/journals/political-analysis
/core/journals/political-science-research-and-methods
/core/journals/politics-and-gender
/core/journals/politics-and-religion
/core/journals/politics-and-the-life-sciences
/core/journals/popular-music
/core/journals/powder-diffraction
/core/journals/prehospital-and-disaster-medicine
/core/journals/primary-health-care-research-and-development
/core/journals/probability-in-the-engineering-and-informational-sciences
/core/journals/proceedings-of-the-asil-annual-meeting
/core/journals/proceedings-of-the-edinburgh-mathematical-society
/core/journals/proceedings-of-the-international-astronomical-union
/core/journals/proceedings-of-the-nutrition-society
/core/journals/proceedings-of-the-prehistoric-society
/core/journals/proceedings-of-the-royal-society-of-edinburgh-section-a-mathematics
/core/journals/ps-political-science-and-politics
/core/journals/psychological-medicine
/core/journals/public-health-nutrition
/core/journals/publications-of-the-astronomical-society-of-australia
/core/journals/quarterly-reviews-of-biophysics
/core/journals/quaternary-research
/core/journals/queensland-review
/core/journals/radiocarbon
/core/journals/ramus
/core/journals/recall
/core/journals/religious-studies
/core/journals/renewable-agriculture-and-food-systems
/core/journals/review-of-international-studies
/core/journals/review-of-middle-east-studies
/core/journals/review-of-politics
/core/journals/review-of-symbolic-logic
/core/journals/revista-de-historia-economica-journal-of-iberian-and-latin-american-economic-history
/core/journals/robotica
/core/journals/royal-historical-society-camden-fifth-series
/core/journals/royal-institute-of-philosophy-supplements
/core/journals/rural-history
/core/journals/science-in-context
/core/journals/scottish-journal-of-theology
/core/journals/seed-science-research
/core/journals/slavic-review
/core/journals/social-philosophy-and-policy
/core/journals/social-policy-and-society
/core/journals/social-science-history
/core/journals/spanish-journal-of-psychology
/core/journals/studies-in-american-political-development
/core/journals/studies-in-church-history
/core/journals/studies-in-second-language-acquisition
/core/journals/tempo
/core/journals/theatre-research-international
/core/journals/theatre-survey
/core/journals/theory-and-practice-of-logic-programming
/core/journals/think
/core/journals/traditio
/core/journals/trans-trans-regional-and-national-studies-of-southeast-asia
/core/journals/transactions-of-the-royal-historical-society
/core/journals/transnational-environmental-law
/core/journals/twentieth-century-music
/core/journals/twin-research-and-human-genetics
/core/journals/urban-history
/core/journals/utilitas
/core/journals/victorian-literature-and-culture
/core/journals/visual-neuroscience
/core/journals/weed-science
/core/journals/weed-technology
/core/journals/wireless-power-transfer
/core/journals/world-politics
/core/journals/world-s-poultry-science-journal
/core/journals/world-trade-review
/core/journals/zygote
You should simplify your code and your scraping strategy, although I can see that not all journal pages have the same structure. On most pages you can get the ISSN easily through a form value. On others (free access, I think) you need to apply some kind of heuristics to get the ISSN. Also I don't know why you are using httplib2 and requests as both provide more or less the same functionality. Anyway here's some code that does what you want ... kind of (I have also removed the CSV code because as it is there's not need for that):
import requests
from bs4 import BeautifulSoup, SoupStrainer
with open('listoflinks.csv', encoding="utf8") as f:
for line in f:
path = line.strip()
print("getting", path)
response = requests.get("https://www.cambridge.org" + path)
soup = BeautifulSoup(response.text, "html.parser")
try:
issn = soup.find("input", attrs={'name': 'productIssn'}).get('value')
except:
values = soup.find_all("span", class_="value")
for v in values:
if "(Online)" in v.string:
issn = v.string.split(" ")[0]
break
print("issn:", issn)
details_container = soup.find("div", class_="details-container")
image = details_container.find("img")
imgurl = image['src'][2:]
print("imgurl:", imgurl)
with open(issn + ".jpg", 'wb') as output:
output.write(requests.get("http://" + imgurl).content)