Open multiple pages with Python Selenium

I'm trying to use Python and Selenium to loop through a list of webpages and download a file on each page. With a while loop I am able to open one page at a time and download the first file I want, but as soon as I get to the second element in the list of webpages, Selenium seems to error out.
Here is my code:
import time
from selenium import webdriver

path_to_chromedriver = 'path to chromedriver location'
browser = webdriver.Chrome(executable_path = path_to_chromedriver)
browser.get("file:///path to html file")
#these are example webpages
all_trails = ['www.google.com', 'www.yahoo.com', 'www.bing.com']
index = 0
while (index <= 2):
    url = all_trails[index]
    browser.get(url)
    browser.find_element_by_link_text('Sign In').click()
    username = browser.find_element_by_xpath("//input[@placeholder='Log in with email']")
    password = browser.find_element_by_name('pass')
    username.send_keys("username")
    password.send_keys("password")
    browser.find_element_by_xpath("//button[@type='submit' and @class='btn btn-primary btn-lg' and contains(text(), 'Log In')]").click()
    results_url = browser.find_element_by_xpath("//a[@class='require-user' and contains(text(), 'GPX File')]").click()
    index += 1
    browser.quit()
    time.sleep(5)
I'm able to download the file from the first element in the array, which is www.google.com. The loop gets to the second list element, www.yahoo.com, but as soon as it reaches browser.get(url) I run into this error:
Traceback (most recent call last):
File "trails_scraper.py", line 22, in <module>
browser.get(url)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 320, in get
self.execute(Command.GET, {'url': url})
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 306, in execute
response = self.command_executor.execute(driver_command, params)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 460, in execute
return self._request(command_info[0], url, body=data)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 483, in _request
self._conn.request(method, parsed_url.path, body, headers)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1053, in request
self._send_request(method, url, body, headers)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1093, in _send_request
self.endheaders(body)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1049, in endheaders
self._send_output(message_body)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 893, in _send_output
self.send(msg)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 855, in send
self.connect()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 832, in connect
self.timeout, self.source_address)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.py", line 575, in create_connection
raise err
socket.error: [Errno 61] Connection refused
Does anyone know what is going on? I know the more error-prone method is to use a for loop, but logically my code seems correct.
Any help would be magnificently appreciated :)

The problem is that you create your browser outside of the loop but call browser.quit() inside it, so when the first iteration finishes it closes the browser, and the next
browser.get(url)
fails because there is no browser left.
You have two solutions:
1) Move the browser creation inside the loop:
path_to_chromedriver = 'path to chromedriver location'
#these are example webpages
all_trails = ['www.google.com', 'www.yahoo.com', 'www.bing.com']
index = 0
while (index <= 2):
    browser = webdriver.Chrome(executable_path = path_to_chromedriver)
    browser.get("file:///path to html file")
    url = all_trails[index]
    browser.get(url)
    browser.find_element_by_link_text('Sign In').click()
    username = browser.find_element_by_xpath("//input[@placeholder='Log in with email']")
    password = browser.find_element_by_name('pass')
    username.send_keys("username")
    password.send_keys("password")
    browser.find_element_by_xpath("//button[@type='submit' and @class='btn btn-primary btn-lg' and contains(text(), 'Log In')]").click()
    results_url = browser.find_element_by_xpath("//a[@class='require-user' and contains(text(), 'GPX File')]").click()
    index += 1
    browser.quit()
    time.sleep(5)
2) Close the browser only after the loop:
path_to_chromedriver = 'path to chromedriver location'
browser = webdriver.Chrome(executable_path = path_to_chromedriver)
browser.get("file:///path to html file")
#these are example webpages
all_trails = ['www.google.com', 'www.yahoo.com', 'www.bing.com']
index = 0
while (index <= 2):
    url = all_trails[index]
    browser.get(url)
    browser.find_element_by_link_text('Sign In').click()
    username = browser.find_element_by_xpath("//input[@placeholder='Log in with email']")
    password = browser.find_element_by_name('pass')
    username.send_keys("username")
    password.send_keys("password")
    browser.find_element_by_xpath("//button[@type='submit' and @class='btn btn-primary btn-lg' and contains(text(), 'Log In')]").click()
    results_url = browser.find_element_by_xpath("//a[@class='require-user' and contains(text(), 'GPX File')]").click()
    index += 1
    time.sleep(5)
browser.quit()
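As a side note, the same fix is often written with a for loop and a try/finally block, which guarantees the browser is quit exactly once no matter how the loop exits. The following is only a sketch assembled from the question's placeholders (chromedriver path, selectors, and credentials unchanged), not a tested drop-in:

import time
from selenium import webdriver

path_to_chromedriver = 'path to chromedriver location'
all_trails = ['www.google.com', 'www.yahoo.com', 'www.bing.com']

browser = webdriver.Chrome(executable_path=path_to_chromedriver)
try:
    for url in all_trails:
        browser.get(url)
        browser.find_element_by_link_text('Sign In').click()
        browser.find_element_by_xpath("//input[@placeholder='Log in with email']").send_keys("username")
        browser.find_element_by_name('pass').send_keys("password")
        browser.find_element_by_xpath("//button[@type='submit' and @class='btn btn-primary btn-lg' and contains(text(), 'Log In')]").click()
        browser.find_element_by_xpath("//a[@class='require-user' and contains(text(), 'GPX File')]").click()
        time.sleep(5)  # crude wait for the download to start
finally:
    browser.quit()  # runs exactly once, after every URL has been visited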

Related

I am scraping LinkedIn profiles but got an error

I want to scrape LinkedIn profiles based on specific keywords but got an error. Here is my code:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
from tkinter import *

class Linkedin():
    def getData(self):
        driver = webdriver.Chrome('/home/danish-khan/scrapers/researchgate/chromedriver')
        driver.get('https://www.linkedin.com/login')
        driver.find_element_by_id('username').send_keys('danishkhankd237@gmail.com') #Enter username of linkedin account here
        driver.find_element_by_id('password').send_keys('dankhanish446') #Enter Password of linkedin account here
        driver.find_element_by_xpath("//button[@type='submit']").click()

        #*********** Search Result ***************#
        search_key = "data analyst" # Enter your Search key here to find people
        key = search_key.split()
        print('\nkeyword:', key)
        keyword = ""
        for key1 in key:
            keyword = keyword + str(key1).capitalize() + "%20"
        keyword = keyword.rstrip("%20")
        print('\nkeyword2 :', keyword)
        #global data
        data = []
        profile_links = []
        for no in range(1, 3):
            start = "&page={}".format(no)
            search_url = "https://www.linkedin.com/search/results/people/?keywords={}&origin=SUGGESTION{}".format(keyword, start)
            driver.get(search_url)
            # driver.maximize_window()
            for scroll in range(2):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
            search = BeautifulSoup(driver.page_source, 'lxml')
            for people in search.findAll('span', class_ = 't-16'):
                profiles = people.find_all('a', attrs = {'class' : 'app-aware-link'})
                count = 0
                for i in profiles:
                    profiles2 = i['href']
                    print(profiles2)
                    profile_links.append(profiles2)
                print("Going to scrape Page {} data".format(no))
                print('\nprofile_links :', profile_links)
                lent = 0
                for people in profile_links:
                    #count = 0
                    # if count%2==0:
                    #     lent+=1
                    print('Profile :', people)
                    driver.get(people)
                    print('\ngetting\n')
                    # #********** Profile Details **************#
                    card = BeautifulSoup(driver.page_source, 'lxml')
                    try:
                        Name = card.find('h1', attrs = {'class' : 'text-heading-xlarge inline t-24 v-align-middle break-words'}).text
                    except:
                        Name = 'None'
                    try:
                        Work_at = (card.find('div', attrs = {'class' : 'text-body-medium break-words'}).text).strip()
                    except:
                        Work_at = "None"
                    try:
                        Image = card.find("img", attrs = {'loading' : 'lazy'})['src']
                    except:
                        Image = 'None'
                    try:
                        Education = card.find('h3', attrs = {'class' : 'pv-entity__school-name t-16 t-black t-bold'}).text
                    except:
                        Education = 'None'
                    try:
                        Location = soup.find('span', attrs = {'class' : 'text-body-small inline t-black--light break-words'}).text.strip()
                    except:
                        Location = 'None'
                    details = {
                        'Name' : 'hgf', #card.find('h1', attrs = {'class' : 'text-heading-xlarge inline t-24 v-align-middle break-words'}).text,
                        'Location' : '',
                        'Work_at' : '',
                        'Education' : '',
                        'Profile_image' : '',
                        'Website' : '',
                        'Email' : ''
                    }
                    details['Name'] = Name
                    print(details)
                    time.sleep(15)
                    driver.quit()
        driver.quit()

    def start(self):
        self.getData()

if __name__ == "__main__":
    obJH = Linkedin()
    obJH.start()
Firstly, I want to collect all the profile URLs for specific keywords (here, "data analyst") and then go through all of those profile URLs to scrape specific data from the profiles, but it only scrapes two URLs, not all of them. Secondly, when going through the list of URLs I get this error:
python linkdn2.py
keyword: ['data', 'analyst']
keyword2 : Data%20Analyst
https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck
Going to scrape Page 1 data
profile_links : ['https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck']
Profile : https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck
getting
{'Name': 'Roshaan Khan', 'Location': '', 'Work_at': '', 'Education': '', 'Profile_image': '', 'Website': '', 'Email': ''}
https://www.linkedin.com/in/sabanasimbutt?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAB7iVNAB_l8blfjWUwqgsV-bkjV3X_3ODdk
Going to scrape Page 1 data
profile_links : ['https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck', 'https://www.linkedin.com/in/sabanasimbutt?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAB7iVNAB_l8blfjWUwqgsV-bkjV3X_3ODdk']
Profile : https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck
Traceback (most recent call last):
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connection.py", line 159, in _new_conn
conn = connection.create_connection(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/util/connection.py", line 84, in create_connection
raise err
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/util/connection.py", line 74, in create_connection
sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 387, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/usr/lib/python3.8/http/client.py", line 1255, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/lib/python3.8/http/client.py", line 1301, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/usr/lib/python3.8/http/client.py", line 1250, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/usr/lib/python3.8/http/client.py", line 1010, in _send_output
self.send(msg)
File "/usr/lib/python3.8/http/client.py", line 950, in send
self.connect()
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connection.py", line 187, in connect
conn = self._new_conn()
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connection.py", line 171, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7f431515f610>: Failed to establish a new connection: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "linkdn2.py", line 108, in <module>
obJH.start()
File "linkdn2.py", line 104, in start
self.getData()
File "linkdn2.py", line 55, in getData
driver.get(people)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
self.execute(Command.GET, {'url': url})
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 319, in execute
response = self.command_executor.execute(driver_command, params)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/remote_connection.py", line 374, in execute
return self._request(command_info[0], url, body=data)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/remote_connection.py", line 397, in _request
resp = self._conn.request(method, url, body=body, headers=headers)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/request.py", line 79, in request
return self.request_encode_body(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/request.py", line 171, in request_encode_body
return self.urlopen(method, url, **extra_kw)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/poolmanager.py", line 330, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 747, in urlopen
return self.urlopen(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 747, in urlopen
return self.urlopen(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 747, in urlopen
return self.urlopen(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 719, in urlopen
retries = retries.increment(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/util/retry.py", line 436, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=56707): Max retries exceeded with url: /session/b7431e8051979e6a9a308bdfd59bf60a/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f431515f610>: Failed to establish a new connection: [Errno 111] Connection refused'))
I have tried many ways to solve this but can't find a solution.
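The traceback matches the pattern from the first question on this page: driver.quit() runs inside the for people in profile_links: loop (right after time.sleep(15)), so the next driver.get(people) tries to reach a chromedriver that has already shut down, hence the connection refused on 127.0.0.1. A minimal sketch of the restructuring, reusing the names from the question's code:

# Sketch: quit only once, after every profile URL has been visited.
for people in profile_links:
    print('Profile :', people)
    driver.get(people)
    card = BeautifulSoup(driver.page_source, 'lxml')
    # ... extract Name, Work_at, Image, Education, Location as above ...
    time.sleep(15)
driver.quit()  # moved out of the loop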

Python Selenium - Failed to establish a new connection: [Errno 61] Connection refused

I am trying to get screenshots of a website for a project I am doing. When I run my code, it works the first time and gives me the screenshot, but when the code loops to take another screenshot, a very long error message comes up saying that the connection has been refused. I am using Python 3 and Selenium on a MacBook Air.
Here is my code
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument(f'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36')
driver = webdriver.Chrome('/Library/Frameworks/Python.framework/Versions/3.8/bin/chromedriver', options=options)
URL1 = 'https://www.accuweather.com/en/ie/moroe/1079356/current-weather/1079356'
URL2 = 'https://www.accuweather.com/en/ie/moroe/1079356/hourly-weather-forecast/1079356'
URL3 = 'https://weather.com/en-IE/weather/today/l/d71e95387799a552a061ec1550ac876dcc19b5d139adc6f51ba3b8bf7a6b96ed'
URL4 = 'https://weather.com/en-IE/weather/hourbyhour/l/d71e95387799a552a061ec1550ac876dcc19b5d139adc6f51ba3b8bf7a6b96ed#detailIndex4'
URL5 = 'https://www.met.ie/weather-forecast/moroe-limerick#forecasts'

while True:
    current_time = datetime.now()
    timenow = datetime.now()
    timenow = str(timenow)
    current_time = str(current_time)
    new_str = ""
    x = 0
    for i in range(0, len(current_time)):
        if i != 4 and i != 7 and i != 10 and i != 13 and i != 16:
            new_str = new_str + current_time[i]
    new_str = float(new_str)
    new_str = new_str / 100
    new_str = round(new_str, 0)
    if new_str % 2:
        x = x + 1
    else:
        driver.get(URL1)
        S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
        driver.set_window_size(S('Width'),S('Height')) # May need manual adjustment
        driver.find_element_by_tag_name('body').screenshot('accu1' + timenow + '.png')
        driver.quit()
        time.sleep(61)
        '''
        driver.get(URL2)
        S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
        driver.set_window_size(S('Width'),S('Height')) # May need manual adjustment
        driver.find_element_by_tag_name('body').screenshot('accu2' + timenow + '.png')
        driver.quit()
        driver.get(URL3)
        S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
        driver.set_window_size(S('Width'),S('Height')) # May need manual adjustment
        driver.find_element_by_tag_name('body').screenshot('weatherchannel1' + timenow + '.png')
        driver.quit()
        driver.get(URL4)
        S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
        driver.set_window_size(S('Width'),S('Height')) # May need manual adjustment
        driver.find_element_by_tag_name('body').screenshot('weatherchannel2' + timenow + '.png')
        driver.quit()
        driver.get(URL5)
        S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
        driver.set_window_size(S('Width'),S('Height')) # May need manual adjustment
        driver.find_element_by_tag_name('body').screenshot('meteireann' + timenow + '.png')
        driver.quit()
        '''
And here is the error message:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connection.py", line 169, in _new_conn
conn = connection.create_connection(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/util/connection.py", line 96, in create_connection
raise err
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/util/connection.py", line 86, in create_connection
sock.connect(sa)
ConnectionRefusedError: [Errno 61] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connectionpool.py", line 394, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connection.py", line 234, in request
super(HTTPConnection, self).request(method, url, body=body, headers=headers)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1240, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1286, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1235, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1006, in _send_output
self.send(msg)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 946, in send
self.connect()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connection.py", line 200, in connect
conn = self._new_conn()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connection.py", line 181, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7fb4e73fd490>: Failed to establish a new connection: [Errno 61] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/hugophelan/Desktop/WeatherPiTest.py", line 39, in <module>
driver.get(URL1)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
self.execute(Command.GET, {'url': url})
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 319, in execute
response = self.command_executor.execute(driver_command, params)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/selenium/webdriver/remote/remote_connection.py", line 374, in execute
return self._request(command_info[0], url, body=data)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/selenium/webdriver/remote/remote_connection.py", line 397, in _request
resp = self._conn.request(method, url, body=body, headers=headers)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/request.py", line 78, in request
return self.request_encode_body(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/request.py", line 170, in request_encode_body
return self.urlopen(method, url, **extra_kw)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/poolmanager.py", line 375, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connectionpool.py", line 783, in urlopen
return self.urlopen(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connectionpool.py", line 783, in urlopen
return self.urlopen(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connectionpool.py", line 783, in urlopen
return self.urlopen(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/connectionpool.py", line 755, in urlopen
retries = retries.increment(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/urllib3/util/retry.py", line 573, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=54000): Max retries exceeded with url: /session/3bdcabee5f314f620196394cfedd7079/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fb4e73fd490>: Failed to establish a new connection: [Errno 61] Connection refused'))
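This is again the failure mode from the first question on this page: driver.quit() executes inside the while True loop, so the next iteration's driver.get(URL1) has no chromedriver process left to connect to. One way out, sketched against the question's setup (path and URL copied from above, not re-tested), is to start a fresh driver on every iteration; the alternative is to keep a single driver for the whole run and never quit it inside the loop:

import time
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
URL1 = 'https://www.accuweather.com/en/ie/moroe/1079356/current-weather/1079356'

while True:
    # a fresh driver per iteration makes quitting it safe
    driver = webdriver.Chrome('/Library/Frameworks/Python.framework/Versions/3.8/bin/chromedriver', options=options)
    driver.get(URL1)
    driver.find_element_by_tag_name('body').screenshot('accu1.png')
    driver.quit()  # fine here: a new driver is created next time around
    time.sleep(61)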

Python Selenium: selenium.common.exceptions.StaleElementReferenceException

I have this weird error with Selenium when I try to find sportsbook odds from oddsportal.com. It looks like the object Selenium returns does not work like a normal list, and I cannot just loop over every URL. A test URL that should work: http://www.oddsportal.com/soccer/england/premier-league/ (the script is not written for home-draw-away odds).
So what am I doing wrong here?
My script:
from selenium import webdriver
from selenium.common.exceptions import NoSuchAttributeException,NoSuchElementException
from selenium.webdriver.common.keys import Keys

class Odds():
    def odds(self,driver,url):
        kertoimet = ['','']
        driver.get(url)
        odds = driver.find_elements_by_xpath("""//*[@id="odds-data table"]/div/table/tbody/tr""")
        for item in odds:
            data = item.text.replace(' ','').split('\n')
            if data[0] == 'Pinnacle':
                kertoimet = [data[1],data[2]]
        return kertoimet

    def odds_finder(self,data,driver):
        for item in data:
            if item.get_attribute('href') != '':
                print(Odds().odds(driver,str(item.get_attribute('href'))))

    def url_finder2(self,URL):
        driver = webdriver.Chrome("/usr/local/bin/chromedriver 2")
        driver.get(URL) #http://www.oddsportal.com/soccer/england/premier-league/
        data = driver.find_elements_by_xpath("""//*[@id="tournamentTable"]/tbody/tr/td/a""")
        Odds().odds_finder(list(data),driver)

Odds().url_finder2(URL)
Error:
Traceback (most recent call last):
File "odds.py", line 79, in <module>
Odds().url_finder2(open('oddsportal_odds.csv'))
File "odds.py", line 61, in url_finder2
Odds().odds_finder(list(data),driver)
File "odds.py", line 49, in odds_finder
if item.get_attribute('href') != '':
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webelement.py", line 141, in get_attribute
resp = self._execute(Command.GET_ELEMENT_ATTRIBUTE, {'name': name})
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webelement.py", line 494, in _execute
return self._parent.execute(command, params)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 236, in execute
self.error_handler.check_response(response)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/errorhandler.py", line 192, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=58.0.3029.110)
(Driver info: chromedriver=2.29.461585 (0be2cd95f834e9ee7c46bcc7cf405b483f5ae83b),platform=Mac OS X 10.12.3 x86_64)
You just need to fetch the elements again, because the page state has changed.
Try modifying these two functions:
def odds_finder(self, driver):
    for item in driver.find_elements_by_xpath('//*[@id="tournamentTable"]/tbody/tr/td/a'):
        time.sleep(5)
        if item.get_attribute('href') != '':
            print(Odds().odds(driver, str(item.get_attribute('href'))))

def url_finder2(self, URL):
    driver = webdriver.Chrome("/usr/local/bin/chromedriver 2")
    driver.get(URL) # http://www.oddsportal.com/soccer/england/premier-league/
    Odds().odds_finder(driver)
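An alternative that sidesteps staleness entirely is to copy the href strings out of the elements before any navigation happens; plain strings cannot go stale the way WebElement references do. A sketch along those lines, reusing the question's names:

def url_finder2(self, URL):
    driver = webdriver.Chrome("/usr/local/bin/chromedriver 2")
    driver.get(URL)
    # materialize the hrefs first, while the page is still loaded
    links = [item.get_attribute('href')
             for item in driver.find_elements_by_xpath('//*[@id="tournamentTable"]/tbody/tr/td/a')]
    for href in links:
        if href:  # skip empty or missing hrefs
            print(Odds().odds(driver, str(href)))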

Selenium 2.53.5 httplib.BadStatusLine: '' Python

I'm trying to automate the registration of serial numbers in an online form using Selenium 2.53.5 in Python 2.7. The script had been working for 2+ months, but yesterday I started receiving an error right when I run it: httplib.BadStatusLine: ''. Is there any known fix for this? I've read that leading/trailing newline characters can mess up retrieving the URL, but I can't seem to identify the issue.
Code:
import sys
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class SerialSet:
    def __init__(self, fileName, driverPath, user, password):
        self.fn = fileName
        self.failedSerials = []
        self.driver = webdriver.Chrome(driverPath)
        self.aloSuccess = False
        self.user = user
        self.password = password

    def parseSerialFile(self):
        with open(self.fn, 'r') as f:
            self.serials = [line.strip() for line in f]

    def setCountrySN(self, serial, driver):
        driver.find_element_by_xpath("//select/option[@value='USA']").click()
        driver.find_element_by_id("serialno").send_keys(serial)
        driver.find_element_by_xpath("//input[@value='Continue'][@type='button']").click()

    def submitState(self, driver):
        driver.find_element_by_xpath("//select/option[@value='CT']").click()
        driver.find_element_by_id("Continue1").click()

    def login(self, driver):
        driver.find_element_by_xpath("//*[@id='accountname']").send_keys(self.user)
        driver.find_element_by_xpath("//*[@id='accountpassword']").send_keys(self.password)
        driver.find_element_by_xpath("//*[@id='signInHyperLink']").click()

    def initiateSN(self, serial, driver):
        # select country and enter serialno
        driver.get("http://supportform.apple.com/201110/")
        self.setCountrySN(serial, driver)
        # enter login
        time.sleep(3)
        if driver.current_url == "http://supportform.apple.com/201110/":
            return False
        self.login(driver)
        # select state and continue
        time.sleep(3)
        self.submitState(driver)
        # final submit
        time.sleep(3)
        driver.find_element_by_id("finalContinue").click()
        return True

    def newSN(self, serial, driver):
        # select country and enter serialno
        driver.get("http://supportform.apple.com/201110/")
        self.setCountrySN(serial, driver)
        # select state and continue
        time.sleep(3)
        if driver.current_url == "http://supportform.apple.com/201110/":
            return False
        self.submitState(driver)
        # final submit
        time.sleep(3)
        driver.find_element_by_id("finalContinue").click()
        return True

    def automateSerials(self):
        for i in self.serials:
            if self.aloSuccess == False:
                if not self.initiateSN(i, self.driver):
                    self.failedSerials.append(i)
                    del i
                else:
                    self.aloSuccess = True
            else:
                if not self.newSN(i, self.driver):
                    self.failedSerials.append(i)
                    del i
        self.driver.quit()
        print(str(len(self.serials) - len(self.failedSerials)) + ":" + str(len(self.serials)))

def main():
    newSet = SerialSet(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
    newSet.parseSerialFile()
    newSet.automateSerials()

if __name__ == "__main__":
    main()
Error:
Traceback (most recent call last):
File "automate.py", line 90, in <module>
main()
File "automate.py", line 85, in main
newSet = SerialSet(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
File "automate.py", line 11, in __init__
self.driver = webdriver.Chrome(driverPath)
File "/Library/Python/2.7/site- packages/selenium/webdriver/chrome/webdriver.py", line 67, in __init__
desired_capabilities=desired_capabilities)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 90, in __init__
self.start_session(desired_capabilities, browser_profile)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 177, in start_session
response = self.execute(Command.NEW_SESSION, capabilities)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 234, in execute
response = self.command_executor.execute(driver_command, params)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 401, in execute
return self._request(command_info[0], url, body=data)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 432, in _request
resp = self._conn.getresponse()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1132, in getresponse
response.begin()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 453, in begin
version, status, reason = self._read_status()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 417, in _read_status
raise BadStatusLine(line)
httplib.BadStatusLine: ''
Make sure you're using the latest version of chromedriver:
http://chromedriver.storage.googleapis.com/2.25/chromedriver_linux64.zip
I had chromedriver 2.0 installed and got this error; when I upgraded to 2.25 the error went away.
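To confirm which chromedriver build Selenium is actually launching, you can ask the binary itself for its version; a quick check, assuming chromedriver is on your PATH:

import subprocess
# prints something like: ChromeDriver 2.25.426935 (...)
print(subprocess.check_output(['chromedriver', '--version']))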

Headless endless scroll with Selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib,requests,unidecode,lxml,pdb
from pyvirtualdisplay import Display
from xvfbwrapper import Xvfb

class wait_for_more_than_n_elements_to_be_present(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False

def return_html_code(url):
    print url #added in edit 1
    vdisplay = Xvfb()
    vdisplay.start()
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.get(url)
    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 240)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
    # scroll down to the last tweet until there is no more tweets loaded
    while True:
        tweets = driver.find_elements_by_css_selector("li[data-item-id]")
        print len(tweets) #added in edit 1
        driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
        try:
            wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
        except TimeoutException:
            break
    html_full_source = driver.page_source
    driver.close()
    vdisplay.stop()

html_full = return_html_code(url)
Output:
https://twitter.com/search?q=Error%20Check&src=typd&lang=en
20
39
56
74
I have the above code for endlessly scrolling a page in headless mode, but somehow it seems to stop too early.
Reference: https://stackoverflow.com/a/31058403/3646408
Edit 1:
$ phantomjs --version
2.1.1
On running @alexce's code it showed different output in two runs; the date check makes it clear that there are more tweets:
https://twitter.com/search?q=Error%20Check&src=typd&lang=en
20
40
59
76
95
114
133
152
171
191
211
231
249
267
Date of most old tweet: 12 Jan 2016
https://twitter.com/search?q=Error%20Check&src=typd&lang=en
20
40
59
76
95
114
133
152
171
191
211
231
249
267
287
303
317
337
356
373
388
400
418
437
457
476
492
Date of most old tweet: 8 Jan 2016
Edit 2:
On running the updated version of @alexce's code, it showed the below error after ~7000 tweets.
Traceback (most recent call last):
File "twitter_script.py", line 82, in <module>
search_twitter('Alcoholics Anonymous')
File "twitter_script.py", line 76, in search_twitter
db_name=write_data_to_db(*get_twitter_data(query))
File "twitter_script.py", line 24, in get_twitter_data
html_full=return_html_code(url)
File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 48, in return_html_code
html_full_source=driver.page_source
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 464, in page_source
return self.execute(Command.GET_PAGE_SOURCE)['value']
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 199, in execute
response = self.command_executor.execute(driver_command, params)
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 395, in execute
return self._request(command_info[0], url, body=data)
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 463, in _request
resp = opener.open(request, timeout=self._timeout)
File "c:\Anaconda\lib\urllib2.py", line 431, in open
response = self._open(req, data)
File "c:\Anaconda\lib\urllib2.py", line 449, in _open
'_open', req)
File "c:\Anaconda\lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "c:\Anaconda\lib\urllib2.py", line 1227, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "c:\Anaconda\lib\urllib2.py", line 1200, in do_open
r = h.getresponse(buffering=True)
File "c:\Anaconda\lib\httplib.py", line 1136, in getresponse
response.begin()
File "c:\Anaconda\lib\httplib.py", line 453, in begin
version, status, reason = self._read_status()
File "c:\Anaconda\lib\httplib.py", line 409, in _read_status
line = self.fp.readline(_MAXLINE + 1)
File "c:\Anaconda\lib\socket.py", line 480, in readline
data = self._sock.recv(self._rbufsize)
socket.error: [Errno 10054] An existing connection was forcibly closed by the remote host
Edit 3:
Trying the same code for a different URL.
https://twitter.com/search?q=Alcoholics%20Anonymous%20Drunk%20since%3A2006-03-24%20until%3A2006-04-23&src=typd&lang=en
Traceback (most recent call last):
File "twitter_script.py", line 64, in <module>
search_twitter('Alcoholics Anonymous Drunk')
File "twitter_script.py", line 58, in search_twitter
db_name=write_data_to_db(*get_twitter_data(query))
File "twitter_script.py", line 31, in get_twitter_data
html_full=return_html_code(url)
File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 30, in return_html_code
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
File "c:\Anaconda\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
Screenshot: available via screen
Edit 4:
ubuntu@ip-172-31-38-123:~/social_network_extract_proxy$ cat error.txt
Traceback (most recent call last):
File "twitter_script.py", line 70, in <module>
search_twitter('alcoholics anonymous')
File "twitter_script.py", line 64, in search_twitter
db_name=write_data_to_db(*get_twitter_data(query))
File "twitter_script.py", line 37, in get_twitter_data
html_full=return_html_code(url)
File "/home/ubuntu/social_network_extract_proxy/firefox_driver_code.py", line 35, in return_html_code
driver=webdriver.Firefox(firefox_profile=profile)
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/webdriver.py", line 79, in __init__
self.binary, timeout),
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/extension_connection.py", line 49, in __init__
self.binary.launch_browser(self.profile)
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 68, in launch_browser
self._wait_until_connectable()
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 106, in _wait_until_connectable
% (self.profile.path))
selenium.common.exceptions.WebDriverException: Message: Can't load the profile. Profile Dir: /tmp/tmpvFoPrE If you specified a log_file in the FirefoxBinary constructor, check it for details.
Got the above error after a while.
Here is a set of things that made it work for me in headless mode:
switch to PhantomJS
pretend to be a different browser by setting a custom User-Agent string
before scrolling into view of the last tweet, scroll to the top of the page (several times to increase reliability)
The code:
import time

def return_html_code(url):
    dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
    driver = webdriver.PhantomJS(desired_capabilities=dcap)
    driver.maximize_window()
    driver.get(url)
    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 30)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
    # scroll down to the last tweet until there is no more tweets loaded
    while True:
        tweets = driver.find_elements_by_css_selector("li[data-item-id]")
        number_of_tweets = len(tweets)
        print(number_of_tweets)
        # move to the top and then to the bottom 5 times in a row
        for _ in range(5):
            driver.execute_script("window.scrollTo(0, 0)")
            driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
            time.sleep(0.5)
        try:
            wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
        except TimeoutException:
            break
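Note that this snippet deliberately reuses wait_for_more_than_n_elements_to_be_present and the imports from the question's code, so it is not self-contained on its own. Also, PhantomJS support was deprecated and later removed in newer Selenium releases, so on a current stack the same approach would use headless Chrome or Firefox instead.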
