I'm trying to automate the registration of serial numbers in an online form using Selenium 2.53.5 with Python 2.7. The script had been working for over two months, but yesterday it started throwing an error as soon as I run it: httplib.BadStatusLine: ''. Is there a known fix for this? I've read that leading/trailing newline characters can break URL retrieval, but I can't identify the issue.
Code:
import sys
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class SerialSet:
    def __init__(self, fileName, driverPath, user, password):
        self.fn = fileName
        self.failedSerials = []
        self.driver = webdriver.Chrome(driverPath)
        self.aloSuccess = False
        self.user = user
        self.password = password

    def parseSerialFile(self):
        with open(self.fn, 'r') as f:
            self.serials = [line.strip() for line in f]

    def setCountrySN(self, serial, driver):
        driver.find_element_by_xpath("//select/option[@value='USA']").click()
        driver.find_element_by_id("serialno").send_keys(serial)
        driver.find_element_by_xpath("//input[@value='Continue'][@type='button']").click()

    def submitState(self, driver):
        driver.find_element_by_xpath("//select/option[@value='CT']").click()
        driver.find_element_by_id("Continue1").click()

    def login(self, driver):
        driver.find_element_by_xpath("//*[@id='accountname']").send_keys(self.user)
        driver.find_element_by_xpath("//*[@id='accountpassword']").send_keys(self.password)
        driver.find_element_by_xpath("//*[@id='signInHyperLink']").click()

    def initiateSN(self, serial, driver):
        # select country and enter serialno
        driver.get("http://supportform.apple.com/201110/")
        self.setCountrySN(serial, driver)
        # enter login
        time.sleep(3)
        if driver.current_url == "http://supportform.apple.com/201110/":
            return False
        self.login(driver)
        # select state and continue
        time.sleep(3)
        self.submitState(driver)
        # final submit
        time.sleep(3)
        driver.find_element_by_id("finalContinue").click()
        return True

    def newSN(self, serial, driver):
        # select country and enter serialno
        driver.get("http://supportform.apple.com/201110/")
        self.setCountrySN(serial, driver)
        # select state and continue
        time.sleep(3)
        if driver.current_url == "http://supportform.apple.com/201110/":
            return False
        self.submitState(driver)
        # final submit
        time.sleep(3)
        driver.find_element_by_id("finalContinue").click()
        return True

    def automateSerials(self):
        for i in self.serials:
            if self.aloSuccess == False:
                if not self.initiateSN(i, self.driver):
                    self.failedSerials.append(i)
                    del i
                else:
                    self.aloSuccess = True
            else:
                if not self.newSN(i, self.driver):
                    self.failedSerials.append(i)
                    del i
        self.driver.quit()
        print(str(len(self.serials) - len(self.failedSerials)) + ":" + str(len(self.serials)))


def main():
    newSet = SerialSet(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
    newSet.parseSerialFile()
    newSet.automateSerials()


if __name__ == "__main__":
    main()
Error:
Traceback (most recent call last):
File "automate.py", line 90, in <module>
main()
File "automate.py", line 85, in main
newSet = SerialSet(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
File "automate.py", line 11, in __init__
self.driver = webdriver.Chrome(driverPath)
File "/Library/Python/2.7/site- packages/selenium/webdriver/chrome/webdriver.py", line 67, in __init__
desired_capabilities=desired_capabilities)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 90, in __init__
self.start_session(desired_capabilities, browser_profile)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 177, in start_session
response = self.execute(Command.NEW_SESSION, capabilities)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 234, in execute
response = self.command_executor.execute(driver_command, params)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 401, in execute
return self._request(command_info[0], url, body=data)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 432, in _request
resp = self._conn.getresponse()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1132, in getresponse
response.begin()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 453, in begin
version, status, reason = self._read_status()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 417, in _read_status
raise BadStatusLine(line)
httplib.BadStatusLine: ''
Make sure you're using the latest version of chromedriver:
http://chromedriver.storage.googleapis.com/2.25/chromedriver_linux64.zip
I had chromedriver 2.0 installed and got this error; upgrading to 2.25 got rid of it.
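If in doubt, a quick way to confirm which chromedriver binary the script is picking up and which version it is (the path below is a placeholder; point it at wherever you unpacked the new driver):
import subprocess
from selenium import webdriver

CHROMEDRIVER = "/usr/local/bin/chromedriver"  # placeholder path -- adjust to your install

# chromedriver reports its version when run with --version
print(subprocess.check_output([CHROMEDRIVER, "--version"]))

# Pass the same path explicitly so Selenium cannot fall back to an older binary on PATH
driver = webdriver.Chrome(CHROMEDRIVER)
driver.quit()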
Related
I want to scrape LinkedIn profiles based on specific keywords with Selenium, but I'm getting an error. Here is my code:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
from tkinter import *


class Linkedin():
    def getData(self):
        driver = webdriver.Chrome('/home/danish-khan/scrapers/researchgate/chromedriver')
        driver.get('https://www.linkedin.com/login')
        driver.find_element_by_id('username').send_keys('danishkhankd237@gmail.com')  # Enter username of linkedin account here
        driver.find_element_by_id('password').send_keys('dankhanish446')  # Enter Password of linkedin account here
        driver.find_element_by_xpath("//button[@type='submit']").click()

        #*********** Search Result ***************#
        search_key = "data analyst"  # Enter your Search key here to find people
        key = search_key.split()
        print('\nkeyword:', key)
        keyword = ""
        for key1 in key:
            keyword = keyword + str(key1).capitalize() + "%20"
        keyword = keyword.rstrip("%20")
        print('\nkeyword2 :', keyword)

        #global data
        data = []
        profile_links = []
        for no in range(1, 3):
            start = "&page={}".format(no)
            search_url = "https://www.linkedin.com/search/results/people/?keywords={}&origin=SUGGESTION{}".format(keyword, start)
            driver.get(search_url)
            # driver.maximize_window()
            for scroll in range(2):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
            search = BeautifulSoup(driver.page_source, 'lxml')
            for people in search.findAll('span', class_='t-16'):
                profiles = people.find_all('a', attrs={'class': 'app-aware-link'})
                count = 0
                for i in profiles:
                    profiles2 = i['href']
                    print(profiles2)
                    profile_links.append(profiles2)
                print("Going to scrape Page {} data".format(no))
                print('\nprofile_links :', profile_links)

                lent = 0
                for people in profile_links:
                    #count = 0
                    # if count%2==0:
                    #     lent+=1
                    print('Profile :', people)
                    driver.get(people)
                    print('\ngetting\n')

                    # #********** Profile Details **************#
                    card = BeautifulSoup(driver.page_source, 'lxml')
                    try:
                        Name = card.find('h1', attrs={'class': 'text-heading-xlarge inline t-24 v-align-middle break-words'}).text
                    except:
                        Name = 'None'
                    try:
                        Work_at = (card.find('div', attrs={'class': 'text-body-medium break-words'}).text).strip()
                    except:
                        Work_at = "None"
                    try:
                        Image = card.find("img", attrs={'loading': 'lazy'})['src']
                    except:
                        Image = 'None'
                    try:
                        Education = card.find('h3', attrs={'class': 'pv-entity__school-name t-16 t-black t-bold'}).text
                    except:
                        Education = 'None'
                    try:
                        Location = soup.find('span', attrs={'class': 'text-body-small inline t-black--light break-words'}).text.strip()
                    except:
                        Location = 'None'

                    details = {
                        'Name': 'hgf',  # card.find('h1', attrs={'class': 'text-heading-xlarge inline t-24 v-align-middle break-words'}).text,
                        'Location': '',
                        'Work_at': '',
                        'Education': '',
                        'Profile_image': '',
                        'Website': '',
                        'Email': ''
                    }
                    details['Name'] = Name
                    print(details)
                    time.sleep(15)
                driver.quit()
        driver.quit()

    def start(self):
        self.getData()


if __name__ == "__main__":
    obJH = Linkedin()
    obJH.start()
Firstly, I want to collect all the profile URLs for a specific keyword (here, "data analyst") and then go through each profile URL to scrape specific data from it. But it only scrapes two URLs, not all of the profile URLs, and secondly, when going through the list of URLs I get this error:
python linkdn2.py
keyword: ['data', 'analyst']
keyword2 : Data%20Analyst
https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck
Going to scrape Page 1 data
profile_links : ['https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck']
Profile : https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck
getting
{'Name': 'Roshaan Khan', 'Location': '', 'Work_at': '', 'Education': '', 'Profile_image': '', 'Website': '', 'Email': ''}
https://www.linkedin.com/in/sabanasimbutt?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAB7iVNAB_l8blfjWUwqgsV-bkjV3X_3ODdk
Going to scrape Page 1 data
profile_links : ['https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck', 'https://www.linkedin.com/in/sabanasimbutt?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAB7iVNAB_l8blfjWUwqgsV-bkjV3X_3ODdk']
Profile : https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck
Traceback (most recent call last):
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connection.py", line 159, in _new_conn
conn = connection.create_connection(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/util/connection.py", line 84, in create_connection
raise err
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/util/connection.py", line 74, in create_connection
sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 387, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/usr/lib/python3.8/http/client.py", line 1255, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/lib/python3.8/http/client.py", line 1301, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/usr/lib/python3.8/http/client.py", line 1250, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/usr/lib/python3.8/http/client.py", line 1010, in _send_output
self.send(msg)
File "/usr/lib/python3.8/http/client.py", line 950, in send
self.connect()
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connection.py", line 187, in connect
conn = self._new_conn()
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connection.py", line 171, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7f431515f610>: Failed to establish a new connection: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "linkdn2.py", line 108, in <module>
obJH.start()
File "linkdn2.py", line 104, in start
self.getData()
File "linkdn2.py", line 55, in getData
driver.get(people)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
self.execute(Command.GET, {'url': url})
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 319, in execute
response = self.command_executor.execute(driver_command, params)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/remote_connection.py", line 374, in execute
return self._request(command_info[0], url, body=data)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/remote_connection.py", line 397, in _request
resp = self._conn.request(method, url, body=body, headers=headers)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/request.py", line 79, in request
return self.request_encode_body(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/request.py", line 171, in request_encode_body
return self.urlopen(method, url, **extra_kw)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/poolmanager.py", line 330, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 747, in urlopen
return self.urlopen(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 747, in urlopen
return self.urlopen(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 747, in urlopen
return self.urlopen(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 719, in urlopen
retries = retries.increment(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/util/retry.py", line 436, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=56707): Max retries exceeded with url: /session/b7431e8051979e6a9a308bdfd59bf60a/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f431515f610>: Failed to establish a new connection: [Errno 111] Connection refused'))
I have tried many ways to solve this but can't find a solution.
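For what it's worth, the traceback shows the ConnectionRefusedError raised by driver.get(people) after the ChromeDriver session has already been shut down: in the posted code, driver.quit() runs inside the scraping loop, so by the time the next profile URL is requested there is no browser left to talk to. A minimal sketch of the intended two-phase flow, with the driver quit only once at the very end (selectors and field extraction elided, same assumptions as the code above):
# Sketch only: collect all profile URLs first, then visit them with the same
# driver, and quit the driver once after everything has been scraped.
profile_links = []
for no in range(1, 3):
    search_url = ("https://www.linkedin.com/search/results/people/"
                  "?keywords={}&origin=SUGGESTION&page={}".format(keyword, no))
    driver.get(search_url)
    time.sleep(2)
    search = BeautifulSoup(driver.page_source, 'lxml')
    for span in search.findAll('span', class_='t-16'):
        for a in span.find_all('a', attrs={'class': 'app-aware-link'}):
            profile_links.append(a['href'])

for link in profile_links:
    driver.get(link)
    card = BeautifulSoup(driver.page_source, 'lxml')
    # ... extract Name, Work_at, Education, etc. as above ...

driver.quit()  # only after all profiles have been visited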
I'm trying to write a small Python 3 utility script that checks whether a file exists on my server.
So I have the code below, with a big array of string values that I pass to a simple function which returns the URL and the response code.
However, when I run it I get errors I don't even know where to start with:
$ python ReturnPath.py
Traceback (most recent call last):
File "ReturnPath.py", line 86, in <module>
checkResponse(u)
File "ReturnPath.py", line 5, in checkResponse
code = urllib.request.urlopen(url).getcode()
File "C:\Program Files\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\Python37\lib\urllib\request.py", line 510, in open
req = Request(fullurl, data)
File "C:\Program Files\Python37\lib\urllib\request.py", line 328, in __init__
self.full_url = url
File "C:\Program Files\Python37\lib\urllib\request.py", line 354, in full_url
self._parse()
File "C:\Program Files\Python37\lib\urllib\request.py", line 383, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: '"https://myserver.org/Media/CharacterAvatarImages/ae275ecb-183e-4e8d-8465-9d6d36c1323f.jpg"'
Here is my code:
import urllib.request

def checkResponse(url):
    code = urllib.request.urlopen(url).getcode()
    print(url + " = " + code)
    return

arrCases = []
arrCases.extend([
    "https://myserver.org/Media/CharacterAvatarImages/ae275ecb-183e-4e8d-8465-9d6d36c1323f.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/3ea92fa3-1ef0-4358-b38d-bb04e653aa53.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/7958a0e3-171b-46b5-875e-970368389bdf.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/e9a6cb00-6811-4b47-9aac-88480578dd44.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/73df88c3-b829-4519-9523-2bbe1f2c8549.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/61aa614b-5c95-487c-b4e3-783231b43677.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8be7811f-18dc-4a81-a557-8b81605e3452.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/56539acb-2b1b-4410-a4bc-ac2eb0dc00fa.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8bcf93fc-b435-4fd4-9c82-4aba78c58529.jpg",
])

for u in arrCases:
    checkResponse(u)
What am I doing wrong?
You have to catch the errors raised by broken URLs. I also increased speed by using multiprocessing.Pool.
import urllib.request
from urllib.error import HTTPError, URLError
import multiprocessing

def checkResponse(url):
    try:
        code = urllib.request.urlopen(url, timeout=1).getcode()
    except (HTTPError, URLError) as error:
        print(url, " = ", error)
    else:
        print(url, " = ", code)
    return

arrCases = []
arrCases.extend([
    "https://i.stack.imgur.com/DsNOB.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/ae275ecb-183e-4e8d-8465-9d6d36c1323f.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/3ea92fa3-1ef0-4358-b38d-bb04e653aa53.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/7958a0e3-171b-46b5-875e-970368389bdf.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/e9a6cb00-6811-4b47-9aac-88480578dd44.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/73df88c3-b829-4519-9523-2bbe1f2c8549.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/61aa614b-5c95-487c-b4e3-783231b43677.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8be7811f-18dc-4a81-a557-8b81605e3452.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/56539acb-2b1b-4410-a4bc-ac2eb0dc00fa.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8bcf93fc-b435-4fd4-9c82-4aba78c58529.jpg",
])

with multiprocessing.Pool(processes=4) as pool:
    pool.map(checkResponse, arrCases)
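Separately, note what the original ValueError actually complains about: the URL string itself still contains literal quote characters ('"https://..."'), which urllib rejects as an unknown URL type. If the strings arrive wrapped in quotes like that, stripping them before calling urlopen may be all that is needed; clean_url below is a hypothetical helper, not part of urllib:
def clean_url(raw):
    # Remove surrounding whitespace and any stray quote characters
    # before handing the URL to urllib.
    return raw.strip().strip('"\'')

print(clean_url('"https://i.stack.imgur.com/DsNOB.jpg"'))
# -> https://i.stack.imgur.com/DsNOB.jpg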
I'm trying to use Python and Selenium to loop through a list of webpages and download a file on each page. I am able to open one page at a time and download the first file I want with a while loop, but as soon as I get to the second element in the list of webpages, Selenium seems to error out.
Here is my code:
path_to_chromedriver = 'path to chromedriver location'
browser = webdriver.Chrome(executable_path=path_to_chromedriver)
browser.get("file:///path to html file")

# these are example webpages
all_trails = ['www.google.com', 'www.yahoo.com', 'www.bing.com']

index = 0
while (index <= 2):
    url = all_trails[index]
    browser.get(url)
    browser.find_element_by_link_text('Sign In').click()
    username = browser.find_element_by_xpath("//input[@placeholder='Log in with email']")
    password = browser.find_element_by_name('pass')
    username.send_keys("username")
    password.send_keys("password")
    browser.find_element_by_xpath("//button[@type='submit' and @class='btn btn-primary btn-lg' and contains(text(), 'Log In')]").click()
    results_url = browser.find_element_by_xpath("//a[@class='require-user' and contains(text(), 'GPX File')]").click()
    index += 1
    browser.quit()
    time.sleep(5)
I'm able to download the file from the first element in the array, which is www.google.com. The loop gets to the second list element, www.yahoo.com, but as soon as it reaches browser.get(url) I run into this error:
Traceback (most recent call last):
File "trails_scraper.py", line 22, in <module>
browser.get(url)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 320, in get
self.execute(Command.GET, {'url': url})
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 306, in execute
response = self.command_executor.execute(driver_command, params)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 460, in execute
return self._request(command_info[0], url, body=data)
File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 483, in _request
self._conn.request(method, parsed_url.path, body, headers)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1053, in request
self._send_request(method, url, body, headers)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1093, in _send_request
self.endheaders(body)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1049, in endheaders
self._send_output(message_body)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 893, in _send_output
self.send(msg)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 855, in send
self.connect()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 832, in connect
self.timeout, self.source_address)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.py", line 575, in create_connection
raise err
socket.error: [Errno 61] Connection refused
Does anyone know what is going on? I know a for loop would be the less error-prone approach, but logically my code seems correct.
Any help would be magnificently appreciated :)
So the problem is that you create your browser outside the loop but quit it inside the loop: when the first iteration finishes it closes the browser, and the next
browser.get(url)
fails because there is no browser anymore.
You have two solutions:
1) Create the browser inside the loop:
path_to_chromedriver = 'path to chromedriver location'

# these are example webpages
all_trails = ['www.google.com', 'www.yahoo.com', 'www.bing.com']

index = 0
while (index <= 2):
    browser = webdriver.Chrome(executable_path=path_to_chromedriver)
    browser.get("file:///path to html file")
    url = all_trails[index]
    browser.get(url)
    browser.find_element_by_link_text('Sign In').click()
    username = browser.find_element_by_xpath("//input[@placeholder='Log in with email']")
    password = browser.find_element_by_name('pass')
    username.send_keys("username")
    password.send_keys("password")
    browser.find_element_by_xpath("//button[@type='submit' and @class='btn btn-primary btn-lg' and contains(text(), 'Log In')]").click()
    results_url = browser.find_element_by_xpath("//a[@class='require-user' and contains(text(), 'GPX File')]").click()
    index += 1
    browser.quit()
    time.sleep(5)
2) Close the browser only after the loop:
path_to_chromedriver = 'path to chromedriver location'
browser = webdriver.Chrome(executable_path=path_to_chromedriver)
browser.get("file:///path to html file")

# these are example webpages
all_trails = ['www.google.com', 'www.yahoo.com', 'www.bing.com']

index = 0
while (index <= 2):
    url = all_trails[index]
    browser.get(url)
    browser.find_element_by_link_text('Sign In').click()
    username = browser.find_element_by_xpath("//input[@placeholder='Log in with email']")
    password = browser.find_element_by_name('pass')
    username.send_keys("username")
    password.send_keys("password")
    browser.find_element_by_xpath("//button[@type='submit' and @class='btn btn-primary btn-lg' and contains(text(), 'Log In')]").click()
    results_url = browser.find_element_by_xpath("//a[@class='require-user' and contains(text(), 'GPX File')]").click()
    index += 1
    time.sleep(5)
browser.quit()
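Since the question mentions for loops: the same pattern with a for loop over all_trails avoids the manual index bookkeeping entirely. A minimal sketch (same placeholder paths and example URLs as above, with the page-specific steps elided):
from selenium import webdriver
import time

path_to_chromedriver = 'path to chromedriver location'  # placeholder, as above
browser = webdriver.Chrome(executable_path=path_to_chromedriver)

all_trails = ['www.google.com', 'www.yahoo.com', 'www.bing.com']  # example webpages

for url in all_trails:
    browser.get(url)
    # ... the same Sign In / GPX download steps as in solution 2 ...
    time.sleep(5)

browser.quit()  # close the browser once, after every page has been handled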
I have this weird error with Selenium when I try to find sportsbook odds on oddsportal.com. It looks like the Selenium result does not behave like a normal list, and I cannot just loop over every URL. A test URL that should work: http://www.oddsportal.com/soccer/england/premier-league/ (the script is not written for home-draw-away odds).
So what am I doing wrong here?
My script:
from selenium import webdriver
from selenium.common.exceptions import NoSuchAttributeException, NoSuchElementException
from selenium.webdriver.common.keys import Keys


class Odds():
    def odds(self, driver, url):
        kertoimet = ['', '']
        driver.get(url)
        odds = driver.find_elements_by_xpath("""//*[@id="odds-data table"]/div/table/tbody/tr""")
        for item in odds:
            data = item.text.replace(' ', '').split('\n')
            if data[0] == 'Pinnacle':
                kertoimet = [data[1], data[2]]
        return kertoimet

    def odds_finder(self, data, driver):
        for item in data:
            if item.get_attribute('href') != '':
                print(Odds().odds(driver, str(item.get_attribute('href'))))

    def url_finder2(self, URL):
        driver = webdriver.Chrome("/usr/local/bin/chromedriver 2")
        driver.get(URL)  # http://www.oddsportal.com/soccer/england/premier-league/
        data = driver.find_elements_by_xpath("""//*[@id="tournamentTable"]/tbody/tr/td/a""")
        Odds().odds_finder(list(data), driver)


Odds().url_finder2(URL)
Error:
Traceback (most recent call last):
File "odds.py", line 79, in <module>
Odds().url_finder2(open('oddsportal_odds.csv'))
File "odds.py", line 61, in url_finder2
Odds().odds_finder(list(data),driver)
File "odds.py", line 49, in odds_finder
if item.get_attribute('href') != '':
File "/Library/Python/2.7/site-
packages/selenium/webdriver/remote/webelement.py", line 141, in
get_attribute
resp = self._execute(Command.GET_ELEMENT_ATTRIBUTE, {'name': name})
File "/Library/Python/2.7/site-
packages/selenium/webdriver/remote/webelement.py", line 494, in
_execute
return self._parent.execute(command, params)
File "/Library/Python/2.7/site-
packages/selenium/webdriver/remote/webdriver.py", line 236, in execute
self.error_handler.check_response(response)
File "/Library/Python/2.7/site-
packages/selenium/webdriver/remote/errorhandler.py", line 192, in
check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message:
stale element reference: element is not attached to the page document
(Session info: chrome=58.0.3029.110)
(Driver info: chromedriver=2.29.461585
(0be2cd95f834e9ee7c46bcc7cf405b483f5ae83b),platform=Mac OS X 10.12.3
x86_64)
You just need to fetch the elements again, because the page state changes after navigation and the old element references become stale. Try modifying these two functions:
def odds_finder(self, driver):
    for item in driver.find_elements_by_xpath('//*[@id="tournamentTable"]/tbody/tr/td/a'):
        time.sleep(5)
        if item.get_attribute('href') != '':
            print(Odds().odds(driver, str(item.get_attribute('href'))))

def url_finder2(self, URL):
    driver = webdriver.Chrome("/usr/local/bin/chromedriver 2")
    driver.get(URL)  # http://www.oddsportal.com/soccer/england/premier-league/
    Odds().odds_finder(driver)
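An alternative sketch that sidesteps stale references entirely: copy the href values into plain strings before navigating anywhere, then loop over those strings (get_attribute returns a string, so the collected list survives later page loads):
def odds_finder(self, driver):
    # Read the href strings before navigating anywhere: WebElement references
    # go stale as soon as the page changes, but plain strings do not.
    links = [item.get_attribute('href')
             for item in driver.find_elements_by_xpath('//*[@id="tournamentTable"]/tbody/tr/td/a')]
    for href in links:
        if href:  # skip empty hrefs
            print(Odds().odds(driver, str(href)))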
I need a Python script that gets my Google AdSense earnings, and I found adsense_scraper:
http://pypi.python.org/pypi/adsense_scraper/0.5
It uses Twill and html5lib to scrape Google AdSense earnings data. When I use it I get this error message:
Traceback (most recent call last):
File "adsense_scraper.py", line 163, in <module>
data = main()
File "adsense_scraper.py", line 154, in main
b = get_adsense(login, password)
File "adsense_scraper.py", line 128, in get_adsense
b.submit()
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 467, in submit
self._journey('open', request)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 523, in _journey
r = func(*args, **kwargs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 590, in http_response
"http", request, response, code, msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\utils.py", line 442, in http_response
"refresh", msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 181, in open
response = urlopen(self, req, data)
File "C:\Python26\lib\urllib2.py", line 406, in _open 'unknown_open', req)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain result = func(*args)
File "C:\Python26\lib\urllib2.py", line 1163, in unknown_open raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>
So the important thing is:
urllib2.URLError: <urlopen error unknown url type: 'http>
Can somebody tell me where the error is? Is there perhaps a better way to get the data via Python? Thanks
There are several errors with the package; you mentioned only the first one.
1) The twill package does not handle Google's redirects correctly. Adding
newurl = newurl.strip("'")
to twill/other_packages/_mechanize_dist/_http.py:108, before
newurl = _rfc3986.clean_url(newurl, "latin-1")
fixes that.
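For clarity, a sketch of how that section of _http.py reads once the line is added (surrounding redirect-handling code elided):
# twill/other_packages/_mechanize_dist/_http.py, around line 108
newurl = newurl.strip("'")                      # added: drop the stray quote in Google's redirect URL
newurl = _rfc3986.clean_url(newurl, "latin-1")  # existing line, unchanged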
2) You have to have the correct language set in AdSense: English.
3) There are several problems in the original adsense_scraper:
#!/usr/bin/env python
"""Scrapes Google AdSense data with Python using Twill

Current canonical location of this module is here:
http://github.com/etrepum/adsense_scraper/tree/master

Usage::

    from adsense_scraper import get_adsense, get_time_period
    b = get_adsense('YOUR_ADSENSE_LOGIN', 'YOUR_ADSENSE_PASSWORD')
    rows = get_time_period(b, 'yesterday')
    # The summary data is always the first row with channel == ''
    print 'I earned this much yesterday: $%(earnings)s' % rows[0]

"""
# requires html5lib, twill
import sys
import pprint
import decimal
from cStringIO import StringIO
from xml.etree import cElementTree

try:
    from html5lib import HTMLParser
    import twill.commands
except ImportError:
    print >>sys.stderr, """\
adsense_scraper has dependencies::

    Twill 0.9 http://twill.idyll.org/
    html5lib 0.11 http://code.google.com/p/html5lib/

Try this::

    $ easy_install twill html5lib
"""
    raise SystemExit()

__version__ = '0.5'

SERVICE_LOGIN_BOX_URL = "https://www.google.com/accounts/ServiceLogin?service=adsense&rm=hide&fpui=3&nui=15&alwf=true&ltmpl=adsense&passive=true&continue=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&followup=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&hl=en_US"
OVERVIEW_URL = "https://www.google.com/adsense/report/overview?timePeriod="

TIME_PERIODS = [
    'today',
    'yesterday',
    'thismonth',
    'lastmonth',
    'sincelastpayment',
]


def parse_decimal(s):
    """Return an int or decimal.Decimal given a human-readable number
    """
    light_stripped = s.strip(u'\u20ac')
    stripped = light_stripped.replace(',', '.').rstrip('%').lstrip('$')
    try:
        int(stripped)
        return light_stripped
    except ValueError:
        pass
    try:
        float(stripped)
        return light_stripped
    except ValueError:
        return decimal.Decimal(stripped)


def parse_summary_table(doc):
    """
    Parse the etree doc for summarytable, returns::

        [{'channel': unicode,
          'impressions': int,
          'clicks': int,
          'ctr': decimal.Decimal,
          'ecpm': decimal.Decimal,
          'earnings': decimal.Decimal}]

    """
    for t in doc.findall('.//table'):
        if t.attrib.get('id') == 'summarytable':
            break
    else:
        raise ValueError("summary table not found")

    res = []
    FIELDS = ['impressions', 'clicks', 'ctr', 'ecpm', 'earnings']
    for row in t.findall('.//tr'):
        celltext = []
        for c in row.findall('td'):
            tail = ''
            # adsense inserts an empty span if a row has a period in it, so
            # get the children and find the tail element to append to the text
            if c.find('a') and c.find('a').getchildren():
                tail = c.find('a').getchildren()[0].tail or ''
            celltext.append('%s%s' % ((c.text or c.findtext('a') or '').strip(), tail.strip()))
        celltext = filter(lambda x: x != "", celltext)
        if len(celltext) != len(FIELDS):
            continue
        try:
            value_cols = map(parse_decimal, celltext)
        except decimal.InvalidOperation:
            continue
        res.append(dict(zip(FIELDS, value_cols)))
    return res


def get_adsense(login, password):
    """Returns a twill browser instance after having logged in to AdSense
    with *login* and *password*.

    The returned browser will have all of the appropriate cookies set but may
    not be at the exact page that you want data from.

    """
    b = twill.commands.get_browser()
    b.go(SERVICE_LOGIN_BOX_URL)
    for form in b.get_all_forms():
        try:
            form['Email'] = login
            form['Passwd'] = password
        except ValueError:
            continue
        else:
            break
    else:
        raise ValueError("Could not find login form on page")
    b._browser.select_form(predicate=lambda f: f is form)
    b.submit()
    return b


def get_time_period(b, period):
    """Returns the parsed summarytable for the time period *period* given
    *b* which should be the result of a get_adsense call. *period* must be
    a time period that AdSense supports:
    ``'today'``, ``'yesterday'``, ``'thismonth'``,
    ``'lastmonth'``, ``'sincelastpayment'``.

    """
    b.go(OVERVIEW_URL + period)
    # The cElementTree treebuilder doesn't work reliably enough
    # to use directly, so we parse and then dump into cElementTree.
    doc = cElementTree.fromstring(HTMLParser().parse(b.get_html()).toxml())
    return parse_summary_table(doc)


def main():
    try:
        login, password = sys.argv[1:]
    except ValueError:
        raise SystemExit("usage: %s LOGIN PASSWORD" % (sys.argv[0],))
    twill.set_output(StringIO())
    twill.commands.reset_browser()
    b = get_adsense(login, password)
    data = {}
    for period in TIME_PERIODS:
        data[period] = get_time_period(b, period)
    pprint.pprint(data)
    twill.set_output(None)
    return data


if __name__ == '__main__':
    data = main()