I tried to extract data from the site below, but I don't know how to put the XPath in the for loop, because the loop needs to convert the XPath to a string. Could you do me a favor and help me?
Site: https://www150.statcan.gc.ca/n1/pub/71-607-x/2021004/imp-eng.htm?r1=(1)&r2=0&r3=0&r4=12&r5=0&r7=0&r8=2022-01-01&r9=2022-05-01
from selenium import webdriver
import pandas as pd
import time
driver = webdriver.Chrome(r'C:\Webdriver\chromedriver.exe')
driver.get('https://www150.statcan.gc.ca/n1/pub/71-607-x/71-607-x2021004-eng.htm')
time.sleep(2)
# finding the button using ID
button = driver.find_element_by_xpath('//*[@id="cimt_import"]/p[1]/a')
# clicking on the button
button.click()
time.sleep(2)
# finding the button using ID
button = driver.find_element_by_xpath('//*[@id="topic3s"]')
# clicking on the button
button.click()
time.sleep(2)
# finding the start year:2022 from scroll
element_drop_down_startYear = driver.find_element_by_xpath('//*[@id="fromYear"]/option[1]')
element_drop_down_startYear.click()
# finding the start month from:January scroll
element_drop_down_startMonth = driver.find_element_by_xpath('//*[@id="fromMonth"]/option[1]')
element_drop_down_startMonth.click()
# finding the End year from scroll
element_drop_down_endYear = driver.find_element_by_xpath('//*[@id="toYear"]/option[1]')
element_drop_down_endYear.click()
# finding the End month from scroll
element_drop_down_endmonth = driver.find_element_by_xpath('//*[@id="toMonth"]/option[5]')
element_drop_down_endmonth.click()
# finding the specific Chapter
element_drop_down_specificChapter = driver.find_element_by_xpath('//*[@id="report_hs"]/option[1]')
element_drop_down_specificChapter.click()
time.sleep(1)
# finding the specific Commodity from the list
element_drop_down_specific_commodity = driver.find_element_by_xpath('//*[@id="report_hs"]/option[2]')
element_drop_down_specific_commodity.click()
# finding the specific Commodity from the list
element_drop_down_specific_button = driver.find_element_by_xpath('//*[@id="report"]/div[1]/div[3]/div[5]/p[2]/button')
element_drop_down_specific_button.click()
#--------------------------------------------------------------------
cel = 1
for cel in range(25):
    x = driver.find_element_by_xpath('//*[@id="report_table"]/tbody/tr[1]/td[2]/a')
    print(x)
    print('//*[@id="report_table"]/tbody/tr[' + cel + ']/td[4]')
    print('//*[@id="report_table"]/tbody/tr[' + cel + ']/td[7]')
    print('//*[@id="report_table"]/tbody/tr[' + cel + ']/td[8]/abbr')
time.sleep(3)
You need to find the element before printing it; otherwise you're printing a string. I think what you want to do in each iteration of the for loop is print those selectors? If so, find the elements like this, then print them.
for i in range(1, 26):  # XPath row indices start at 1, not 0
    x = driver.find_element_by_xpath('//*[@id="report_table"]/tbody/tr[1]/td[2]/a')
    print(x)
    element_1 = driver.find_element_by_xpath(f'//*[@id="report_table"]/tbody/tr[{i}]/td[4]')
    element_2 = driver.find_element_by_xpath(f'//*[@id="report_table"]/tbody/tr[{i}]/td[7]')
    element_3 = driver.find_element_by_xpath(f'//*[@id="report_table"]/tbody/tr[{i}]/td[8]/abbr')
If you inspect the Network tab, you can see that the webpage is pulling the table data from
https://www150.statcan.gc.ca//t1/cimt/rest/getReport/(1)/0/0/12/0/150000/1/0/2022-01-01/2022-05-01
Scrape that json page instead:
import requests
r = requests.get('https://www150.statcan.gc.ca//t1/cimt/rest/getReport/(1)/0/0/12/0/150000/1/0/2022-01-01/2022-05-01')
print(r.json())
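If you then want that JSON in a pandas DataFrame (which is what the Selenium attempt was building toward), something along these lines should work. This is only a sketch: which part of the payload actually holds the table rows is an assumption, so inspect r.json() first.
import requests
import pandas as pd

r = requests.get('https://www150.statcan.gc.ca//t1/cimt/rest/getReport/(1)/0/0/12/0/150000/1/0/2022-01-01/2022-05-01')
data = r.json()
# json_normalize flattens nested JSON into columns; the payload layout is an
# assumption here -- adjust once you have looked at the real response
df = pd.json_normalize(data)
print(df.head())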
I want to retrieve data from the link below. The first page can be retrieved, but I have a problem putting the loop in place for the next pages until the end. Can you help me and complete my code?
My link is:
https://www150.statcan.gc.ca/n1/pub/71-607-x/2021004/exp-eng.htm?r1=(1)&r2=0&r3=0&r4=12&r5=0&r7=0&r8=2022-02-01&r9=2022-02-01
from selenium import webdriver
import time
url = "https://www150.statcan.gc.ca/n1/pub/71-607-x/2021004/exp-eng.htm?r1=(1)&r2=0&r3=0&r4=12&r5=0&r7=0&r8=2022-02-01&r9=2022-02-01"
driver = webdriver.Chrome("C:\Program Files\Python310\chromedriver.exe")
driver.get(url)
table = driver.find_element_by_id('report_table')
body = table.find_element_by_tag_name('tbody')
cells = body.find_elements_by_tag_name('td')
for cell in cells:
    print(cell.text)
It brings the first page's data, but I don't know how to retrieve the others.
Look for the next-page selector and iterate over it; if it is there, click it after your extraction part. You want to do that in, for example, a while loop which you break out of when the selector can't be found.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
url = "https://www150.statcan.gc.ca/n1/pub/71-607-x/2021004/exp-eng.htm?r1=(1)&r2=0&r3=0&r4=12&r5=0&r7=0&r8=2022-02-01&r9=2022-02-01"
driver = webdriver.Chrome("C:\Program Files\Python310\chromedriver.exe")
driver.get(url)
table = driver.find_element_by_id('report_table')
body = table.find_element_by_tag_name('tbody')
cells = body.find_elements_by_tag_name('td')
for cell in cells:
    print(cell.text)
while True:
    # find_element raises if nothing matches, so use find_elements and check the list
    next_pages = driver.find_elements(By.XPATH, '//a[@id="report_results_next"]')
    if next_pages:
        # steps to extract if there is a next page
        next_pages[0].click()
        table = driver.find_element_by_id('report_table')
        body = table.find_element_by_tag_name('tbody')
        cells = body.find_elements_by_tag_name('td')
        for cell in cells:
            print(cell.text)
    else:
        # stop
        break
This is not tested.
What I mean is that the website I'm using has two drop-down menus named province with the exact same id, so how do I tell Python which drop-down in particular I want to select? Of course, this is assuming that the issue is that Python always picks the first id it sees.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
# There are two drop-down menus with the same XPath. The first time it works fine;
# the 2nd time it throws an error about the element not being interactable
Prov = Select(web.find_element_by_xpath('//*[@id="province"]'))
Prov.select_by_index(2)
def Start():
    # once opened it will fill in the confirm your age
    Day = Select(web.find_element_by_xpath('//*[@id="bday_day"]'))
    Day.select_by_index(2)
    Month = Select(web.find_element_by_xpath('//*[@id="bday_month"]'))
    Month.select_by_index(4)
    Month = Select(web.find_element_by_xpath('//*[@id="bday_year"]'))
    Month.select_by_index(24)
    Prov = Select(web.find_element_by_xpath('//*[@id="province"]'))
    Prov.select_by_index(5)
    Button = web.find_element_by_xpath('//*[@id="popup-subscribe"]/button')
    Button.click()
# have to go through select your birthday
Start()
# 2 seconds is enough for the website to load
time.sleep(2)
# this throws an error.
Prov = Select(web.find_element_by_xpath('//*[@id="province"]'))
Prov.select_by_index(5)
Selenium has two families of functions:
find_element_by_... (without s in the word element) to get only the first element
find_elements_by_... (with s in the word elements) to get all elements
Selenium doc: 4. Locating Elements
So you can get all elements as a list (even if there is only one element in the HTML).
(If there are no elements, then you get an empty list.)
elements = web.find_elements_by_xpath('//*[@id="province"]')
and later slice it
first = elements[0]
second = elements[1]
last = elements[-1]
list_first_and_second = elements[:2]
EDIT:
You can also try to slice directly in xpath like
(it starts counting at one, not zero)
'//*[#id="province"][2]'
or maybe
'(//*[#id="province"])[2]'
but I never used it to confirm if it will work.
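For what it's worth, the parenthesised form is the one that usually does what you want here: it indexes into the full document-wide result set, while the unparenthesised form indexes within each parent. A minimal sketch (untested against this site, reusing the duplicated province id from the question):
# '//*[@id="province"][2]' asks for an element that is the 2nd "province"
# child of its own parent -- with duplicated ids this usually matches nothing.
# '(//*[@id="province"])[2]' takes all matches in the document, then the 2nd.
second = web.find_element_by_xpath('(//*[@id="province"])[2]')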
BTW:
All IDs should be unique - you shouldn't duplicate IDs.
If you check the documentation (4. Locating Elements), you see that there is find_element_by_id (without the char s in the word element) to get the first and only element with some ID - but there is no find_elements_by_id (with the char s in the word elements) to get more than one element with some ID.
EDIT:
Minimal working code, with example HTML embedded in the code
from selenium import webdriver
from selenium.webdriver.support.ui import Select
html = '''
<select id="province">
<option value="value_A">A</options>
<option value="value_B">B</options>
</select>
<select id="province">
<option value="value_1">1</options>
<option value="value_2">2</options>
</select>
'''
driver = webdriver.Firefox()
driver.get("data:text/html;charset=utf-8," + html)
all_elements = driver.find_elements_by_xpath('//*[@id="province"]')
first = all_elements[0]
second = all_elements[1]
prov1 = Select(first)
prov2 = Select(second)
print('--- first ---')
for item in prov1.options:
    print('option:', item.text, item.get_attribute('value'))
for item in prov1.all_selected_options:
    print('selected:', item.text, item.get_attribute('value'))
print('--- second ---')
for item in prov2.options:
    print('option:', item.text, item.get_attribute('value'))
for item in prov2.all_selected_options:
    print('selected:', item.text, item.get_attribute('value'))
EDIT:
There are two province elements.
When you use find_element in Start, you get the first province in the popup - and you can fill it. When you click the button, it closes this popup, but it doesn't remove the first province from the HTML - it only hides it.
Later, when you use find_element again, you get the first (now hidden) province - and this time it is not visible and can't be used - and this gives the error. You have to use the second province, like in this example.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
def Start():
    # once opened it will fill in the confirm your age
    Day = Select(web.find_element_by_xpath('//*[@id="bday_day"]'))
    Day.select_by_index(2)
    Month = Select(web.find_element_by_xpath('//*[@id="bday_month"]'))
    Month.select_by_index(4)
    Month = Select(web.find_element_by_xpath('//*[@id="bday_year"]'))
    Month.select_by_index(24)
    # it uses first `province`
    Prov = Select(web.find_element_by_xpath('//*[@id="province"]'))
    Prov.select_by_index(5)
    Button = web.find_element_by_xpath('//*[@id="popup-subscribe"]/button')
    Button.click()
web = webdriver.Firefox()
web.get('https://www.tastyrewards.com/en-ca/contest/fritolaycontest/participate')
# have to go through select your birthday
Start()
# 2 seconds is enough for the website to load
time.sleep(2)
# `find_elements` with `s` - to get second `province`
all_province = web.find_elements_by_xpath('//*[@id="province"]')
second_province = all_province[1]
Prov = Select(second_province)
Prov.select_by_index(5)
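Another option, since the first province is merely hidden rather than removed from the HTML, is to filter on visibility instead of hard-coding the index. A sketch under the same assumptions as the code above:
# pick whichever duplicate is currently visible, instead of assuming
# the visible one is always second in document order
visible = [el for el in web.find_elements_by_xpath('//*[@id="province"]')
           if el.is_displayed()]
Prov = Select(visible[0])
Prov.select_by_index(5)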
I am trying to scrape data from a website that has a multilevel drop-down menu; every time an item is selected, it changes the sub-items in the dependent drop-downs.
The problem is that in every loop it extracts the same sub-items from the drop-down items. The selection happens, but it does not update the items based on the new selection from the loop.
Can anyone help me figure out why I am not getting the desired results?
Perhaps this is because my drop-down list is in JavaScript or something.
For instance, like this menu in the picture below.
I have gone this far:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import csv
# from selenium.webdriver.support import Select
import time
print("opening chrome....")
driver = webdriver.Chrome()
driver.get('https://www.wheelmax.com/')
time.sleep(10)
csvData = ['Year', 'Make', 'Model', 'Body', 'Submodel', 'Size']
# variables
yeart = []
make = []
model = []
body = []
submodel = []
size = []
Yindex = Mkindex = Mdindex = Bdindex = Smindex = Sindex = 0
print ("waiting for program to set variables....")
time.sleep(20)
print ("initializing and setting variables....")
# initializing Year
Year = Select(driver.find_element_by_id("icm-years-select"))
Year.select_by_value('2020')
yr = driver.find_elements(By.XPATH, '//*[@id="icm-years-select"]')
time.sleep(15)
# initializing Make
Make = Select(driver.find_element_by_id("icm-makes-select"))
Make.select_by_index(1)
mk = driver.find_elements(By.XPATH, '//*[@id="icm-makes-select"]')
time.sleep(15)
# initializing Model
Model = Select(driver.find_element_by_id("icm-models-select"))
Model.select_by_index(1)
mdl = driver.find_elements(By.XPATH, '//*[@id="icm-models-select"]')
time.sleep(15)
# initializing body
Body = Select(driver.find_element_by_id("icm-drivebodies-select"))
Body.select_by_index(1)
bdy = driver.find_elements(By.XPATH, '//*[@id="icm-drivebodies-select"]')
time.sleep(15)
# initializing submodel
Submodel = Select(driver.find_element_by_id("icm-submodels-select"))
Submodel.select_by_index(1)
sbm = driver.find_elements(By.XPATH, '//*[@id="icm-submodels-select"]')
time.sleep(15)
# initializing size
Size = Select(driver.find_element_by_id("icm-sizes-select"))
Size.select_by_index(0)
siz = driver.find_elements(By.XPATH, '//*[@id="icm-sizes-select"]')
time.sleep(5)
Cyr = Cmk = Cmd = Cbd = Csmd = Csz = ""
print ("fetching data from variables....")
for y in yr:
    obj1 = driver.find_element_by_id("icm-years-select")
    Year = Select(obj1)
    Year.select_by_index(++Yindex)
    obj1.click()
    #obj1.click()
    yeart.append(y.text)
    Cyr = y.text
    time.sleep(10)
    for m in mk:
        obj2 = driver.find_element_by_id("icm-makes-select")
        Make = Select(obj2)
        Make.select_by_index(++Mkindex)
        obj2.click()
        #obj2.click()
        make.append(m.text)
        Cmk = m.text
        time.sleep(10)
        for md in mdl:
            Mdindex = 0
            obj3 = driver.find_element_by_id("icm-models-select")
            Model = Select(obj3)
            Model.select_by_index(++Mdindex)
            obj3.click()
            #obj3.click(clickobj)
            model.append(md.text)
            Cmd = md.text
            time.sleep(10)
            Bdindex = 0
            for bd in bdy:
                obj4 = driver.find_element_by_id("icm-drivebodies-select")
                Body = Select(obj4)
                Body.select_by_index(++Bdindex)
                obj4.click()
                #obj4.click(clickobj2)
                body.append(bd.text)
                Cbd = bd.text
                time.sleep(10)
                Smindex = 0
                for sm in sbm:
                    obj5 = driver.find_element_by_id("icm-submodels-select")
                    Submodel = Select(obj5)
                    obj5.click()
                    Submodel.select_by_index(++Smindex)
                    #obj5.click(clickobj5)
                    submodel.append(sm.text)
                    Csmd = sm.text
                    time.sleep(10)
                    Sindex = 0
                    for sz in siz:
                        Size = Select(driver.find_element_by_id("icm-sizes-select"))
                        Size.select_by_index(++Sindex)
                        size.append(sz.text)
                        Scz = sz.text
                        csvData += [Cyr, Cmk, Cmd, Cbd, Csmd, Csz]
Because https://www.wheelmax.com has multilevel drop-down menus dependent on each other (for example, once you select an option in the Select Year drop-down, the Select Make drop-down is enabled and displays options based on the selected year), you basically need the Selenium package to handle the dynamic options.
Install the Selenium web driver matching your browser.
Download the Chrome web driver:
http://chromedriver.chromium.org/downloads
Install the web driver for the Chrome browser:
unzip ~/Downloads/chromedriver_linux64.zip -d ~/Downloads
chmod +x ~/Downloads/chromedriver
sudo mv -f ~/Downloads/chromedriver /usr/local/share/chromedriver
sudo ln -s /usr/local/share/chromedriver /usr/local/bin/chromedriver
sudo ln -s /usr/local/share/chromedriver /usr/bin/chromedriver
Selenium tutorial:
https://selenium-python.readthedocs.io/
E.g., using Selenium to select multiple drop-down options:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
driver = webdriver.Chrome()
driver.get('https://www.wheelmax.com/')
time.sleep(4)
selectYear = Select(driver.find_element_by_id("icm-years-select"))
selectYear.select_by_value('2019')
time.sleep(2)
selectMakes = Select(driver.find_element_by_id("icm-makes-select"))
selectMakes.select_by_value('58')
Update:
To read the drop-down option values or count the total options:
for option in selectYear.options:
    print(option.text)
print(len(selectYear.options))
See more:
How to extract data from a dropdown menu using python beautifulsoup
The page does a callback to populate with years. Simply mimic that.
If you actually need to change years and select from dependent drop-downs, which becomes a different question, you need browser automation, e.g. Selenium, or to perform this manually and inspect the network tab to see if there is an XHR request you can mimic to submit your choices.
import requests
r = requests.get('https://www.iconfigurators.com/json2/?returnType=json&bypass=true&id=13898&callback=yearObj').json()
years = [item['year'] for item in r['years']]
print(years)
I guess the reason you can't parse the years with Beautiful Soup is that the 'select' tag containing the 'option' tags with all the years is not present yet/is hidden at the moment when Beautiful Soup downloads the page. It is added to the DOM by executing additional JavaScript, I assume. If you look at the DOM of the loaded page using your browser's developer tools (for example F12 in Mozilla), you'll see that the tag containing the information you look for is <select id="icm-years-select">. If you try to parse for this tag with the object downloaded with Beautiful Soup, you get an empty list of tag objects:
from bs4 import BeautifulSoup
from requests import get
response = get('https://www.wheelmax.com/')
yourSoup = BeautifulSoup(response.text, "lxml")
print(len(yourSoup.select('div #vehicle-search')))  # length = 1 -> visible
print()
print(len(yourSoup.select('#icm-years-select')))  # length = 0 -> not visible
So if you want to get the years using Python by all means, you might try to click on the respective tag and then parse again using some combination of requests, Beautiful Soup, or the Selenium module, which will require a bit more digging :-)
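A sketch of that combination: let Selenium render the page, then hand the live DOM to Beautiful Soup. The fixed sleep is a crude stand-in for a proper wait, and the assumption is that the select is populated once the page's scripts have run:
from bs4 import BeautifulSoup
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get('https://www.wheelmax.com/')
time.sleep(5)  # crude wait for the JavaScript to populate the select

# driver.page_source is the DOM after script execution, unlike requests.get
soup = BeautifulSoup(driver.page_source, 'lxml')
years = [opt.get('value') for opt in soup.select('#icm-years-select option')]
print(years)
driver.quit()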
Otherwise if you just quickly need the years parsed, use JavaScript:
countYears = document.getElementById('icm-years-select').length;
yearArray = [];
for (i = 0; i < countYears; i++) {yearArray.push(document.getElementById('icm-years-select')[i].value)};
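And if you are already driving the page with Selenium from Python, you can run essentially that same JavaScript through execute_script, which returns the expression's value back to Python; a small sketch:
# assumes `driver` is a Selenium WebDriver with the page already loaded
years = driver.execute_script(
    "return Array.from(document.getElementById('icm-years-select').options)"
    ".map(function(o) { return o.value; });"
)
print(years)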
I'm scraping this website using Python and Selenium. I have the code working, but it currently only scrapes the first page. I would like to iterate through all the pages and scrape them all, but they handle pagination in a weird way. How would I go through the pages and scrape them one by one?
Pagination HTML:
<div class="pagination">
First
Prev
1
<span class="current">2</span>
3
4
Next
Last
</div>
Scraper:
import re
import json
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
options = Options()
# options.add_argument('--headless')
options.add_argument("start-maximized")
options.add_argument('disable-infobars')
driver = webdriver.Chrome(chrome_options=options,
                          executable_path=r'/Users/weaabduljamac/Downloads/chromedriver')
url = 'https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList'
driver.get(url)
def getData():
    data = []
    rows = driver.find_element_by_xpath('//*[@id="form1"]/table/tbody').find_elements_by_tag_name('tr')
    for row in rows:
        app_number = row.find_elements_by_tag_name('td')[1].text
        address = row.find_elements_by_tag_name('td')[2].text
        proposals = row.find_elements_by_tag_name('td')[3].text
        status = row.find_elements_by_tag_name('td')[4].text
        data.append({"CaseRef": app_number, "address": address, "proposals": proposals, "status": status})
    print(data)
    return data
def main():
    all_data = []
    select = Select(driver.find_element_by_xpath("//select[@class='formitem' and @id='selWeek']"))
    list_options = select.options
    for item in range(len(list_options)):
        select = Select(driver.find_element_by_xpath("//select[@class='formitem' and @id='selWeek']"))
        select.select_by_index(str(item))
        driver.find_element_by_css_selector("input.formbutton#csbtnSearch").click()
        all_data.extend(getData())
        driver.find_element_by_xpath('//*[@id="form1"]/div[3]/a[4]').click()
        driver.get(url)
    with open('wiltshire.json', 'w+') as f:
        json.dump(all_data, f)
    driver.quit()
if __name__ == "__main__":
    main()
Before moving on to automating any scenario, always write down the manual steps you would perform to execute the scenario. The manual steps for what you want to do (as I understand from the question) are -
1) Go to site - https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList
2) Select first week option
3) Click search
4) Get the data from every page
5) Load the url again
6) Select second week option
7) Click search
8) Get the data from every page
.. and so on.
You have a loop to select different weeks, but inside each loop iteration for the week option you also need to include a loop to iterate over all the pages. Since you are not doing that, your code returns only the data from the first page.
Another problem is with how you are locating the 'Next' button -
driver.find_element_by_xpath('//*[@id="form1"]/div[3]/a[4]').click()
You are selecting the 4th <a> element, which is of course not robust because on different pages the Next button's index will be different. Instead, use this better locator -
driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
Logic for creating the loop which will iterate through the pages -
First you will need the number of pages. I did that by locating the <a> immediately before the "Next" button. As per the screenshot, it is clear that the text of this element will be equal to the number of pages -
I did that using the following code -
number_of_pages = int(driver.find_element_by_xpath("//a[contains(text(),'Next')]/preceding-sibling::a[1]").text)
Now once you have the number of pages as number_of_pages, you only need to click the "Next" button number_of_pages - 1 times!
Final code for your main function -
import time  # needed for the sleep between page clicks

def main():
    all_data = []
    select = Select(driver.find_element_by_xpath("//select[@class='formitem' and @id='selWeek']"))
    list_options = select.options
    for item in range(len(list_options)):
        select = Select(driver.find_element_by_xpath("//select[@class='formitem' and @id='selWeek']"))
        select.select_by_index(str(item))
        driver.find_element_by_css_selector("input.formbutton#csbtnSearch").click()
        number_of_pages = int(driver.find_element_by_xpath("//a[contains(text(),'Next')]/preceding-sibling::a[1]").text)
        for j in range(number_of_pages - 1):
            all_data.extend(getData())
            driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
            time.sleep(1)
        driver.get(url)
    with open('wiltshire.json', 'w+') as f:
        json.dump(all_data, f)
    driver.quit()
The following approach simply worked for me.
driver.find_element_by_link_text("3").click()
driver.find_element_by_link_text("4").click()
....
driver.find_element_by_link_text("Next").click()
First get the total number of pages in the pagination, using:
from bs4 import BeautifulSoup

ins.get('https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList/10702380,1')
ins.find_element_by_class_name("pagination")
source = BeautifulSoup(ins.page_source)
div = source.find_all('div', {'class': 'pagination'})
all_as = div[0].find_all('a')
total = 0
for i in range(len(all_as)):
    if 'Next' in all_as[i].text:
        total = int(all_as[i - 1].text)
        break
Now just loop through the range:
for i in range(1, total + 1):
    ins.get('https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList/10702380,{}'.format(i))
The page number takes the place of the incrementing count; get the source code for each page and then extract the data from it.
Note: Don't forget the sleep when clicking to go from one page to another.
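On that note, an explicit wait is usually more reliable than a fixed sleep; a minimal sketch using WebDriverWait, assuming `ins` is the driver from the snippet above and borrowing the results-table locator from getData() (adjust it to whatever element signals that the page has loaded):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# block for up to 10 seconds until the results table is present,
# instead of sleeping a fixed amount after each page load
WebDriverWait(ins, 10).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="form1"]/table/tbody'))
)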
I have been trying to write a program to scrape the statistics from www.whoscored.com and create a pandas dataframe.
I have updated the code with the help of crookedleaf and this is the working code:
import time
import pandas as pd
from pandas.io.html import read_html
from pandas import DataFrame
from selenium import webdriver
driver = webdriver.Firefox()
driver.get('https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/PlayerStatistics/England-Premier-League-2016-2017')
summary_stats = DataFrame()
while True:
    while driver.find_element_by_xpath('//*[@id="statistics-table-summary"]').get_attribute('class') == 'is-updating':  # driver.find_element_by_xpath('//*[@id="statistics-table-summary-loading"]').get_attribute('style') == 'display; block;' or
        time.sleep(1)
    table = driver.find_element_by_xpath('//*[@id="statistics-table-summary"]')
    table_html = table.get_attribute('innerHTML')
    page_number = driver.find_element_by_xpath('//*[@id="currentPage"]').get_attribute('value')
    print('Page ' + page_number)
    df1 = read_html(table_html)[0]
    summary_stats = pd.concat([summary_stats, df1])
    next_link = driver.find_element_by_xpath('//*[@id="next"]')
    if 'disabled' in next_link.get_attribute('class'):
        break
    next_link.click()
print(summary_stats)
driver.close()
Now I am trying to gather the stats from the other tabs. I am really close, but the code does not exit the loop when it should break. Here is the code:
defensive_button = driver.find_element_by_xpath('//*[@id="stage-top-player-stats-options"]/li[2]/a')
defensive_button.click()
defensive_stats = DataFrame()
while True:
    while driver.find_element_by_xpath('//*[@id="statistics-table-defensive"]').get_attribute('class') == 'is-updating':  # driver.find_element_by_xpath('//*[@id="statistics-table-summary-loading"]').get_attribute('style') == 'display; block;' or
        time.sleep(1)
    table = driver.find_element_by_xpath('//*[@id="statistics-table-defensive"]')
    table_html = table.get_attribute('innerHTML')
    page_number = driver.find_element_by_xpath('//*[@id="statistics-paging-defensive"]/div/input[1]').get_attribute('value')
    print('Page ' + page_number)
    df2 = read_html(table_html)[0]
    defensive_stats = pd.concat([defensive_stats, df2])
    next_link = driver.find_element_by_xpath('//*[@id="statistics-paging-defensive"]/div/dl[2]/dd[3]')
    if 'disabled' in next_link.get_attribute('class'):
        break
    next_link.click()
print(defensive_stats)
This code loops through all the pages, but then keeps looping through the last page.
You are defining your table's code outside of your loop. You are navigating to the next page but not redefining your table and table_html elements. Move them to the first lines after while True.
EDIT: After making the changes to your code, my guess is that, due to the dynamically loaded content of the table, it is unable to process the changes or unable to get the content because of the "loading" graphic overlay. Another thing is that there may not always be 30 pages. Today, for example, there are 29, so it continuously gets the data from page 29. I modified your code to keep running until the "next" button is no longer enabled, and I put in a wait that checks whether the table is loading before continuing:
import time
from pandas.io.html import read_html
from pandas import DataFrame
from selenium import webdriver
driver = webdriver.Chrome(path-to-your-chromedriver)
driver.get('https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/PlayerStatistics/England-Premier-League-2016-2017')
df = DataFrame()
while True:
    while driver.find_element_by_xpath('//*[@id="statistics-table-summary"]').get_attribute('class') == 'is-updating':  # driver.find_element_by_xpath('//*[@id="statistics-table-summary-loading"]').get_attribute('style') == 'display; block;' or
        time.sleep(1)
    table = driver.find_element_by_xpath('//*[@id="statistics-table-summary"]')
    table_html = table.get_attribute('innerHTML')
    page_number = driver.find_element_by_xpath('//*[@id="currentPage"]').get_attribute('value')
    print('Page ' + page_number)
    df1 = read_html(table_html)[0]
    df.append(df1)
    next_link = driver.find_element_by_xpath('//*[@id="next"]')
    if 'disabled' in next_link.get_attribute('class'):
        break
    next_link.click()
print(df)
driver.close()
However, I am getting an empty DataFrame at the end of running this. I'm unfortunately not familiar enough with pandas to identify the issue, but it is related to df.append(). I ran this through with it printing the value of df1 at each loop, and it prints the correct data, but it does not add it to the DataFrame. This may be something you are familiar enough with to implement the changes needed to run it completely.
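For what it's worth, DataFrame.append returns a new DataFrame rather than modifying the original in place, which would be consistent with the empty result: the return value is discarded on each loop. A small sketch of an accumulation pattern that keeps the result (the two literal frames just stand in for the per-page tables):
import pandas as pd

frames = []
for page_df in (pd.DataFrame({'a': [1]}), pd.DataFrame({'a': [2]})):
    # collect each page's table in a plain list; pd.concat below keeps the
    # result, unlike calling DataFrame.append and discarding its return value
    frames.append(page_df)
combined = pd.concat(frames, ignore_index=True)
print(combined)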
EDIT 2: It took me a while to figure this one out. Essentially, the page's content is being dynamically loaded with JavaScript. The 'next' element you are declaring is still the first 'next' button you come across; each time you click a new tab, the number of 'next' elements increases. I have added an edit that successfully navigates across all tabs (except the 'detailed' tab... hopefully you don't need that one lol). I am, however, still getting empty DataFrame()s.
import time
import pandas as pd
from pandas.io.html import read_html
from pandas import DataFrame
from selenium import webdriver
driver = webdriver.Chrome('/home/mdrouin/Downloads/chromedriver')
driver.get('https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/PlayerStatistics/England-Premier-League-2016-2017')
statistics = {  # this is a list of all the tabs on the page
    'summary': DataFrame(),
    'defensive': DataFrame(),
    'offensive': DataFrame(),
    'passing': DataFrame()
}
count = 0
tabs = driver.find_element_by_xpath('//*[@id="stage-top-player-stats-options"]').find_elements_by_tag_name('li')  # this pulls all the tab elements
for tab in tabs[:-1]:  # iterate over the different tab sections
    section = tab.text.lower()
    driver.find_element_by_xpath('//*[@id="stage-top-player-stats-options"]').find_element_by_link_text(section.title()).click()  # clicks the actual tab by using the dictionary's key (.title() makes the first character in the string uppercase)
    time.sleep(3)
    while True:
        while driver.find_element_by_xpath('//*[@id="statistics-table-%s"]' % section).get_attribute('class') == 'is-updating':  # string formatting on the xpath to change for each section that is iterated over
            time.sleep(1)
        table = driver.find_element_by_xpath('//*[@id="statistics-table-%s"]' % section)  # string formatting on the xpath to change for each section that is iterated over
        table_html = table.get_attribute('innerHTML')
        df = read_html(table_html)[0]
        # print(df)
        pd.concat([statistics[section], df])
        next_link = driver.find_elements_by_xpath('//*[@id="next"]')[count]  # makes sure it's selecting the correct index of 'next' items
        if 'disabled' in next_link.get_attribute('class'):
            break
        time.sleep(5)
        next_link.click()
    count += 1
for df in statistics.values():  # iterates over the DataFrame() elements
    print(df)
driver.quit()