I tried debugging my program with print statements to see what was going on during each iteration.
This part works fine:
The program goes through a total of 50 combinations of the drop-down menus (25 for each year).
This part isn't working:
However, for some reason the totals dictionary is only storing the inputs from the second iteration of the initial "year" for-loop. It is returning a dictionary with a length of 25 (only half of what I actually want).
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
# General Stuff about the website
path = '/Users/admin/desktop/projects/scraper/chromedriver'
options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")
driver = webdriver.Chrome(options=options, executable_path=path)
website = 'http://siops.datasus.gov.br/filtro_rel_ges_covid_municipal.php'
driver.get(website)
# Initial Test: printing the title
print(driver.title)
print()
# Dictionary to Store stuff in
totals = {}
### Drop Down Menus ###
state_select = Select(driver.find_element(By.XPATH, '//*[@id="cmbUF"]'))
state_options = state_select.options
year_select = Select(driver.find_element(By.XPATH, '//*[@id="cmbAno"]'))
year_options = year_select.options
# county_select = Select(driver.find_element(By.XPATH, '//*[@id="cmbMunicipio"]'))
# county_select.select_by_value('120025')
# report_select = Select(driver.find_element(By.XPATH, '//*[@id="gesRelatorio"]'))
# report_select.select_by_value('rel_ges_covid_rep_uniao_municipal.php')
# period_select = Select(driver.find_element(By.XPATH, '//*[@id="cmbPeriodo"]'))
# period_select.select_by_value('14')
### Loop through all combinations ###
for year in range(1, 3):
    year_select = Select(driver.find_element(By.XPATH, '//*[@id="cmbAno"]'))
    year_select.select_by_index(year)
    for index in range(0, len(state_options) - 1):
        state_select = Select(driver.find_element(By.XPATH, '//*[@id="cmbUF"]'))
        state_select.select_by_index(index)
        # Click the Submit Button
        submit_button = driver.find_element(By.XPATH, '//*[@id="container"]/div[2]/form/div[2]/div/input[2]')
        submit_button.click()
        # Pulling data from the webpage
        nameof = driver.find_element(By.XPATH, '//*[@id="arearelatorio"]/div[1]/div/table[1]/tbody/tr[2]').text
        total_balance = driver.find_element(By.XPATH, '//*[@id="arearelatorio"]/div[1]/div/table[3]/tbody/tr[9]/td[2]').text
        paid_expenses = driver.find_element(By.XPATH, '//*[@id="arearelatorio"]/div[1]/div/table[4]/tbody/tr[11]/td[4]').text
        # Update Dictionary with the new info
        totals.update({nameof: [total_balance, paid_expenses, year]})
        print([nameof, year])
        driver.back()
# Print the final Dictionary and quit
print(len(totals))
print(totals)
driver.quit()
Alex Karamfilov figured it out with his comment:
"Just a wild guess but is it possible that you overwrite the value for the same key in the dictionary. Since this is a dictionary and keys should be unique, this might be the reason to have only the second iteration values "
It was a dumb error on my part. The keys were the same during each iteration, so it was just modifying the values rather than creating a new key-value pair.
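To see why the dictionary stayed at length 25, here is a minimal sketch with a made-up county name; one possible fix is keying on a (name, year) tuple:
# Two years of data for the same county: a plain string key gets overwritten.
totals = {}
totals.update({"Acrelandia": ["100,00", "50,00", 1]})
totals.update({"Acrelandia": ["200,00", "75,00", 2]})  # replaces the year-1 entry
print(len(totals))  # 1

# Keying on (name, year) keeps the two iterations distinct.
totals = {}
totals[("Acrelandia", 1)] = ["100,00", "50,00"]
totals[("Acrelandia", 2)] = ["200,00", "75,00"]
print(len(totals))  # 2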
Related
I tried to extract data from the site below, but I don't know how to put the XPath inside the for loop, because the XPath needs to be built as a string on each iteration. Could you do me a favor and help me:
Site: https://www150.statcan.gc.ca/n1/pub/71-607-x/2021004/imp-eng.htm?r1=(1)&r2=0&r3=0&r4=12&r5=0&r7=0&r8=2022-01-01&r9=2022-05-01
from selenium import webdriver
import pandas as pd
import time
driver = webdriver.Chrome(r'C:\Webdriver\chromedriver.exe')
driver.get('https://www150.statcan.gc.ca/n1/pub/71-607-x/71-607-x2021004-eng.htm')
time.sleep(2)
# finding the button using ID
button = driver.find_element_by_xpath('//*[@id="cimt_import"]/p[1]/a')
# clicking on the button
button.click()
time.sleep(2)
# finding the button using ID
button = driver.find_element_by_xpath('//*[@id="topic3s"]')
# clicking on the button
button.click()
time.sleep(2)
# finding the start year:2022 from scroll
element_drop_down_startYear = driver.find_element_by_xpath('//*[@id="fromYear"]/option[1]')
element_drop_down_startYear.click()
# finding the start month from:January scroll
element_drop_down_startMonth = driver.find_element_by_xpath('//*[@id="fromMonth"]/option[1]')
element_drop_down_startMonth.click()
# finding the End year from scroll
element_drop_down_endYear = driver.find_element_by_xpath('//*[@id="toYear"]/option[1]')
element_drop_down_endYear.click()
# finding the End month from scroll
element_drop_down_endmonth = driver.find_element_by_xpath('//*[@id="toMonth"]/option[5]')
element_drop_down_endmonth.click()
# finding the specific Chapter
element_drop_down_specificChapter = driver.find_element_by_xpath('//*[@id="report_hs"]/option[1]')
element_drop_down_specificChapter.click()
time.sleep(1)
# finding the specific Commodity from the list
element_drop_down_specific_commodity = driver.find_element_by_xpath('//*[@id="report_hs"]/option[2]')
element_drop_down_specific_commodity.click()
# clicking the button that generates the report
element_drop_down_specific_button = driver.find_element_by_xpath('//*[@id="report"]/div[1]/div[3]/div[5]/p[2]/button')
element_drop_down_specific_button.click()
#--------------------------------------------------------------------
cel = 1
for cel in range(25):
    x = driver.find_element_by_xpath('//*[@id="report_table"]/tbody/tr[1]/td[2]/a')
    print(x)
    print('//*[@id="report_table"]/tbody/tr[' + str(cel) + ']/td[4]')
    print('//*[@id="report_table"]/tbody/tr[' + str(cel) + ']/td[7]')
    print('//*[@id="report_table"]/tbody/tr[' + str(cel) + ']/td[8]/abbr')
    time.sleep(3)
You need to find the element before printing it; otherwise you're printing a string. I think what you want to do in each iteration of the for loop is print those selectors? If so, find the elements like so, then print them.
for i in range(1, 26):  # XPath indices are 1-based, so start at 1
    x = driver.find_element_by_xpath('//*[@id="report_table"]/tbody/tr[1]/td[2]/a')
    print(x)
    element_1 = driver.find_element_by_xpath(f'//*[@id="report_table"]/tbody/tr[{i}]/td[4]')
    element_2 = driver.find_element_by_xpath(f'//*[@id="report_table"]/tbody/tr[{i}]/td[7]')
    element_3 = driver.find_element_by_xpath(f'//*[@id="report_table"]/tbody/tr[{i}]/td[8]/abbr')
If you inspect the Network tab, you can see that the webpage is pulling the table data from
https://www150.statcan.gc.ca//t1/cimt/rest/getReport/(1)/0/0/12/0/150000/1/0/2022-01-01/2022-05-01
Scrape that json page instead:
import requests
r = requests.get('https://www150.statcan.gc.ca//t1/cimt/rest/getReport/(1)/0/0/12/0/150000/1/0/2022-01-01/2022-05-01')
print(r.json())
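Since the question already imports pandas, that JSON can likely go straight into a DataFrame. The exact keys depend on the API's response shape, so treat this as a sketch and inspect the payload first:
import requests
import pandas as pd

r = requests.get('https://www150.statcan.gc.ca//t1/cimt/rest/getReport/(1)/0/0/12/0/150000/1/0/2022-01-01/2022-05-01')
data = r.json()
# json_normalize flattens a list of records into columns; if the rows are
# nested under some key, pass that subtree instead of `data`.
df = pd.json_normalize(data)
print(df.head())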
I want to navigate through all the continents/countries here and collect the tables into a pandas data frame, but sometimes the process clicks on the same link a couple of times before continuing on to the next. This is my current implementation:
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
DRIVER_PATH = '/path/to/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=chrome_options)
driver.get('https://www.ertms.net/deployment-world-map/')
continents = driver.find_element(by='id', value='panel')
continent_names = continents.text.split()
# navigating through continent links
for i, cont in enumerate(continent_names):
    cont_buttons = driver.find_elements_by_class_name('accordion')
    continent_element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(cont_buttons[i + 1]))
    time.sleep(0.5)
    ActionChains(driver).move_to_element(continent_element).click().perform()
    time.sleep(3)
    child_buttons = driver.find_elements_by_class_name('accordion')
    # going through country links for each continent. Here is where the same link is sometimes clicked twice
    for j, country in enumerate(child_buttons):
        time.sleep(3)
        child_buttons = driver.find_elements_by_class_name('accordion')
        country_element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(child_buttons[j]))
        time.sleep(0.5)
        ActionChains(driver).move_to_element(country_element).click().perform()
        # going back to page with list of countries for current continent
        back_button = driver.find_element_by_class_name('go-back')
        driver.execute_script("arguments[0].click();", back_button)
        time.sleep(3)
    # going back to list of continents
    back_button = driver.find_element_by_class_name('go-back')
    driver.execute_script("arguments[0].click();", back_button)
    time.sleep(3)
I navigate around using EC.element_to_be_clickable and a combination of the By.LINK_TEXT and find_elements_by_class_name methods. Any advice on best practices would be appreciated.
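One pattern that can help with the double-click symptom is to wait for the page to actually change after each click instead of relying on fixed sleeps. A sketch, assuming each click replaces the accordion DOM (if the accordion only expands in place, the staleness wait will time out):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_once(driver, element, timeout=10):
    """Click an element, then block until it goes stale, i.e. the DOM it
    belonged to has been replaced, before locating anything else."""
    element.click()
    WebDriverWait(driver, timeout).until(EC.staleness_of(element))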
I have this link that I have to scrape parts from: https://partsurfer.hp.com.
To get to each part I first need to cycle through a series of clicks to make the elements visible.
My code, however, gets stuck after going through one cycle. I cannot seem to get it to go through several times. Please take a look at my code; I would appreciate any ideas to help me write the loop properly.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from time import sleep
options1 = webdriver.ChromeOptions()
options1.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(executable_path="~~chromedriver.exe", options=options1)
url = 'https://partsurfer.hp.com'
driver.get(url)
sleep(5)
# deal with popup
driver.find_element_by_xpath("//button[#id='onetrust-accept-btn-handler']").click()
# select country and reload the page
dropdowns = Select(driver.find_element_by_id('ctl00_BodyContentPlaceHolder_ddlCountry'))
dropdowns.select_by_value('ZA')
driver.refresh()
sleep(5)
# click hierarchy and cycle through the list to the parts
hierarchy_click = driver.find_element_by_xpath("//a[@id='ctl00_BodyContentPlaceHolder_aHierarchy']/span[@class='ie_bg']/span").click()
sleep(5)
category_list = driver.find_elements_by_xpath("//table[@width='650']/tbody/tr/td/a")
for a in category_list:
    breadcrumb1 = a.text
    print(breadcrumb1)
    a.click()
    sleep(2)
    series_list = driver.find_elements_by_xpath("//div[@id='ctl00_BodyContentPlaceHolder_HierarchyTreen1Nodes']/table/tbody/tr//a")
    for b in series_list:
        breadcrumb2 = b.text
        print(breadcrumb2)
        b.click()
        sleep(2)
        series_2 = driver.find_elements_by_xpath("//div[@id='ctl00_BodyContentPlaceHolder_HierarchyTreen2Nodes']/table/tbody/tr//a")
        for c in series_2:
            breadcrumb3 = c.text
            print(breadcrumb3)
            c.click()
            sleep(2)
            series_3 = driver.find_elements_by_xpath("//div[@id='ctl00_BodyContentPlaceHolder_HierarchyTreen3Nodes']/table/tbody/tr//a")
            for d in series_3:
                breadcrumb4 = d.text
                print(breadcrumb4)
                d.click()
                sleep(2)
                series_4 = driver.find_elements_by_xpath("//div[@id='ctl00_BodyContentPlaceHolder_HierarchyTreen4Nodes']/table/tbody/tr//a")
                for e in series_4:
                    breadcrumb5 = e.text
                    print(breadcrumb5)
                    e.click()
                    sleep(2)
                    models = driver.find_elements_by_xpath("//table[@class='table_sortable']/tbody//a")
                    for model in models:
                        model_num = model.text
                        print(model_num)
                        model.click()
                        sleep(5)
                        # model number = //span[@id='ctl00_BodyContentPlaceHolder_lblProductNumber']
                        table_rows = driver.find_elements_by_xpath("//div[@id='ctl00_BodyContentPlaceHolder_dvProdinfo']/table/tbody/tr")
                        for row in table_rows:
                            print(row.text)
My code isn't complete, so I would love any input to improve its efficiency.
wait=WebDriverWait(driver,10)
driver.get('https://partsurfer.hp.com')
# deal with popup
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"button#onetrust-accept-btn-handler"))).click()
# select country and reload the page
dropdowns = Select(driver.find_element_by_id('ctl00_BodyContentPlaceHolder_ddlCountry'))
dropdowns.select_by_value('ZA')
# click hierarchy and cycle through the list to the parts
wait.until(EC.element_to_be_clickable((By.XPATH,"//a[#id='ctl00_BodyContentPlaceHolder_aHierarchy']/span[#class='ie_bg']/span"))).click()
category_list = driver.find_elements_by_xpath("//table[#width='650']/tbody/tr/td/a")
for i in range(1,len(category_list)):
wait.until(EC.element_to_be_clickable((By.XPATH,"(//table[#width='650']/tbody/tr/td/a)["+str(i)+"]"))).click()
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"#ctl00_BodyContentPlaceHolder_HierarchyTreet0"))).click()
Here's a small demo to go through the top level. You want to use the Top Hierarchy to reset yourself.
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
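Extending that demo one level down might look roughly like this. It re-finds the links by index on each pass and resets through the top node; the HierarchyTreen1Nodes container ID is taken from the question and is an assumption here:
category_xpath = "(//table[@width='650']/tbody/tr/td/a)[%d]"
level1_xpath = "(//div[@id='ctl00_BodyContentPlaceHolder_HierarchyTreen1Nodes']/table/tbody/tr//a)[%d]"

for i in range(1, len(category_list) + 1):  # XPath indices are 1-based
    wait.until(EC.element_to_be_clickable((By.XPATH, category_xpath % i))).click()
    level1 = driver.find_elements_by_xpath(
        "//div[@id='ctl00_BodyContentPlaceHolder_HierarchyTreen1Nodes']/table/tbody/tr//a")
    for j in range(1, len(level1) + 1):
        wait.until(EC.element_to_be_clickable((By.XPATH, level1_xpath % j))).click()
        # ... scrape or descend further here ...
        # Reset to the top of the hierarchy, then re-open the same category
        # so the level-1 list is visible again for the next j.
        wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#ctl00_BodyContentPlaceHolder_HierarchyTreet0"))).click()
        wait.until(EC.element_to_be_clickable((By.XPATH, category_xpath % i))).click()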
I'm trying to go to every next page using the code below.
It collects data from page number 1, but when I try to loop it and go to the next page it gives me an error.
Web page : https://register.fca.org.uk/s/search?q=capital&type=Companies
this is the code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
url = 'https://register.fca.org.uk/s/search?q=capital&type=Companies'
service = Service('linkto crome driver')
service.start()
driver = webdriver.Remote(service.service_url)
driver.get(url)
time.sleep(12)
for j in range(346):
    divs = driver.find_elements_by_xpath('//div[@class="result-card_main"]')
    for i in range(len(divs)):
        time.sleep(10)
        d = driver.find_elements_by_xpath('//div[@class="result-card_main"]')
        RN = ''
        d[i].click()
        time.sleep(12)
        try:
            RNData = driver.find_elements_by_xpath('//*[@id="profile-header"]/div[1]/div/div/div/div/div/div[1]/div[2]/div/div')
            RN = RNData[0].text.split(':')[1].strip()
            print(RN)
        except Exception as e5:
            pass
        if i == (len(divs) - 1):
            pass
        else:
            driver.execute_script("window.history.go(-1)")
    bt = driver.find_elements_by_xpath('//*[@id="-pagination-next-btn"]')
    bt[0].click()
This is the error:
IndexError: list index out of range
How can I solve this problem?
I guess the problem is the following:
bt = driver.find_element_by_xpath('//*[@id="-pagination-next-btn"]')
returns a single web element object, not a list, so you can't apply indexing to it with bt[0].
UPD:
After changing from find_element_by_xpath to find_elements_by_xpath you are still getting IndexError: list index out of range because you were on an inner page and performed a driver back action.
Immediately after that you are trying to get the next-page button while the main page is still not loaded. This actually returns an empty list
bt = driver.find_elements_by_xpath('//*[@id="-pagination-next-btn"]')
and that's why you can't apply bt[0] to an empty list object.
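A common way around that race is to wait for the button explicitly instead of indexing into whatever list exists at that instant. A sketch using the locator from the question:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Blocks until the next-page button is clickable (raises TimeoutException
# after 20 seconds), so there is no empty list to index into.
next_btn = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="-pagination-next-btn"]')))
next_btn.click()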
Your problem is this:
if i == (len(divs) - 1):
    pass
else:
    driver.execute_script("window.history.go(-1)")
After clicking the last link, you are not navigating back to the initial page, which is where the pagination button is. I don't think you need this condition at all, so your code could be:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
url = 'https://register.fca.org.uk/s/search?q=capital&type=Companies'
service = Service('linkto crome driver')
service.start()
driver = webdriver.Remote(service.service_url)
driver.get(url)
time.sleep(12)
for j in range(346):
    divs = driver.find_elements_by_xpath('//div[@class="result-card_main"]')
    for i in range(len(divs)):
        time.sleep(10)
        d = driver.find_elements_by_xpath('//div[@class="result-card_main"]')
        RN = ''
        d[i].click()
        time.sleep(12)
        try:
            RNData = driver.find_elements_by_xpath('//*[@id="profile-header"]/div[1]/div/div/div/div/div/div[1]/div[2]/div/div')
            RN = RNData[0].text.split(':')[1].strip()
            print(RN)
        except Exception as e5:
            pass
        driver.execute_script("window.history.go(-1)")
    bt = driver.find_elements_by_xpath('//*[@id="-pagination-next-btn"]')
    bt[0].click()
I'm scraping this website using Python and Selenium. I have the code working, but it currently only scrapes the first page. I would like to iterate through all the pages and scrape them all, but they handle pagination in a weirdd way. How would I go through the pages and scrape them one by one?
Pagination HTML:
<div class="pagination">
First
Prev
1
<span class="current">2</span>
3
4
Next
Last
</div>
Scraper:
import re
import json
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
options = Options()
# options.add_argument('--headless')
options.add_argument("start-maximized")
options.add_argument('disable-infobars')
driver=webdriver.Chrome(chrome_options=options,
executable_path=r'/Users/weaabduljamac/Downloads/chromedriver')
url = 'https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList'
driver.get(url)
def getData():
    data = []
    rows = driver.find_element_by_xpath('//*[@id="form1"]/table/tbody').find_elements_by_tag_name('tr')
    for row in rows:
        app_number = row.find_elements_by_tag_name('td')[1].text
        address = row.find_elements_by_tag_name('td')[2].text
        proposals = row.find_elements_by_tag_name('td')[3].text
        status = row.find_elements_by_tag_name('td')[4].text
        data.append({"CaseRef": app_number, "address": address, "proposals": proposals, "status": status})
    print(data)
    return data
def main():
    all_data = []
    select = Select(driver.find_element_by_xpath("//select[@class='formitem' and @id='selWeek']"))
    list_options = select.options
    for item in range(len(list_options)):
        select = Select(driver.find_element_by_xpath("//select[@class='formitem' and @id='selWeek']"))
        select.select_by_index(str(item))
        driver.find_element_by_css_selector("input.formbutton#csbtnSearch").click()
        all_data.extend(getData())
        driver.find_element_by_xpath('//*[@id="form1"]/div[3]/a[4]').click()
        driver.get(url)
    with open('wiltshire.json', 'w+') as f:
        json.dump(all_data, f)
    driver.quit()

if __name__ == "__main__":
    main()
Before moving on to automating any scenario, always write down the manual steps you would perform to execute the scenario. The manual steps for what you want to do (as I understand from the question) are -
1) Go to site - https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList
2) Select first week option
3) Click search
4) Get the data from every page
5) Load the url again
6) Select second week option
7) Click search
8) Get the data from every page
.. and so on.
You have a loop to select different weeks, but inside each iteration for the week option you also need a loop to iterate over all the pages. Since you are not doing that, your code returns only the data from the first page.
Another problem is with how you are locating the 'Next' button -
driver.find_element_by_xpath('//*[#id="form1"]/div[3]/a[4]').click()
You are selecting the 4th <a> element, which is of course not robust because the Next button's index will differ from page to page. Instead, use this better locator -
driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
Logic for creating the loop which will iterate through the pages -
First you will need the number of pages. I did that by locating the <a> immediately before the "Next" button; in this pagination layout, the text of that element is equal to the total number of pages. I did that using the following code -
number_of_pages = int(driver.find_element_by_xpath("//a[contains(text(),'Next')]/preceding-sibling::a[1]").text)
Now once you have the number of pages in number_of_pages, you only need to click the "Next" button number_of_pages - 1 times!
Final code for your main function -
import time  # needed for the sleep between page clicks; not in the original imports

def main():
    all_data = []
    select = Select(driver.find_element_by_xpath("//select[@class='formitem' and @id='selWeek']"))
    list_options = select.options
    for item in range(len(list_options)):
        select = Select(driver.find_element_by_xpath("//select[@class='formitem' and @id='selWeek']"))
        select.select_by_index(str(item))
        driver.find_element_by_css_selector("input.formbutton#csbtnSearch").click()
        number_of_pages = int(driver.find_element_by_xpath("//a[contains(text(),'Next')]/preceding-sibling::a[1]").text)
        for j in range(number_of_pages - 1):
            all_data.extend(getData())
            driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
            time.sleep(1)
        all_data.extend(getData())  # the last page still needs to be scraped
        driver.get(url)
    with open('wiltshire.json', 'w+') as f:
        json.dump(all_data, f)
    driver.quit()
The following approach simply worked for me.
driver.find_element_by_link_text("3").click()
driver.find_element_by_link_text("4").click()
....
driver.find_element_by_link_text("Next").click()
First get the total number of pages in the pagination, using:
from bs4 import BeautifulSoup

ins.get('https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList/10702380,1')
ins.find_element_by_class_name("pagination")
source = BeautifulSoup(ins.page_source, 'html.parser')
div = source.find_all('div', {'class': 'pagination'})
all_as = div[0].find_all('a')
total = 0
for i in range(len(all_as)):
    if 'Next' in all_as[i].text:
        total = int(all_as[i - 1].text)  # the link just before "Next" holds the page count
        break
Now just loop through the range
for i in range(total):
    ins.get('https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList/10702380,{}'.format(i + 1))
keep incrementing the page number in the URL and get the source code for each page, then pull the data out of it.
Note: Don't forget the sleep when going from one page to another.
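Putting the pieces together, a rough sketch of that loop with the sleep included; the row extraction at the end is an assumption, so adjust the selectors to the actual table:
import time
from bs4 import BeautifulSoup

for i in range(total):
    ins.get('https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList/10702380,{}'.format(i + 1))
    time.sleep(2)  # let the page finish loading before reading it
    page = BeautifulSoup(ins.page_source, 'html.parser')
    # Hypothetical extraction: print the cells of each table row on the page.
    for row in page.find_all('tr'):
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        if cells:
            print(cells)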