I am trying to scrape data from a website that has a multilevel drop-down menu every time an item is selected it changes the sub items for sub drop-downs.
problem is that for every loop it extracts same sub items from the drop down items. the selection happens but it do not update the items on behalf of new selection from loop
can any one help me why I am not getting the desired results.
Perhaps this is because my drop-down list is in java Script or something.
for instance like this manue in the picture below:
i have gone this far:
enter code here
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import csv
//#from selenium.webdriver.support import Select
import time
print ("opening chorome....")
driver = webdriver.Chrome()
driver.get('https://www.wheelmax.com/')
time.sleep(10)
csvData = ['Year', 'Make', 'Model', 'Body', 'Submodel', 'Size']
//#variables
yeart = []
make= []
model=[]
body = []
submodel = []
size = []
Yindex = Mkindex = Mdindex = Bdindex = Smindex = Sindex = 0
print ("waiting for program to set variables....")
time.sleep(20)
print ("initializing and setting variables....")
//#initializing Year
Year = Select(driver.find_element_by_id("icm-years-select"))
Year.select_by_value('2020')
yr = driver.find_elements(By.XPATH, '//*[#id="icm-years-select"]')
time.sleep(15)
//#initializing Make
Make = Select(driver.find_element_by_id("icm-makes-select"))
Make.select_by_index(1)
mk = driver.find_elements(By.XPATH, '//*[#id="icm-makes-select"]')
time.sleep(15)
//#initializing Model
Model = Select(driver.find_element_by_id("icm-models-select"))
Model.select_by_index(1)
mdl = driver.find_elements(By.XPATH, '//*[#id="icm-models-select"]')
time.sleep(15)
//#initializing body
Body = Select(driver.find_element_by_id("icm-drivebodies-select"))
Body.select_by_index(1)
bdy = driver.find_elements(By.XPATH, '//*[#id="icm-drivebodies-select"]')
time.sleep(15)
//#initializing submodel
Submodel = Select(driver.find_element_by_id("icm-submodels-select"))
Submodel.select_by_index(1)
sbm = driver.find_elements(By.XPATH, '//*[#id="icm-submodels-select"]')
time.sleep(15)
//#initializing size
Size = Select(driver.find_element_by_id("icm-sizes-select"))
Size.select_by_index(0)
siz = driver.find_elements(By.XPATH, '//*[#id="icm-sizes-select"]')
time.sleep(5)
Cyr = Cmk = Cmd = Cbd = Csmd = Csz = ""
print ("fetching data from variables....")
for y in yr:
obj1 = driver.find_element_by_id("icm-years-select")
Year = Select(obj1)
Year.select_by_index(++Yindex)
obj1.click()
#obj1.click()
yeart.append(y.text)
Cyr = y.text
time.sleep(10)
for m in mk:
obj2 = driver.find_element_by_id("icm-makes-select")
Make = Select(obj2)
Make.select_by_index(++Mkindex)
obj2.click()
#obj2.click()
make.append(m.text)
Cmk = m.text
time.sleep(10)
for md in mdl:
Mdindex =0
obj3 = driver.find_element_by_id("icm-models-select")
Model = Select(obj3)
Model.select_by_index(++Mdindex)
obj3.click()
#obj3.click(clickobj)
model.append(md.text)
Cmd = md.text
time.sleep(10)
Bdindex = 0
for bd in bdy:
obj4 = driver.find_element_by_id("icm-drivebodies-select")
Body = Select(obj4)
Body.select_by_index(++Bdindex)
obj4.click()
#obj4.click(clickobj2)
body.append(bd.text)
Cbd = bd.text
time.sleep(10)
Smindex = 0
for sm in sbm:
obj5 = driver.find_element_by_id("icm-submodels-select")
Submodel = Select(obj5)
obj5.click()
Submodel.select_by_index(++Smindex)
#obj5.click(clickobj5)
submodel.append(sm.text)
Csmd = sm.text
time.sleep(10)
Sindex = 0
for sz in siz:
Size = Select(driver.find_element_by_id("icm-sizes-select"))
Size.select_by_index(++Sindex)
size.append(sz.text)
Scz = sz.text
csvData += [Cyr, Cmk, Cmd, Cbd,Csmd, Csz]
Because of https://www.wheelmax.com has multilevel drop-down menu dependent on each other for example if you select Select Year drop down option, after selected year based on Select Make drop down is enable and display option based on the selected year option.
So basically you need to use Selenium package for handle dynamic option.
Install selenium web driver as per your browser
Download chrome web driver :
http://chromedriver.chromium.org/downloads
Install web driver for chrome browser:
unzip ~/Downloads/chromedriver_linux64.zip -d ~/Downloads
chmod +x ~/Downloads/chromedriver
sudo mv -f ~/Downloads/chromedriver /usr/local/share/chromedriver
sudo ln -s /usr/local/share/chromedriver /usr/local/bin/chromedriver
sudo ln -s /usr/local/share/chromedriver /usr/bin/chromedriver
selenium tutorial
https://selenium-python.readthedocs.io/
Eg. using selenium to select multiple dropdown options
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
driver = webdriver.Chrome()
driver.get('https://www.wheelmax.com/')
time.sleep(4)
selectYear = Select(driver.find_element_by_id("icm-years-select"))
selectYear.select_by_value('2019')
time.sleep(2)
selectMakes = Select(driver.find_element_by_id("icm-makes-select"))
selectMakes.select_by_value('58')
Update:
select drop down option value or count total options
for option in selectYear.options:
print(option.text)
print(len(selectYear.options))
Se more
How to extract data from a dropdown menu using python beautifulsoup
The page does a callback to populate with years. Simply mimic that.
If you actually need to change years and select from dependent drop downs, which becomes a different question, you need browser automation e.g. selenium, or to manually perform this and inspect network tab to see if there is an xhr request you can mimic to submit your choices.
import requests
r = requests.get('https://www.iconfigurators.com/json2/?returnType=json&bypass=true&id=13898&callback=yearObj').json()
years = [item['year'] for item in r['years']]
print(years)
I guess the reason you can't parse the years with beautiful soup is because the 'select' tag containing the 'option' tags with all the years is not present yet/is hidden at the moment when beautiful soup downloads the page. It is added to the DOM by executing additional JavaScript I assume. If you look at the DOM of the loaded page using the developer tools of your browser, for example F12 for Mozilla, you'll see that the tag containing the information you look for is: <select id="icm-years-select"">. If you try to parse for this tag with the object downloaded with beautiful soup, you get an empty list of tag objects:
from bs4 import BeautifulSoup
from requests import get
response = get('https://www.wheelmax.com/')
yourSoup = BeautifulSoup(response.text, "lxml")
print(len(yourSoup.select('div #vehicle-search'))) // length = 1 -> visible
print()
print(len(yourSoup.select('#icm-years-select'))) // length = 0 -> not visible
So if you want to get the years by using Python by all means, I guess you might try to click on the respective tag and then parse again using some combination of requests/beautiful soup/ or the selenium module which will require a bit more digging :-)
Otherwise if you just quickly need the years parsed, use JavaScript:
countYears = document.getElementById('icm-years-select').length;
yearArray = [];
for (i = 0; i < countYears; i++) {yearArray.push(document.getElementById('icm-years-select')[i].value)};
Related
I'm pretty new to web scraping and would appreciate any advice for the scenarios below:
I'm trying to produce a home loans listing table using data from https://www.canstar.com.au/home-loans/
I'm mainly trying to get listings values like the ones below:
Homestar Finance | Star Essentials P&I 80% | Variable
Unloan | Home Loan LVR <80% | Variable
TicToc Home Loans | Live-in Variable P&I | Variable
ubank | Neat Home Loan Owner Occupied P&I 70-80% | Variable
and push them into a nested table
results = [[Homestar Finance, Star Essentials P&I 80%, Variable], etc, etc]
My first attempt, I've used BeautifulSoup entirely and practice on an offline version of the site.
import pandas as pd
from bs4 import BeautifulSoup
with open('/local/path/canstar.html', 'r') as canstar_offline :
content = canstar_offline.read()
results = [['Affiliate', 'Product Name', 'Product Type']]
soup = BeautifulSoup(content, 'lxml')
for listing in soup.find_all('div', class_='table-cards-container') :
for listing1 in listing.find_all('a') :
if listing1.text.strip() != 'More details' and listing1.text.strip() != '' :
results.append(listing1.text.strip().split(' | '))
df = pd.DataFrame(results[1:], columns=results[0]).to_dict('list')
df2 = pd.DataFrame(df)
print(df2)
I pretty much got very close to what I wanted, but unfortunately it doesn't work for the actual site cause it looks like I'm getting blocked for repeated requests.
So I tried this again on Selenium but now I'm stuck.
I tried using as much of the transferrable filtering logic that I used from BS, but I can't get anywhere close to what I had using Selenium.
import time
from selenium.webdriver.common.by import By
url = 'https://www.canstar.com.au/home-loans'
results = []
driver = webdriver.Chrome()
driver.get(url)
# content = driver.page_source
# soup = BeautifulSoup(content)
time.sleep(3)
tables = driver.find_elements(By.CLASS_NAME, 'table-cards-container')
for table in tables :
listing = table.find_element(By.TAG_NAME, 'a')
print(listing.text)
This version (above) only returns one listing (I'm trying to get the entire table through iteration)
import time
from selenium.webdriver.common.by import By
url = 'https://www.canstar.com.au/home-loans'
results = []
driver = webdriver.Chrome()
driver.get(url)
# content = driver.page_source
# soup = BeautifulSoup(content)
time.sleep(3)
tables = driver.find_elements(By.CLASS_NAME, 'table-cards-container')
for table in tables :
# listing = table.find_element(By.TAG_NAME, 'a')
print(table.text)
This version (above) looks like it gets all the text from the 'table-cards-container' class, but I'm unable to filter through it to just get the listings.
I think you can try something like this, I hope the comments in the code explain what it is doing.
# Needed libs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Initiate the driver and navigate
driver = webdriver.Chrome()
url = 'https://www.canstar.com.au/home-loans'
driver.get(url)
# We save the loans list
loans = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//cnslib-table-card")))
# We make a loop once per loan in the loop
for i in range(1, len(loans)):
# With this Xpath I save the title of the loan
loan_title = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//a)[1]"))).text
print(loan_title)
# With this Xpath I save the first percentaje we see for the loan
loan_first_percentaje = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[1]"))).text
print(loan_first_percentaje)
# With this Xpath I save the second percentaje we see for the loan
loan_second_percentaje = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[3]"))).text
print(loan_second_percentaje)
# With this Xpath I save the amount we see for the loan
loan_amount = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[5]"))).text
print(loan_amount)
Im very new to this, but I have an idea for a website and I want to give it a good go, my aim is to scrape the Asda website for prices and products, more specifically in this case whiskey. I want to grab the name and price of all the whiskey on the Asda website and put it into a nice table on my website, however I am having problems doing so, my code so far is getting syntax error, can anyone help?
the code so far is..
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650')
res = driver.execute_script('return document.documentElement.outerHTML')
html_soup = BeautifulSoup(res, 'html.parser')
type(html_soup)
driver.quit
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650'
whiskey_container = html_soup.find('div', {'class': 'co-product-lazy-container'})
for whiskey in whiskey_container:
name = whiskey.find('a', {'class': 'co-product__anchor'})
price = whiskey.find('div', {'class': 'co-product__price'})
print(name, price)
Try it:
# for wait time better than time.sleep()
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
import time # or WebDriverWait
import csv # for saving data in table
# save csv file
def save_csv(dct):
'''
dct - dictionary with our data:
"cap",
"title",
"price"
'''
name = "file.csv" # file name, it can choice what you want
print("[INFO] saving...") # for see that function works
with open(name, 'a', encoding="utf-8") as f: # open file for writing "a"
# this need for writing data to table
writer = csv.writer(f)
writer.writerow((dct['cap'],
dct['title'],
dct['price'],
))
def scroll(driver):
# for open all interesting us data
for i in range(1,6):
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
driver.execute_script("window.scrollTo(0, 1000)")
time.sleep(7)
driver = webdriver.Firefox()
driver.get("https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650?facets=shelf%3A1579926650%3A0000&nutrition=&sortBy=&page=0")
for i in range(2): # 2 because we have only two page with data
element = WebDriverWait(driver, 30) # or time.sleep(30)
scroll(driver) # for open all interesting us data
# get all data to one list in beautifulsoup type
data = driver.find_elements_by_css_selector(".co-lazy-product-container .co-item")
# iterating interesting data and create dictionary with data
for d in data:
items = {}
body = d.text.split("\n")
items["cap"] = body[0]
items["title"] = body[1]
items["price"] = body[-2]
save_csv(items)
# pagination
driver.find_element_by_css_selector(".co-pagination__last-page").click()
# close driver
driver.quit()
you have syntax error, you have ")" missing :
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650'
it should be :
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650')
--
btw your code won't work. you have couple of logical errors.
and I doubt you can scrape that page with your current code.
I'm scraping this website using Python and Selenium. I have the code working but it currently only scrapes the first page, I would like to iterate through all the pages and scrape them all but they handle pagination in a weird way how would I go through the pages and scrape them one by one?
Pagination HTML:
<div class="pagination">
First
Prev
1
<span class="current">2</span>
3
4
Next
Last
</div>
Scraper:
import re
import json
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
options = Options()
# options.add_argument('--headless')
options.add_argument("start-maximized")
options.add_argument('disable-infobars')
driver=webdriver.Chrome(chrome_options=options,
executable_path=r'/Users/weaabduljamac/Downloads/chromedriver')
url = 'https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList'
driver.get(url)
def getData():
data = []
rows = driver.find_element_by_xpath('//*[#id="form1"]/table/tbody').find_elements_by_tag_name('tr')
for row in rows:
app_number = row.find_elements_by_tag_name('td')[1].text
address = row.find_elements_by_tag_name('td')[2].text
proposals = row.find_elements_by_tag_name('td')[3].text
status = row.find_elements_by_tag_name('td')[4].text
data.append({"CaseRef": app_number, "address": address, "proposals": proposals, "status": status})
print(data)
return data
def main():
all_data = []
select = Select(driver.find_element_by_xpath("//select[#class='formitem' and #id='selWeek']"))
list_options = select.options
for item in range(len(list_options)):
select = Select(driver.find_element_by_xpath("//select[#class='formitem' and #id='selWeek']"))
select.select_by_index(str(item))
driver.find_element_by_css_selector("input.formbutton#csbtnSearch").click()
all_data.extend( getData() )
driver.find_element_by_xpath('//*[#id="form1"]/div[3]/a[4]').click()
driver.get(url)
with open( 'wiltshire.json', 'w+' ) as f:
json.dump( all_data, f )
driver.quit()
if __name__ == "__main__":
main()
Before moving on to automating any scenario, always write down the manual steps you would perform to execute the scenario. Manual steps for what you want to (which I understand from the question) is -
1) Go to site - https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList
2) Select first week option
3) Click search
4) Get the data from every page
5) Load the url again
6) Select second week option
7) Click search
8) Get the data from every page
.. and so on.
You are having a loop to select different weeks but inside each loop iteration for the week option, you also need to include a loop to iterate over all the pages. Since you are not doing that, your code is returning only the data from the first page.
Another problem is with how you are locaing the 'Next' button -
driver.find_element_by_xpath('//*[#id="form1"]/div[3]/a[4]').click()
You are selecting the 4th <a> element which is ofcourse not robust because in different pages, the Next button's index will be different. Instead, use this better locator -
driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
Logic for creating loop which will iterate through pages -
First you will need the number of pages. I did that by locating the <a> immediately before the "Next" button. As per the screenshot below, it is clear that the text of this element will be equal to the number of pages -
-
I did that using following code -
number_of_pages = int(driver.find_element_by_xpath("//a[contains(text(),'Next')]/preceding-sibling::a[1]").text)
Now once you have number of pages as number_of_pages, you only need to click "Next" button number_of_pages - 1 times!
Final code for your main function-
def main():
all_data = []
select = Select(driver.find_element_by_xpath("//select[#class='formitem' and #id='selWeek']"))
list_options = select.options
for item in range(len(list_options)):
select = Select(driver.find_element_by_xpath("//select[#class='formitem' and #id='selWeek']"))
select.select_by_index(str(item))
driver.find_element_by_css_selector("input.formbutton#csbtnSearch").click()
number_of_pages = int(driver.find_element_by_xpath("//a[contains(text(),'Next')]/preceding-sibling::a[1]").text)
for j in range(number_of_pages - 1):
all_data.extend(getData())
driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
time.sleep(1)
driver.get(url)
with open( 'wiltshire.json', 'w+' ) as f:
json.dump( all_data, f )
driver.quit()
Following approach is simply worked for me.
driver.find_element_by_link_text("3").click()
driver.find_element_by_link_text("4").click()
....
driver.find_element_by_link_text("Next").click()
first get the total pages in the pagination, using
ins.get('https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList/10702380,1')
ins.find_element_by_class_name("pagination")
source = BeautifulSoup(ins.page_source)
div = source.find_all('div', {'class':'pagination'})
all_as = div[0].find_all('a')
total = 0
for i in range(len(all_as)):
if 'Next' in all_as[i].text:
total = all_as[i-1].text
break
Now just loop through the range
for i in range(total):
ins.get('https://services.wiltshire.gov.uk/PlanningGIS/LLPG/WeeklyList/10702380,{}'.format(count))
keep incrementing the count and get the source code for the page and then get the data for it.
Note: Don't forget the sleep when clicking on going form one page to another
I wrote some code to extract Insider Trading information from the Toronto Stock Exchange. Using Selenium, open up this link and then, using a list of stocks, one by one input each into the form, retrieve the data and put it into another list, then do the same for the next stock.
Here is the code:
from selenium import webdriver
stocks = ['RKN','MG','GTE','IMO','REI.UN','RY']
dt = []
url = 'https://app.tmxmoney.com/research/insidertradesummaries?locale=EN'
driver = webdriver.Firefox()
driver.get(url)
search = driver.find_element_by_xpath('//ul[#class="nav nav-pills"]/li[3]')
search.click()
stock_form = driver.find_element_by_name('text')
for stock in stocks:
stock_form.clear()
stock_form.send_keys(stock)
stock_form.submit()
data = driver.find_element_by_xpath('//div[#class="insider-trades-symbol-search-container"]/div[#class="ng-binding"]')
a = data.text.split('\n')
if len(a) > 1:
dt.append(a[-1].split())
else:
dt.append([])
driver.close()
If you run the code, you can see each stock being input into the form, the data will pop up and I attempt to retrieve it. However, when I get the text from "data", its as if its retrieved from what was visible on the page prior to submitting the form. I tried adding a wait to the code to no avail.
added a time.sleep(1) and the code works as intended.
from selenium import webdriver
import time
stocks = ['RKN','MG','GTE','IMO','REI.UN','RY']
dt = []
url = 'https://app.tmxmoney.com/research/insidertradesummaries?locale=EN'
driver = webdriver.Firefox()
driver.get(url)
search = driver.find_element_by_xpath('//ul[#class="nav nav-pills"]/li[3]')
search.click()
stock_form = driver.find_element_by_name('text')
for stock in stocks:
stock_form.clear()
stock_form.send_keys(stock)
stock_form.submit()
data = driver.find_element_by_xpath('//div[#class="insider-trades-symbol-search-container"]/div[#class="ng-binding"]')
a = data.text.split('\n')
**time.sleep(1)**
if len(a) > 1:
dt.append(a[-1].split())
else:
dt.append([])
driver.close()
I have a list of search criteria saved in a csv file. I'd like to loop through each search criteria to generate the corresponding search results on a website. For each set of search results generated (which are links), I'd like to click into the link and then grab the data from the new page generated. Unfortunately, I am experiencing problems going into each link. If anyone could please kindly provide some insight, it would be much appreciated.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
# read list of CAS Numbers to be searched
data = pd.read_csv("NPRI CACs.csv", names=["CAS Number", "Chemical Name"])
data.dropna()
CAS = data["CAS Number"]
# Parameters to be called
url = 'http://www.lifelabs.msdss.com/Login.aspx?ReturnUrl=%2fMainMenu.aspx%3ffm%3d0%26tb%3d0'
# Sign into SafeTec
browser = webdriver.Firefox()
browser.get(url)
browser.find_element_by_class_name("text").click()
# Conduct MSDS Searches on SafeTec
for i in range(10):
try:
Ingredient_CAS_Number = browser.find_element_by_id("placeBody_dynField48_txtTextBox")
Ingredient_CAS_Number.send_keys(CAS[i])
browser.find_element_by_id("placeBody_linkSearchBottom").click()
list_links = browser.find_elements_by_css_selector("a[href*='MSDSDetail']")
links = []
for j in range(len(list_links)):
links.append(list_links[j].get_attribute('href'))
Product_Name = []
for link in links:
browser.get(link)
product = browser.find_element_by_id("placeBody_dynField1_txtTextBox")
Product_Name.append(product)
print(Product_Name)
browser.get(url)
except:
print(CAS[i])
continue
I managed to solve this with the code below. Although, the solution is a little inelegant...
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
# read list of CAS Numbers to be searched
data = pd.read_csv("NPRI CACs.csv", names=["CAS Number", "Chemical Name"])
data.dropna()
CAS = data["CAS Number"]
# Parameters to be called
url = 'http://www.lifelabs.msdss.com/Login.aspx?ReturnUrl=%2fMainMenu.aspx%3ffm%3d0%26tb%3d0'
# Sign into SafeTec
browser = webdriver.Firefox()
browser.get(url)
browser.find_element_by_class_name("text").click()
# Conduct MSDS Searches on SafeTec
for i in range(2):
Ingredient_CAS_Number = browser.find_element_by_id("placeBody_dynField48_txtTextBox")
Ingredient_CAS_Number.send_keys(CAS[i])
browser.find_element_by_id("placeBody_linkSearchBottom").click()
list_links = browser.find_elements_by_css_selector("a[href*='MSDSDetail']")
all_results = []
for j in list_links:
result = j.text
all_results.append(result)
for i in range(len(all_results)):
browser.find_element_by_link_text(all_results[i]).click()
browser.back()
browser.get(url)