I'm pretty new to web scraping and would appreciate any advice for the scenarios below:
I'm trying to produce a home loans listing table using data from https://www.canstar.com.au/home-loans/
I'm mainly trying to get listing values like the ones below:
Homestar Finance | Star Essentials P&I 80% | Variable
Unloan | Home Loan LVR <80% | Variable
TicToc Home Loans | Live-in Variable P&I | Variable
ubank | Neat Home Loan Owner Occupied P&I 70-80% | Variable
and push them into a nested table
results = [[Homestar Finance, Star Essentials P&I 80%, Variable], etc, etc]
For my first attempt, I used BeautifulSoup entirely and practised on an offline version of the site.
import pandas as pd
from bs4 import BeautifulSoup

with open('/local/path/canstar.html', 'r') as canstar_offline:
    content = canstar_offline.read()

results = [['Affiliate', 'Product Name', 'Product Type']]
soup = BeautifulSoup(content, 'lxml')

for listing in soup.find_all('div', class_='table-cards-container'):
    for listing1 in listing.find_all('a'):
        if listing1.text.strip() != 'More details' and listing1.text.strip() != '':
            results.append(listing1.text.strip().split(' | '))

df = pd.DataFrame(results[1:], columns=results[0]).to_dict('list')
df2 = pd.DataFrame(df)
print(df2)
I got very close to what I wanted, but unfortunately it doesn't work for the live site because it looks like I'm getting blocked for repeated requests.
So I tried this again with Selenium, but now I'm stuck.
I tried to reuse as much of the filtering logic from the BeautifulSoup version as possible, but I can't get anywhere close to what I had.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://www.canstar.com.au/home-loans'
results = []

driver = webdriver.Chrome()
driver.get(url)
# content = driver.page_source
# soup = BeautifulSoup(content)

time.sleep(3)
tables = driver.find_elements(By.CLASS_NAME, 'table-cards-container')
for table in tables:
    listing = table.find_element(By.TAG_NAME, 'a')
    print(listing.text)
This version (above) only returns one listing (I'm trying to get the entire table through iteration)
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://www.canstar.com.au/home-loans'
results = []

driver = webdriver.Chrome()
driver.get(url)
# content = driver.page_source
# soup = BeautifulSoup(content)

time.sleep(3)
tables = driver.find_elements(By.CLASS_NAME, 'table-cards-container')
for table in tables:
    # listing = table.find_element(By.TAG_NAME, 'a')
    print(table.text)
This version (above) looks like it gets all the text from the 'table-cards-container' class, but I'm unable to filter through it to just get the listings.
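One hedged way to combine the two attempts (untested against the live site): keep the Selenium loop from the second snippet, but switch to find_elements (plural) for the anchors and reuse the exact filter from the BeautifulSoup version. The class name 'table-cards-container' and the ' | ' separator are taken from the code above; whether the anchor text on the live page still uses ' | ' (rather than newlines) is an assumption.
# Sketch only: assumes each product card still exposes its title in an <a> element.
results = [['Affiliate', 'Product Name', 'Product Type']]
for table in driver.find_elements(By.CLASS_NAME, 'table-cards-container'):
    for link in table.find_elements(By.TAG_NAME, 'a'):  # find_elements (plural), not find_element
        text = link.text.strip()
        if text and text != 'More details':  # same filter as the BeautifulSoup attempt
            results.append(text.split(' | '))
print(results)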
I think you can try something like this; I hope the comments in the code explain what it is doing.
# Needed libs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Initiate the driver and navigate
driver = webdriver.Chrome()
url = 'https://www.canstar.com.au/home-loans'
driver.get(url)

# We save the loans list
loans = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//cnslib-table-card")))

# We loop once per loan (XPath indices are 1-based, so go up to len(loans) inclusive)
for i in range(1, len(loans) + 1):
    # With this XPath I get the title of the loan
    loan_title = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//a)[1]"))).text
    print(loan_title)
    # With this XPath I get the first percentage we see for the loan
    loan_first_percentage = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[1]"))).text
    print(loan_first_percentage)
    # With this XPath I get the second percentage we see for the loan
    loan_second_percentage = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[3]"))).text
    print(loan_second_percentage)
    # With this XPath I get the amount we see for the loan
    loan_amount = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[5]"))).text
    print(loan_amount)
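If you want the nested-list format from the question rather than printed output, a small follow-up sketch using the same loans list and waits as above. It assumes the anchor text still carries the ' | ' separator the way the question's BeautifulSoup run did; if Selenium returns newlines instead, split on '\n'.
# Collect the card titles into the nested list the question asked for.
results = [['Affiliate', 'Product Name', 'Product Type']]
for i in range(1, len(loans) + 1):
    title = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//a)[1]"))).text
    results.append(title.split(' | '))  # e.g. ['Homestar Finance', 'Star Essentials P&I 80%', 'Variable']
print(results)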
Related
I would like to do web scraping with Selenium on all pages of the website below, but so far I've only managed the first page. I also put the data into a Pandas DataFrame. How can I do this for every page of the website? For now, I have:
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome(executable_path=r"C:/Users/Usuario/.spyder-py3/chromedriver.exe")
driver.get("https://www.mercadolivre.com.br/ofertas")
driver.implicitly_wait(3)

tituloProduto = driver.find_elements_by_class_name('promotion-item__title')
precoProduto = driver.find_elements_by_class_name('promotion-item__price')

df = pd.DataFrame()
produtos = []
for x in tituloProduto:
    produtos.append(x.text)
preco = []
for x in precoProduto:
    preco.append(x.text)

df['produto'] = produtos
df['preco'] = preco
df.head()
produto preco
Furadeira Parafusadeira Com Impacto 20v 2 Bate... R$ 34232
Sony Playstation 4 Slim 1tb Mega Pack: Ghost O... R$ 2.549
Tablet Galaxy A7 Lite T225 4g Ram 64gb Grafite... R$ 1.199
Smart Tv Philco Ptv55q20snbl Dled 4k 55 110v/220v R$ 2.799
Nintendo Switch 32gb Standard Cor Vermelho-néo... R$ 2.349
I found that the website you want to scrape has 209 pages in total and that each page can be accessed by number, e.g. https://www.mercadolivre.com.br/ofertas?page=2, so it shouldn't be too difficult.
One thing you can do is loop 209 times to get the data from each page. A better approach would be to identify the "next page" button and loop until it's unavailable, but simply using the known page count (209) is easier, so I'll use that.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

driver = webdriver.Chrome(executable_path=r".../chromedriver.exe")
...

# Initialize outside the loop
preco = []
produtos = []

for i in range(1, 210):  # pages 1 to 209
    # Parse each page with the code you already have.
    driver.get('https://www.mercadolivre.com.br/ofertas?page=' + str(i))

    # You may have to wait for each page to load
    wait = WebDriverWait(driver, 10)
    wait.until(ec.visibility_of_element_located((By.CSS_SELECTOR, "a.sc-2vbwj7-22.blyzsR")))

    # If you want to speed things up, you can process the pages in parallel,
    # but only do that if it's worth the extra development time.

    # Get the elements you want
    tituloProduto = driver.find_elements_by_class_name('promotion-item__title')
    precoProduto = driver.find_elements_by_class_name('promotion-item__price')
    for x in tituloProduto:
        produtos.append(x.text)
    for x in precoProduto:
        preco.append(x.text)
Then store the lists in a DataFrame and do what you want with it.
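For completeness, a minimal sketch of that last step; the column names produto/preco are simply carried over from the question's code.
import pandas as pd

# Build the DataFrame from the two lists collected in the loop above.
df = pd.DataFrame({'produto': produtos, 'preco': preco})
print(df.head())
# df.to_csv('ofertas.csv', index=False)  # optionally persist the results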
You can use this code.
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome(
    executable_path=r"C:/Users/Usuario/.spyder-py3/chromedriver.exe")
url = "https://www.mercadolivre.com.br/ofertas?page="

df = pd.DataFrame()
produtos = []
preco = []

for i in range(1, 210):  # pages 1 to 209
    driver.get(url + str(i))
    driver.implicitly_wait(3)
    tituloProduto = driver.find_elements_by_class_name('promotion-item__title')
    precoProduto = driver.find_elements_by_class_name('promotion-item__price')
    for x in tituloProduto:
        produtos.append(x.text)
    for x in precoProduto:
        preco.append(x.text)

df['produto'] = produtos
df['preco'] = preco
print(df)
Hope this is helpful for you. Thanks.
What you could do, if you're using Scrapy, is find the pagination link and assign it to a next_page variable like so:
next_page = response.xpath('XPATH HERE').css('a::attr(href)').extract_first()
and then call it like so:
yield scrapy.Request(next_page, callback=self.parse)
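For context, a minimal sketch of where those two lines would sit in a Scrapy spider. The spider name, the ".promotion-item" card selector, and the pagination selector are illustrative assumptions; only the title/price class names come from the question.
import scrapy

class OfertasSpider(scrapy.Spider):
    # Hypothetical spider: name and selectors are illustrative only.
    name = "ofertas"
    start_urls = ["https://www.mercadolivre.com.br/ofertas"]

    def parse(self, response):
        for item in response.css(".promotion-item"):
            yield {
                "produto": item.css(".promotion-item__title::text").get(),
                "preco": item.css(".promotion-item__price::text").get(),
            }
        # Follow the pagination link, if there is one.
        next_page = response.css("a.andes-pagination__link::attr(href)").get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)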
I'm very new to this, but I have an idea for a website and I want to give it a good go. My aim is to scrape the Asda website for prices and products, more specifically whisky in this case. I want to grab the name and price of all the whisky on the Asda website and put it into a nice table on my own site. However, I'm having problems doing so; my code so far is giving a syntax error. Can anyone help?
The code so far is:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650')
res = driver.execute_script('return document.documentElement.outerHTML')
html_soup = BeautifulSoup(res, 'html.parser')
type(html_soup)
driver.quit

response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650'

whiskey_container = html_soup.find('div', {'class': 'co-product-lazy-container'})
for whiskey in whiskey_container:
    name = whiskey.find('a', {'class': 'co-product__anchor'})
    price = whiskey.find('div', {'class': 'co-product__price'})
    print(name, price)
Try it:
# WebDriverWait is a better way to wait than time.sleep()
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
import time  # could be replaced by WebDriverWait
import csv  # for saving the data as a table

# save one row of data to the csv file
def save_csv(dct):
    '''
    dct - dictionary with our data:
        "cap",
        "title",
        "price"
    '''
    name = "file.csv"  # file name, you can choose whatever you want
    print("[INFO] saving...")  # so we can see that the function ran
    with open(name, 'a', encoding="utf-8") as f:  # open the file in append mode ("a")
        # write the row to the table
        writer = csv.writer(f)
        writer.writerow((dct['cap'],
                         dct['title'],
                         dct['price'],
                         ))

def scroll(driver):
    # scroll down so that all the products we are interested in get loaded
    for i in range(1, 6):
        # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        driver.execute_script("window.scrollTo(0, 1000)")
        time.sleep(7)

driver = webdriver.Firefox()
driver.get("https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650?facets=shelf%3A1579926650%3A0000&nutrition=&sortBy=&page=0")

for i in range(2):  # 2 because there are only two pages of data
    element = WebDriverWait(driver, 30)  # or time.sleep(30)
    scroll(driver)  # load all the products we are interested in

    # get all the product cards into one list
    data = driver.find_elements_by_css_selector(".co-lazy-product-container .co-item")

    # iterate over the cards and build a dictionary for each one
    for d in data:
        items = {}
        body = d.text.split("\n")
        items["cap"] = body[0]
        items["title"] = body[1]
        items["price"] = body[-2]
        save_csv(items)

    # pagination
    driver.find_element_by_css_selector(".co-pagination__last-page").click()

# close driver
driver.quit()
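If the goal is the "nice table" from the question, one hedged follow-up is to load the CSV written by save_csv into pandas. The file name and column order are taken from the code above; pandas itself is an assumption here, since it isn't imported anywhere in this answer.
import pandas as pd

# file.csv has no header row, so name the columns in the same order save_csv wrote them
df = pd.read_csv("file.csv", names=["cap", "title", "price"])
print(df.head())
# df.to_html("whisky_table.html", index=False)  # e.g. render it as an HTML table for the website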
You have a syntax error: a ")" is missing here:
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650'
It should be:
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650')
--
By the way, your code won't work even after that; it has a couple of logical errors, and I doubt you can scrape that page with your current approach.
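To be concrete about one of those logical errors: html_soup.find('div', ...) returns a single Tag, so iterating over it walks that tag's direct children rather than the product cards, and the separate requests.get call fetches HTML that is never parsed. A hedged sketch of a cleaner structure, reusing only class names that already appear in this thread (whether those classes still match the live Asda page is an assumption):
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650')
html_soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()  # note the parentheses: driver.quit alone does nothing

# find_all returns every matching product element, not just the first container
for whiskey in html_soup.find_all('div', {'class': 'co-item'}):
    name = whiskey.find('a', {'class': 'co-product__anchor'})
    price = whiskey.find('div', {'class': 'co-product__price'})
    if name and price:
        print(name.get_text(strip=True), price.get_text(strip=True))
Note the page loads products lazily, so you will probably still need the scrolling and waiting from the answer above before grabbing page_source.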
I want to get all the results from a race. The website shows 50 rows/page.
I navigate to the next page (same URL with suffix #page-x) using selenium, but I get a StaleElementReferenceException error whenever I try to find elements (cells of the table = td) on the next page.
I tried to close the driver between the steps to get just one list of elements at a time. I've also tried to load the pages separately with the URL+suffix, but it doesn't load correctly. I've tried building separate lists (at first I wanted one big list with all the results).
from selenium import webdriver

url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"

# The block below works well and I get a list of cells as intended.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)
elements = driver.find_elements_by_tag_name("td")
course = []
for i in range(len(elements)):
    course.append(elements[i].text)

to_2 = driver.find_element_by_link_text("2")
to_2.click()
print(driver.current_url)

# I'm trying similar code for the next chunk, but it doesn't work.
elements2 = driver.find_elements_by_tag_name("td")
print(len(elements2))
print(elements2[5].text)
course2 = []
for i in range(len(elements2)):
    course2.append(elements2[i].text)
driver.close()
I would expect a new list (course2), with the results of the second page, but I get a stale element error. When I print the current URL, the result is as expected. When I print the len(elements2), it's also OK. Looks like the problem is when I try to get the text of an element.
Solution-1:
Using BeautifulSoup and Selenium. WebDriverWait waits for a certain condition to occur before proceeding further in the code; see the BeautifulSoup documentation for more details on the parsing side.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)

data = []
while True:
    course = []
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "tableJustrun")))
    page_soup = BeautifulSoup(driver.page_source, 'lxml')
    # get table data
    tbody = page_soup.find("tbody", {"id": "searchResultBoxParticipants"})
    rows = tbody.find_all("tr")
    for row in rows:
        rowData = []
        for td in row.find_all("td"):
            rowData.append(td.text)
        course.append(rowData)
    data.append(course)

    try:
        pagination = driver.find_element_by_class_name("simple-pagination")
        next_page = pagination.find_element_by_link_text("Suivant")
        # go to the next page
        next_page.click()
    except Exception as e:
        break

print(data)
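If you want the nested page/row structure flattened into a single table at the end, a small hedged follow-up (pandas is an assumption here; Solution-1 itself doesn't use it):
import pandas as pd

# data is a list of pages, each page a list of rows; flatten it into one frame.
all_rows = [row for page in data for row in page]
results_df = pd.DataFrame(all_rows)
print(results_df.shape)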
Solution-2:
Using the pandas library.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)

data = []
while True:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "tableJustrun")))
    tables = pd.read_html(driver.page_source)
    # append the Participants table data
    data.append(tables[0])

    try:
        pagination = driver.find_element_by_class_name("simple-pagination")
        next_page = pagination.find_element_by_link_text("Suivant")
        # go to the next page
        next_page.click()
    except Exception as e:
        break

# Concatenate the DataFrame objects
result = pd.concat(data)
print(result)
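One hedged note on the result: pd.concat keeps each page's original row index, so if you need a clean continuous index or a CSV on disk, a follow-up like this would do it (the output file name is just an example):
# Reset the per-page indices and persist the combined table.
result = result.reset_index(drop=True)
result.to_csv("race_results.csv", index=False)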
I am trying to make a scraping application for Hants.gov.uk, and right now I am working on just clicking through the pages rather than scraping. When it got to the last row on page 1 it simply stopped, so I made it click the "Next Page" button, but first it has to go back to the original URL. It clicks page 2, but after page 2 is scraped it doesn't go on to page 3; it just restarts page 2.
Can somebody help me fix this issue?
Code:
import time
import config  # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"

driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()

def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result = []
    for link in links:
        if link not in result:
            result.append(link)
        else:
            driver.get(link)
            goUrl = urllib.request.urlopen(link)
            soup = BeautifulSoup(goUrl.read(), "html.parser")
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            for i in range(20):
                pass  # Don't worry about all this commented code, it isn't relevant right now
                #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
                #print(table.text)
                # div = soup.select("div.applicationDetails")
                # getDiv = div[i].split(":")[1].get_text()
                # log = open("log.txt", "a")
                # log.write(getDiv + "\n")
            #log.write("\n")

start()
driver.get(url)

for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    url = driver.current_url
    start()
    driver.get(url)

driver.close()
Try this:
import time
# import config  # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"

driver = webdriver.Chrome()
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()

result = []

def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result.extend(links)

def start2():
    for link in result:
        # if link not in result:
        #     result.append(link)
        # else:
        driver.get(link)
        goUrl = urllib.request.urlopen(link)
        soup = BeautifulSoup(goUrl.read(), "html.parser")
        #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
        for i in range(20):
            pass  # Don't worry about all this commented code, it isn't relevant right now
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            #print(table.text)
            # div = soup.select("div.applicationDetails")
            # getDiv = div[i].split(":")[1].get_text()
            # log = open("log.txt", "a")
            # log.write(getDiv + "\n")
        #log.write("\n")

while True:
    start()
    element = driver.find_element_by_class_name('rdpPageNext')
    try:
        check = element.get_attribute('onclick')
        if check != "return false;":
            element.click()
        else:
            break
    except:
        break

print(result)
start2()
driver.get(url)
As per the URL https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True, to click through all the pages you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "mainContentPlaceHolder_btnAccept"))).click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#ctl00_mainContentPlaceHolder_lvResults_topPager div.rdpWrap.rdpNumPart>a"))))
print(numLinks)
for i in range(numLinks):
    print("Perform your scraping here on page {}".format(str(i+1)))
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='ctl00_mainContentPlaceHolder_lvResults_topPager']//div[@class='rdpWrap rdpNumPart']//a[@class='rdpCurrentPage']/span//following::span[1]"))).click()
driver.quit()
Console Output:
8
Perform your scraping here on page 1
Perform your scraping here on page 2
Perform your scraping here on page 3
Perform your scraping here on page 4
Perform your scraping here on page 5
Perform your scraping here on page 6
Perform your scraping here on page 7
Perform your scraping here on page 8
Hi @Feitan Portor, you have written the code perfectly. The only reason you are redirected back to the first page is that you set url = driver.current_url in the last for loop: the URL stays static, and it is only the JavaScript that triggers the next-click event. So just remove url = driver.current_url and driver.get(url) from that loop and you are good to go; I have tested it myself.
Also, to know which page your scraper is currently on, just add this inside the for loop:
ss = driver.find_element_by_class_name('rdpCurrentPage').text
print(ss)
Hope this solves your confusion.
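Putting that advice together, the corrected final loop might look like the sketch below. It is based on the question's own code, with url = driver.current_url and driver.get(url) removed and the current-page print added; the fixed range(5) is kept exactly as the question had it.
# Corrected final loop (sketch): the Next button's JavaScript drives the pagination.
start()
for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    ss = driver.find_element_by_class_name('rdpCurrentPage').text  # current page indicator
    print(ss)
    start()
driver.close()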
I am scraping this website:
https://login.aviva.com.sg/directinsurance/homeinsurance.htm
I want to fill all the form elements from a CSV file (we will call it "profil"). When I fill the form with a single "profil", everything works fine. But when I loop over the different profiles I have multiple problems:
sometimes I can't "get" the full address from the postal code (see below), and so I can't get the final quotation
the driver refreshes, but I don't get the quote for the different people in profil.
Here is an example of the profil data that I have:
profil = [["MRS ","Corinne","SIMON","F","M","600 ","No, for myself and my family","72603190","2017-H1","CO ","Ridout Road","10","91 - 124","27 - 38","099197","S4553141D","1958","5","1"],
["MS ","Corinne","MOREAU","F","D","610 ","Yes, for myself","63856280","2017-H1","CO ","Stevens","10","38 - 208","24 - 40","099198","S9186686B","1999","10","1"],
["MDM ","Corinne","DUBOIS","F","W","620 ","Yes,for my family","71852991","2017-H1","CO ","Stevens","10","38 - 208","24 - 40","099200","S2243858A","1974","2","1"]
]
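Since the profiles ultimately come from a CSV file, here is a hedged sketch of how that list could be loaded; the file name profil.csv is hypothetical (the snippet above hard-codes the rows instead), and it assumes each CSV row holds one profile in the same column order.
import csv

# Hypothetical loader: one profile per row, same column order as the list above.
with open('profil.csv', newline='', encoding='utf-8') as f:
    profil = [row for row in csv.reader(f)]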
This is the Python code that I have made so far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium.webdriver import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import pandas as pd
import csv
driver = webdriver.Firefox()
driver.get("https://login.aviva.com.sg/directinsurance/homeinsurance.htm")
for people in profil:
    dropdown_salutation = Select(driver.find_element_by_name("person.salutationRef"))
    dropdown_occupation = Select(driver.find_element_by_name("person.occupationRef"))
    dropdown_maritalstatus = Select(driver.find_element_by_name("person.maritalstat"))
    dropdown_gender = Select(driver.find_element_by_name("person.gender"))
    dropdown_dobDay = Select(driver.find_element_by_name("dobDay"))
    dropdown_dobMonth = Select(driver.find_element_by_name("dobMonth"))
    dropdown_dobYear = Select(driver.find_element_by_name("dobYear"))
    dropdown_declaration1 = Select(driver.find_element_by_name("declaration1"))
    dropdown_declaration2 = Select(driver.find_element_by_name("declaration2"))

    # Now we look for all the other elements that we can fill (we select by id first)
    FamilyName_input = driver.find_element_by_id("surname")
    GivenName_input = driver.find_element_by_id("givname")
    NRIC_input = driver.find_element_by_id("nric")
    PostalCode = driver.find_element_by_id("postalCode")
    MobileNo = driver.find_element_by_id("textfield5")
    Email = driver.find_element_by_id("email")

    # Then we fill everything
    dropdown_salutation.select_by_value(people[0])
    GivenName_input.send_keys(people[1])
    FamilyName_input.send_keys(people[2])
    dropdown_gender.select_by_value(people[3])
    dropdown_maritalstatus.select_by_value(people[4])
    dropdown_occupation.select_by_value(people[5])
    MobileNo.send_keys(people[7])
    NRIC_input.send_keys(people[15])
    dropdown_dobYear.select_by_value(people[16])
    dropdown_dobMonth.select_by_value(people[17])
    dropdown_dobDay.select_by_value(people[18])
    Email.send_keys("ada@hotmail.com")
    dropdown_declaration1.select_by_value("Y")
    dropdown_declaration2.select_by_value("Y")
    PostalCode.send_keys(people[14])
    wait = WebDriverWait(driver, 30)

    # Now we can get the full address based on the postal code we provide
    # (here I have a first problem)
    driver.find_element_by_id("btnAddress").click()
    wait = WebDriverWait(driver, 30)
    element = wait.until(EC.element_to_be_clickable((By.ID, 'immediateFamilySaf')))
    dropdown_declaration3 = Select(driver.find_element_by_name("policy.immediateFamilySaf"))
    dropdown_declaration3.select_by_value("N")

    # Now we click on next to move forward to the second page of the form
    Next = driver.find_element_by_css_selector("a[onclick*=checkFirstTab]")
    Next.click()
    UnitNo = driver.find_element_by_css_selector("a[onclick*=proceedNoUnitNo]")
    UnitNo.click()

    # Now we can fill the "cover needed" form
    dropdown_plan = Select(driver.find_element_by_name("homeProd.planTypeRef"))
    dropdown_dwelling = Select(driver.find_element_by_name("homeProd.dwellingTypeRef"))
    dropdown_insureadr = Select(driver.find_element_by_name("homeProd.addressType"))
    dropdown_coverday = Select(driver.find_element_by_name("coverStartDay"))
    dropdown_covermonth = Select(driver.find_element_by_name("coverStartMonth"))
    dropdown_coveryear = Select(driver.find_element_by_name("coverStartYear"))
    dropdown_plan.select_by_value("HI ")
    dropdown_dwelling.select_by_value(people[9])
    dropdown_insureadr.select_by_value("S")
    dropdown_coverday.select_by_value("1")
    dropdown_covermonth.select_by_value("4")
    dropdown_coveryear.select_by_value("2018")

    # Now we can grab the next button and pass to the third tab
    SecondTab = driver.find_element_by_name("_target0")
    SecondTab.click()

    # Now we can grab the quote
    ThirdTab = driver.find_element_by_name("_target1")
    ThirdTab.click()
    time.sleep(3)
    driver.save_screenshot('img' + people[2] + '.png')
    html = driver.page_source

    # We can feed that into Beautiful Soup
    doc = BeautifulSoup(html, "html.parser")
    rows = doc.find('table', id='table-termsofplan').find_all('td', attrs={'class': None})
    premiums = []
    for row in rows:
        # Find the ones that don't have 'style' as an attribute
        if 'style' in row.attrs:
            # Skip it! It's a header or footer row
            pass
        else:
            premium = {
                'type of plan': rows[1].text,
                '12 Months premium': rows[2].text,
                '24 Months premium': rows[3].text,
                '36 Months premium': rows[4].text,
                'Total Premium 12 Months': rows[10].text,
                'Total Premium 24 Months': rows[11].text,
                'Total Premium 36 Months': rows[12].text,
                'Goods and services Tax 12 Months': rows[14].text,
                'Goods and services Tax 24 Months': rows[15].text,
                'Goods and services Tax 36 Months': rows[16].text,
                'Single Payment 12 Months': rows[19].text,
                'Single Payment 24 Months': rows[20].text,
                'Single Payment 36 Months': rows[21].text,
            }
            premiums.append(premium)
    driver.get("https://login.aviva.com.sg/directinsurance/homeinsurance.htm")

driver.close()

import pandas as pd
premium_df = pd.DataFrame(premiums)
premium_df.to_csv("premium.csv", index=False)