I'm trying to scrape data inside a hidden frame; the frame is shown as follows:
<!-- Content of the details tabs here -->
<div id="tabDetail_0" class="tab_content tab_detail" style="display: block;"><iframe id="iframe_0" src="https://www.tmdn.org/tmview/get-detail?st13=GB500000003342197" width="100%" height="600px;" frameborder="0"></iframe></div></div></div> <!-- resultTabs -->
As you can see, there is a link in the HTML. I tried opening a new webdriver instance and navigating to that link to get the data. It worked, but then the website stopped responding, because navigating to these links directly is not allowed, or at least rate-limited.
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import traceback
import time
option = webdriver.ChromeOptions()
chrome_prefs = {}
option.experimental_options["prefs"] = chrome_prefs
chrome_prefs["profile.default_content_settings"] = {"images": 2}
chrome_prefs["profile.managed_default_content_settings"] = {"images": 2}
url ="https://www.tmdn.org/tmview/welcome#"
xlsName = 'D:\\test.xlsx'
records = []
start_time = time.time()
driver = webdriver.Chrome(executable_path=r"D:\Python\chromedriver.exe", chrome_options=option)
driver.get(url)
time.sleep(10)
driver.find_element_by_xpath('//*[@id="buttonBox"]/a').click()
time.sleep(10)
x=-1
try:
    # click advanced search
    driver.find_element_by_name("lnkAdvancedSearch").click()
    time.sleep(5)
    # select Designated territories
    driver.find_element_by_id('DesignatedTerritories').click()
    time.sleep(5)
    TerritoryLabelElements = driver.find_elements_by_css_selector('div.optEUGroupContainer label')
    for elem in TerritoryLabelElements:
        if elem.text == 'United Kingdom':
            elem.click()
    time.sleep(5)
    driver.find_element_by_id('DesignatedTerritories').click()
    time.sleep(5)
    # select from Trade mark offices
    driver.find_element_by_id('SelectedOffices').click()
    time.sleep(5)
    TerritoryLabelElements = driver.find_elements_by_css_selector('div.multiSelectOptions label')
    for elem in TerritoryLabelElements:
        if elem.text == 'GB United Kingdom ( UKIPO )':
            elem.click()
    time.sleep(5)
    driver.find_element_by_id('SelectedOffices').click()
    # Trade mark status
    driver.find_element_by_id('TradeMarkStatus').click()
    time.sleep(5)
    TerritoryLabelElements = driver.find_elements_by_css_selector('div.multiSelectOptions label')
    for elem in TerritoryLabelElements:
        if elem.text == 'Filed':
            elem.click()
        if elem.text == 'Registered':
            elem.click()
    time.sleep(5)
    driver.find_element_by_id('TradeMarkStatus').click()
    # dates
    startdate = driver.find_element_by_id("ApplicationDateFrom")
    startdate.clear()
    startdate.send_keys('01-10-2018')
    enddate = driver.find_element_by_id("ApplicationDateTo")
    enddate.clear()
    enddate.send_keys('31-10-2018')
    # click search
    time.sleep(5)
    driver.find_element_by_id("SearchCopy").click()
    time.sleep(5)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    tbl = soup.find("table", id="grid")
    driver.find_element_by_link_text('100').click()
    time.sleep(5)
    # LOOP over the result pages
    for i in range(1, 73):
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        tbl = soup.find("table", id="grid")
        # extract data from table using soup
        tr_rows = tbl.find_all('tr')
        for tr_row in tr_rows[1:]:
            td_cells = tr_row.find_all('td')
            Trade_mark_name = td_cells[4].text
            Trade_mark_office = td_cells[5].text
            Designated_territory = td_cells[6].text
            Application_number = td_cells[7].text
            Registration_number = td_cells[8].text
            Trade_mark_status = td_cells[9].text
            Trade_mark_type = td_cells[13].text
            Applicant_name = td_cells[11].text
            Nice_class = td_cells[10].text
            Application_date = td_cells[12].text
            Registration_date = td_cells[14].text
            x = x + 1
            # click individual links
            el = driver.find_elements_by_class_name('cell_tmName_column')[x]
            action = webdriver.common.action_chains.ActionChains(driver)
            action.move_to_element_with_offset(el, 0, 0)
            action.click()
            action.perform()
            time.sleep(3)
            # switch to iframe of tab details
            iframe = driver.find_elements_by_tag_name('iframe')[0]
            driver.switch_to.frame(iframe)
            # get data from iframe
            html2 = driver.page_source
            soup2 = BeautifulSoup(html2, 'html.parser')
            tblOwner = soup2.find("div", id="anchorOwner").find_next('table')
            tblRep = soup2.find("div", id="anchorRepresentative").find_next('table')
            # then switch back:
            driver.switch_to.default_content()
            try:
                Owner_Address = tblOwner.find("td", text="Address").find_next('td')
            except:
                Owner_Address = 'No Entry'
            try:
                Representative_Name = tblRep.find("td", text="Name").find_next('td').text.strip()
            except:
                Representative_Name = 'No Entry'
            records.append((Designated_territory, Applicant_name, Trade_mark_name, Application_date, Application_number, Trade_mark_type, Nice_class, Owner_Address, Trade_mark_office, Registration_number, Trade_mark_status, Registration_date, Representative_Name))
            time.sleep(1)
            driver.find_elements_by_css_selector('a.close_tab')[0].click()
        # navigate to the next page of results
        driver.find_element_by_id('next_t_grid_toppager').click()
        time.sleep(2)
        x = -1
    # end of LOOP
    df = pd.DataFrame(records, columns=['Designated_territory', 'Applicant_name', 'Trade_mark_name', 'Application_date', 'Application_number', 'Trade_mark_type', 'Nice_class', 'Owner_Address', 'Trade_mark_office', 'Registration_number', 'Trade_mark_status', 'Registration_date', 'Representative_Name'])
    df.to_excel(xlsName, sheet_name='sheet1', index=False, encoding='utf-8')
except Exception:
    df = pd.DataFrame(records, columns=['Designated_territory', 'Applicant_name', 'Trade_mark_name', 'Application_date', 'Application_number', 'Trade_mark_type', 'Nice_class', 'Owner_Address', 'Trade_mark_office', 'Registration_number', 'Trade_mark_status', 'Registration_date', 'Representative_Name'])
    df.to_excel(xlsName, sheet_name='sheet1', index=False, encoding='utf-8')
    traceback.print_exc()
time.sleep(5)
driver.quit()
What you need to do is switch to the frame with switch_to.frame:
iframe = driver.find_element_by_xpath('//iframe[@id="iframe_0"]')
driver.switch_to.frame(iframe)
# then switch back:
driver.switch_to.default_content()
EDIT:
You asked what to do if the id changes, so here is an idea: you can use contains() in your XPath, like this:
# this will find any iframe whose id contains "iframe_"
# you should check that there is only one; you can do so with:
#   iframes = driver.find_elements_by_xpath('//iframe[contains(@id, "iframe_")]')
# then print(len(iframes)) to see the number of iframes
iframe = driver.find_element_by_xpath('//iframe[contains(@id, "iframe_")]')
driver.switch_to.frame(iframe)
# then switch back:
driver.switch_to.default_content()
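Since the detail iframe is loaded on demand, fixed sleeps can be flaky here. As a sketch (assuming the same contains(@id, "iframe_") locator as above), an explicit wait can switch into the frame as soon as it appears:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the iframe, then switch into it in one step
WebDriverWait(driver, 10).until(
    EC.frame_to_be_available_and_switch_to_it((By.XPATH, '//iframe[contains(@id, "iframe_")]')))
# ... scrape inside the frame here ...
driver.switch_to.default_content()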
In your code use:
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
url ="https://www.tmdn.org/tmview/welcome#"
driver = webdriver.Chrome(executable_path=r"D:\New Proj\chromedriver.exe")
driver.get(url)
time.sleep(3)
driver.find_element_by_xpath('//*[@id="buttonBox"]/a').click()
time.sleep(3)
#Click advanced search
driver.find_element_by_name("lnkAdvancedSearch").click()
#
time.sleep(5)
#to select Designated territories
driver.find_element_by_id('DesignatedTerritories').click()
time.sleep(5)
TerritoryLabelElements = driver.find_elements_by_css_selector('div.optEUGroupContainer label')
for elem in TerritoryLabelElements:
    if elem.text == 'United Kingdom':
        elem.click()
time.sleep(5)
driver.find_element_by_id('DesignatedTerritories').click()
#
time.sleep(5)
#to select from Trade mark offices
driver.find_element_by_id('SelectedOffices').click()
time.sleep(5)
TerritoryLabelElements = driver.find_elements_by_css_selector('div.multiSelectOptions label')
for elem in TerritoryLabelElements:
    if elem.text == 'GB United Kingdom ( UKIPO )':
        elem.click()
time.sleep(5)
driver.find_element_by_id('SelectedOffices').click()
#Trade mark status
driver.find_element_by_id('TradeMarkStatus').click()
time.sleep(5)
TerritoryLabelElements = driver.find_elements_by_css_selector('div.multiSelectOptions label')
for elem in TerritoryLabelElements:
    if elem.text == 'Filed':
        elem.click()
    if elem.text == 'Registered':
        elem.click()
time.sleep(5)
driver.find_element_by_id('TradeMarkStatus').click()
# dates
startdate = driver.find_element_by_id("ApplicationDateFrom")
startdate.clear()
startdate.send_keys ('10-01-2018')
enddate = driver.find_element_by_id("ApplicationDateTo")
enddate.clear()
enddate.send_keys ('10-01-2018' )
# click search
time.sleep(5)
driver.find_element_by_id("SearchCopy").click()
time.sleep(30)
#Click first link
el=driver.find_elements_by_class_name('cell_tmName_column')[0]
action = ActionChains(driver)
action.move_to_element_with_offset(el, 0, 0)
action.click()
action.perform()
time.sleep(10)
iframe = driver.find_element_by_xpath('//iframe[@id="iframe_0"]')
driver.switch_to.frame(iframe)
# do something here; I am printing the frame's HTML
# (after switching into the frame, read driver.page_source;
# the iframe element handle still belongs to the parent context)
print(driver.page_source)
# then switch back:
driver.switch_to.default_content()
Hope this helps you!
I've been trying to flag/report a list of spam comments on a particular YouTube video.
For that I've been using this code in Python, which loads my previous profile so that I am logged in with my account:
import pathlib
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

URL = "https://www.youtube.com/watch?v=dvecqwfU6xw&lc=Ugxw_nsUNUor9AUEBGp4AaABAg.9fDfvkgiqtW9fDkE2r6Blm"
soup = BeautifulSoup(requests.get(URL).content, "html.parser")
options = webdriver.ChromeOptions()
user = pathlib.Path().home()
print(user)
options.add_argument(f"user-data-dir={user}/AppData/Local/Google/Chrome/User Data/")
driver = webdriver.Chrome('chromedriver.exe', chrome_options=options)
driver.get(URL)
wait = WebDriverWait(driver, 100)
comment_box = '//*[@id="comment"]'
reply_box = '//*[@id="replies"]'
while True:
    driver.execute_script("window.scrollBy(0, 200);")
    try:
        reply_box = driver.find_element(By.XPATH, reply_box)
        print(reply_box.text)
        break
    except:
        pass
# resp = driver.request('POST', 'https://www.youtube.com/youtubei/v1/flag/get_form?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false')
# print(resp.text)
button = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="button"]')))
driver.execute_script("arguments[0].click();", button)
The problem comes with opening the menu. I believe that since you have to hover over the three-dots menu before it appears as clickable, I never actually get to open the menu to report/flag the comment.
My mistake was not taking the full XPath. It works perfectly like this, thanks!
options = webdriver.ChromeOptions()
user = pathlib.Path().home()
print(user)
options.add_argument(f"user-data-dir={user}/AppData/Local/Google/Chrome/User Data/")
options.add_argument('--headless')
driver = webdriver.Chrome('chromedriver.exe', chrome_options=options)
driver.get(URL)
wait = WebDriverWait(driver, 100)
comment_box = '//*[@id="comment"]'
reply_box = '//*[@id="replies"]'
while True:
    driver.execute_script("window.scrollBy(0, 200);")
    try:
        reply_box = driver.find_element(By.XPATH, reply_box)
        print(reply_box.text)
        break
    except:
        pass
option_button = '/html/body/ytd-app/div[1]/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[2]/ytd-comments/ytd-item-section-renderer/div[3]/ytd-comment-thread-renderer[1]/div/ytd-comment-replies-renderer/div[2]/ytd-comment-renderer/div[3]/div[3]/ytd-menu-renderer/yt-icon-button/button'
option_button = wait.until(EC.presence_of_element_located((By.XPATH, option_button)))
driver.execute_script("arguments[0].click();", option_button)
report_button = '/html/body/ytd-app/ytd-popup-container/tp-yt-iron-dropdown/div/ytd-menu-popup-renderer/tp-yt-paper-listbox/ytd-menu-service-item-renderer/tp-yt-paper-item/yt-formatted-string'
report_button = wait.until(EC.presence_of_element_located((By.XPATH, report_button)))
driver.execute_script("arguments[0].click();", report_button)
report_button_spam = '/html/body/ytd-app/ytd-popup-container/tp-yt-paper-dialog/yt-report-form-modal-renderer/tp-yt-paper-dialog-scrollable/div/div/yt-options-renderer/div/tp-yt-paper-radio-group/tp-yt-paper-radio-button[1]/div[1]'
report_button_spam = wait.until(EC.presence_of_element_located((By.XPATH, report_button_spam)))
driver.execute_script("arguments[0].click();", report_button_spam)
report_button_send = '/html/body/ytd-app/ytd-popup-container/tp-yt-paper-dialog/yt-report-form-modal-renderer/div/yt-button-renderer[2]/a/tp-yt-paper-button'
report_button_send = wait.until(EC.presence_of_element_located((By.XPATH, report_button_send)))
driver.execute_script("arguments[0].click();", report_button_send)
popup_button_done = '/html/body/ytd-app/ytd-popup-container/tp-yt-paper-dialog[2]/yt-confirm-dialog-renderer/div[2]/div[2]/yt-button-renderer[3]/a/tp-yt-paper-button'
popup_button_done = wait.until(EC.presence_of_element_located((By.XPATH, popup_button_done)))
print(popup_button_done.text)
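On the original hover problem: instead of relying on full XPaths, ActionChains can move the mouse over the comment so YouTube renders the hidden three-dots button before it is clicked. This is only a sketch; the locators below reuse the ones from the question and are illustrative, not verified against the live page:
from selenium.webdriver.common.action_chains import ActionChains

# hover over the reply so its hidden action buttons get rendered
reply = driver.find_element(By.XPATH, '//*[@id="replies"]')
ActionChains(driver).move_to_element(reply).perform()
# the three-dots button should now be present and clickable
menu_button = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="button"]')))
driver.execute_script("arguments[0].click();", menu_button)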
I was doing some crawling with Selenium.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd

instagram_id = "username"
instagram_pw = "password"

# (driver creation and navigation to the Instagram login page omitted in the original)
_id = driver.find_element(By.NAME, 'username')
_id.send_keys(instagram_id)
time.sleep(2)
_password = driver.find_element(By.NAME, 'password')
_password.send_keys(instagram_pw)
time.sleep(2)
login_button = driver.find_element(By.CSS_SELECTOR, '.sqdOP.L3NKy.y3zKF').click()
time.sleep(5)  # press login button
_keyword = '교토'  # 'Kyoto'
driver.get('https://www.instagram.com/explore/tags/' + _keyword + '/')  # Instagram tag search
driver.find_element(By.CSS_SELECTOR, 'div.v1Nh3.kIKUG._bz0w').click()
time.sleep(5)  # open first post
There was no problem up to this point.
But here a NoSuchElementException occurs:
results = []
count = 200
for i in range(count):
    data = driver.find_elements(By.CSS_SELECTOR, 'a.xil3i')  # save hashtag info
    for j in range(len(data)):
        results.append(data[j].text.replace("#", ""))  # remove '#'
    if (i + 1) % 10 == 0:
        print('{} posts done'.format(i + 1))
    driver.find_element(By.CSS_SELECTOR, 'a._65Bje.coreSpriteRightPaginationArrow').click()  # move to the next post
    time.sleep(5)
Please help me fix this error.
Thanks
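A likely cause is that the next-post arrow has not rendered yet when click() runs. A minimal sketch of an explicit wait, assuming the CSS selector itself is still valid for the page:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the next-post arrow before clicking it
arrow = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, 'a._65Bje.coreSpriteRightPaginationArrow')))
arrow.click()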
import csv
import time
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from csv import reader
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException

chrome_options = Options()
scroll = 5
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
header_added = False
header_added1 = False
url = "url"
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe', options=chrome_options)
driver.maximize_window()
driver.get(url)
time.sleep(3)
search_city = input("Enter the city :")
res_n = input("Enter the Restaurant's name :")
search = driver.find_element_by_xpath('//input[@name="location"]').send_keys(search_city)
time.sleep(2)
driver.find_element_by_xpath('//*[@id="root"]/div[1]/div[1]/div/div[1]/div[1]/div/div[2]/div/div[3]/div[1]/span[2]').click()
time.sleep(3)
driver.find_element_by_xpath('/html/body/div[1]/div[1]/header/div/div/ul/li[5]/div/a/span[1]').click()
time.sleep(1)
search_res = driver.find_element_by_class_name('_2BJMh').send_keys(res_n.lower())
time.sleep(5)
driver.find_element_by_class_name('_2BJMh').send_keys(Keys.RETURN)
time.sleep(5)
try:
    driver.find_element_by_class_name('_3FR5S').click()
    time.sleep(5)
except:
    print("restaurant not open")
    driver.quit()
html = driver.find_element_by_tag_name('html')

def get_items():
    global header_added
    global item_dvs
    cats = driver.find_elements_by_class_name('D_TFT')
    cats[1].click()
    time.sleep(3)
    item_dvs = driver.find_elements_by_class_name('_2wg_t')
    for div in item_dvs:
        name = div.find_element_by_class_name('styles_itemNameText__3bcKX')
        print(name.text)
        price = div.find_element_by_class_name('rupee')
        print(price.text)
        if div.find_elements_by_class_name('styles_itemDesc__MTsVd'):
            desc = div.find_element_by_class_name('styles_itemDesc__MTsVd').text
        else:
            desc = None
        if div.find_element_by_css_selector('div._1C1Fl._23qjy'):
            element = div.find_element_by_css_selector('div._1C1Fl._23qjy')
            print("found")
            driver.execute_script("arguments[0].scrollIntoView();", element)
            add = div.find_element_by_css_selector('._1RPOp')
            driver.execute_script("arguments[0].click();", add)
            time.sleep(1)
            add_ons = driver.find_element_by_class_name('_3UzO2').text
            print(add_ons)
            driver.find_element_by_css_selector('#modal-placeholder > div:nth-child(3) > div > div._1Kr-y._3EeZR > div > div._1EZLh > div > button').click()
        else:
            add_ons = None
        dict1 = {'Item Name': name.text, "Price": price.text, "Add Ons :": add_ons, "Description": desc}
        with open(f'{search_city}_{res_n}.csv', 'a+', encoding='utf-8-sig') as f:
            w = csv.DictWriter(f, dict1.keys())
            if not header_added:
                w.writeheader()
                header_added = True
            w.writerow(dict1)

get_items()
The is_cust check keeps matching over and over, opening the same element, while the rest of the code moves on to the next divs. What is wrong here?
XPath lookups are bidirectional: an expression starting with // searches the whole document rather than just the element it is called on, and that is probably the cause here.
Try this code using a CSS selector:
for div in item_dvs:
    # Do something
    try:
        is_cust = div.find_element_by_css_selector('._1C1Fl._23qjy')
        print("found")
    except NoSuchElementException:
        continue
    driver.execute_script("arguments[0].scrollIntoView();", is_cust)
    add = div.find_element_by_css_selector('._1RPOp')
    driver.execute_script("arguments[0].click();", add)
    time.sleep(1)
    # Not sure why you had driver instead of div for this one; I suspect it should be div:
    add_ons = div.find_element_by_class_name('_26cJ9').text
    div.find_element_by_css_selector('#modal-placeholder > div:nth-child(3) > div > div._1Kr-y._3EeZR > div > div._1EZLh > div > button').click()
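If you would rather keep XPath, you can get the same scoping with a leading dot, which restricts the search to the current element's subtree; a small sketch (the class name is taken from the code above):
# './/div[...]' searches only inside `div`;
# '//div[...]' would search the whole page from the document root
is_cust = div.find_element_by_xpath('.//div[contains(@class, "_1C1Fl")]')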
UPDATE
From your updated code: you are using a lot of hardcoded sleeps. I suggest using WebDriverWait with expected_conditions instead.
More info here: Waits in Selenium
Imports needed:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
Code to be added post driver creation:
wait_time = 5
wait = WebDriverWait(driver, wait_time)
Instead of using sleep like this:
time.sleep(5)
driver.find_element_by_class_name('_2BJMh').send_keys(Keys.RETURN)
time.sleep(5)
Use:
wait.until(EC.presence_of_element_located((By.CLASS_NAME, '_2BJMh'))).send_keys(res_n.lower())
Don't gather the element twice: use find_elements_by_* and then validate the length:
descs = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'styles_itemDesc__MTsVd')))
if len(descs) > 0:
    desc = descs[0].text
else:
    desc = None
I'm scraping an e-commerce website, Lazada, using Selenium and bs4. I managed to scrape the 1st page, but I'm unable to iterate to the next page. What I'm trying to achieve is to scrape all the pages of the categories I've selected.
Here is what I've tried:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Run the browser in incognito mode
option = webdriver.ChromeOptions()
option.add_argument('--incognito')
driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)
driver.get('https://www.lazada.com.my/')
driver.maximize_window()

# Select category item
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
t = 10
try:
    WebDriverWait(driver, t).until(EC.visibility_of_element_located((By.ID, "a2o4k.searchlistcategory.0.i0.460b6883jV3Y0q")))
except TimeoutException:
    print('Page Refresh!')
    driver.refresh()
    element = driver.find_elements_by_class_name('card-categories-li-content')[0]
    webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
print('Page Load!')

# Soup and select elements
def getData(np):
    soup = bs(driver.page_source, "lxml")
    product_containers = soup.findAll("div", class_='c2prKC')
    for p in product_containers:
        title = p.find(class_='c16H9d').text  # title
        selling_price = p.find(class_='c13VH6').text  # selling price
        try:
            original_price = p.find("del", class_='c13VH6').text  # original price
        except:
            original_price = "-1"
        if p.find("i", class_='ic-dynamic-badge ic-dynamic-badge-freeShipping ic-dynamic-group-2'):
            freeShipping = 1
        else:
            freeShipping = 0
        try:
            discount = p.find("span", class_='c1hkC1').text
        except:
            discount = "-1"
        if p.find(("div", {'class': ['c16H9d']})):
            url = "https:" + p.find("a").get("href")
        else:
            url = "-1"
        nextpage_elements = driver.find_elements_by_class_name('ant-pagination-next')[0]
        np = webdriver.ActionChains(driver).move_to_element(nextpage_elements).click(nextpage_elements).perform()
        print("- -" * 30)
        toSave = [title, selling_price, original_price, freeShipping, discount, url]
        print(toSave)
        writerows(toSave, filename)  # writerows and filename are defined elsewhere in the asker's script

getData(np)
The problem might be that the driver is trying to click the button before the element is even loaded correctly.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(PATH, chrome_options=option)
# use this code after driver initialization;
# it makes the driver wait up to 5 seconds for elements to appear
driver.implicitly_wait(5)
url = "https://www.lazada.com.ph/catalog/?q=phone&_keyori=ss&from=input&spm=a2o4l.home.search.go.239e359dTYxZXo"
driver.get(url)
next_page_path = "//ul[@class='ant-pagination ']//li[@class=' ant-pagination-next']"
# the following code will wait 5 seconds for the
# element to become clickable and then try clicking it
try:
    next_page = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, next_page_path)))
    next_page.click()
except Exception as e:
    print(e)
EDIT 1
Changed the code to make the driver wait for the element to become clickable. You can put this inside a while loop to iterate over multiple pages, and break out of the loop once the button is not found or not clickable, as sketched below.
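A minimal sketch of that loop (reusing next_page_path from above; the timeout doubles as the stop condition):
# keep clicking "next" until the button is gone or no longer clickable
while True:
    try:
        next_page = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, next_page_path)))
        next_page.click()
        # ... scrape the freshly loaded page here ...
    except Exception:
        break  # button missing or not clickable: no more pages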
I am trying to scrape basic information on Google. The code I am using is the following. Unfortunately it does not move to the next page and I cannot figure out why. I am using Selenium and Google Chrome as the browser (not Firefox). Could you please tell me what is wrong with my code?
from bs4 import BeautifulSoup
from bs4.element import Tag
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

# (driver creation omitted in the original)
driver.get('https://www.google.com/advanced_search?q=google&tbs=cdr:1,cd_min:3/4/2020,cd_max:3/4/2020&hl=en')
search = driver.find_element_by_name('q')
search.send_keys('tea')
search.submit()
soup = BeautifulSoup(driver.page_source, 'lxml')
result_div = soup.find_all('div', attrs={'class': 'g'})
titles = []
while True:
    next_page_btn = driver.find_elements_by_xpath("//a[@id='pnnext']")
    for r in result_div:
        if len(next_page_btn) < 1:
            print("no more pages left")
            break
        else:
            try:
                title = None
                title = r.find('h3')
                if isinstance(title, Tag):
                    title = title.get_text()
                    print(title)
                if title != '':
                    titles.append(title)
            except:
                continue
    element = WebDriverWait(driver, 5).until(expected_conditions.element_to_be_clickable((By.ID, 'pnnext')))
    driver.execute_script("return arguments[0].scrollIntoView();", element)
    element.click()
I set q in the query string to be an empty string. I used as_q, not q, for the search box name, and reordered your code a bit. I put in a page limit to stop it going on forever.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

driver = webdriver.Chrome()
driver.get('https://www.google.com/advanced_search?q=&tbs=cdr:1,cd_min:3/4/2020,cd_max:3/4/2020&hl=en')
search = driver.find_element_by_name('as_q')
search.send_keys('tea')
search.submit()
titles = []
page_limit = 5
page = 0
while True:
    soup = BeautifulSoup(driver.page_source, 'lxml')
    result_div = soup.find_all('div', attrs={'class': 'g'})
    for r in result_div:
        for title in r.find_all('h3'):
            title = title.get_text()
            print(title)
            titles.append(title)
    next_page_btn = driver.find_elements_by_id('pnnext')
    if len(next_page_btn) == 0 or page > page_limit:
        break
    element = WebDriverWait(driver, 5).until(expected_conditions.element_to_be_clickable((By.ID, 'pnnext')))
    driver.execute_script("return arguments[0].scrollIntoView();", element)
    element.click()
    page = page + 1
driver.quit()