I got a dataframe that contains links to google reviews of two restaurants. I wanted to load all reviews of two restaurants (one by one) into the browser and then save them into a new data frame. I wrote a script that reads and load all reviews into the browser as follow:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
link_df = Link
0 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
1 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
i = 0
driver = webdriver.Chrome()
for index, i in link_df.iterrows():
base_url = i['Link'] #link_df['Link'][i]
driver.get(base_url)
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//div[./span[text()='Newest']]"))).click()
print('Restaurant number is ',index)
title = driver.find_element_by_xpath("//div[#class='P5Bobd']").text
address = driver.find_element_by_xpath("//div[#class='T6pBCe']").text
overall_rating = driver.find_element_by_xpath("//div[#class='review-score-container']//span[#class='Aq14fc']").text
total_reviews_text =driver.find_element_by_xpath("//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int (total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
time.sleep(2)
total_reviews = len(all_reviews)
while total_reviews < num_reviews:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
time.sleep(5)
all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
print(total_reviews)
total_reviews +=1
reviews_info = driver.find_elements_by_xpath("//div[#class='jxjCjc']")
review_information = pd.DataFrame(columns=["Restaurant title","Restaurant rating","Total reviews","Reviewer Name","Rating", "Review"])
name= ''
rating = ''
text = ''
for index,review_info in enumerate(reviews_info):
name = review_info.find_element_by_xpath("./div/div/a").text
rating = review_info.find_element_by_xpath(".//div[#class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
text = review_info.find_element_by_xpath(".//div[#class='Jtu6Td']//span").text
review_information.at[len(review_information)] = [title,overall_rating,num_reviews,name,rating,text]
filename = 'Google_reviews' + ' ' +pd.to_datetime("now").strftime("%Y_%m_%d")+'.csv'
files_present = glob.glob(filename)
if files_present:
review_information.to_csv(filename,index=False,mode='a',header=False)
else:
review_information.to_csv(filename,index=False)
driver.get('https:ww.google.com')
time.sleep(3)
The problem is that script throws an error when it reaches the following line.
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
It throws following error:
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=95.0.4638.69)
When I tried the same program without storing google links in dataframe (i.e. no for loop and instead of base_url = i['Link'], I wrote base_url = google review link) it works fine.
I am not sure where I am making the mistake. Any suggestion or help to fix the issue would be highly appreciated?
EDIT
you put the creation of driver outside the for loop
you cant launch the new url with gps data when the first popup is always in front, if you launch it, it stays in backdoor, the easier way is to launch a new url without gps data -> https:ww.google.com and wait 3 dec before to follow your loop:
your count is not good, i have changed your selector and change the total and set some lines in comment
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
import time
link_df = ["https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binary = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
options = Options()
options.binary = binary
driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
# i have to launch one time to accept the cookies manually
#by setting a breakpoint after, but you dont have that i think
#driver.get(link_df[0])
print ("Headless Firefox Initialized")
print(link_df)
for url in link_df:
base_url = url # i['Link'] # link_df['Link'][i]
print(base_url)
driver.get(base_url)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Avis les plus récents']]"))).click()
title = driver.find_element_by_xpath("//div[#class='P5Bobd']").text
address = driver.find_element_by_xpath("//div[#class='T6pBCe']").text
overall_rating = driver.find_element_by_xpath("//div[#class='review-score-container']//span[#class='Aq14fc']").text
total_reviews_text = driver.find_element_by_xpath(
"//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int(total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
# time.sleep(2)
total_reviews = 0
while total_reviews < num_reviews:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
all_reviews = WebDriverWait(driver, 5).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
total_reviews = len(all_reviews)
print(total_reviews, len(all_reviews))
driver.get('https:ww.google.com') # or driver.close() if no bugs
time.sleep(3)
driver.close()
driver.quit()
it seems the solution for chrome needs some fixes:
org.openqa.selenium.StaleElementReferenceException: stale element reference: element is not attached to the page document
The literal meaning is about , The referenced element is out of date , No longer attached to the current page . Usually , This is because the page has been refreshed or skipped , The solution is , Reuse findElement or findElements Method to locate the element .
so its seems for chrome there is a problem of refreshing, so i suggest to load the number of record before to scroll, to have a fresh copy of DOM items, and i have to add a wait 1sec at the end of while loop
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
#from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
import time
link_df = [
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binaryfirefox = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
binarychrome = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
options = Options()
#cap = DesiredCapabilities().CHROME
#cap["marionette"] = True
#cap = DesiredCapabilities().FIREFOX
#options.binary = binaryfirefox
#driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
options.binary_location = binarychrome
driver = webdriver.Chrome(options=options, executable_path="E:\\Téléchargement\\chromedriver.exe" )
# same reason tha Firefox i have to load one time
# an url to accept manually the cookies
#driver.get(link_df[0])
print(link_df)
for url in link_df:
base_url = url # i['Link'] # link_df['Link'][i]
print(base_url)
driver.get(base_url)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()
title = driver.find_element_by_xpath("//div[#class='P5Bobd']").text
address = driver.find_element_by_xpath("//div[#class='T6pBCe']").text
overall_rating = driver.find_element_by_xpath("//div[#class='review-score-container']//span[#class='Aq14fc']").text
total_reviews_text = driver.find_element_by_xpath(
"//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int(total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
# time.sleep(2)
total_reviews = 0
while total_reviews < num_reviews:
#reload to avoid exception, or trap scroll with try/except but more expznsive
all_reviews = WebDriverWait(driver, 20).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
total_reviews = len(all_reviews)
print(total_reviews, len(all_reviews))
time.sleep(1)
driver.get('https:ww.google.com') # or driver.close() if no bugs
time.sleep(3)
driver.close()
driver.quit()
I am trying to scrape data from a website using Selenium, I am able to send values but not receiving the result to start scraping. The program is also not throwing any errors. Here's the reproducible code:
# Setting up driver
driver = webdriver.Chrome(executable_path='D:\Program Files\ChromeDriver\chromedriver.exe')
# Opening up the webpage
driver.get("https://www1.nseindia.com/products/content/derivatives/equities/historical_fo.htm")
# Setting the value of Select Instrument
driver.find_element_by_id('instrumentType').send_keys('Index Options')
# Setting the value of Select Symbol
driver.find_element_by_id('symbol').send_keys('NIFTY 50')
# Setting the value of Select Year
driver.find_element_by_id('year').send_keys('2019')
# Setting the value of Select Expiry
select = Select(driver.find_element_by_id('expiryDate'))
noOfExpiries = 2
select.select_by_index(noOfExpiries)
# Setting the value of Select Option Type
cycle = 'PE'
driver.find_element_by_id('optionType').send_keys(cycle)
# Clicking the date range radio button
driver.find_element_by_xpath("//input[#id='rdDateToDate']").click()
# Setting the date range
fromDate = (datetime.strptime(select.options[noOfExpiries].text, '%d-%m-%Y') - timedelta(days=45)).strftime("%d-%b-%Y")
driver.find_element_by_id('fromDate').send_keys(fromDate)
toDate = datetime.strptime(select.options[noOfExpiries].text, '%d-%m-%Y').strftime("%d-%b-%Y")
driver.find_element_by_id('toDate').send_keys(toDate)
print(fromDate, toDate)
# Clicking the Get Data button
driver.find_element_by_id('getButton').click()
Any clue as to what am I missing here?
Okay, this should do it. I came up with selenium based solution only because you expressed disinterest in requests module:
from selenium import webdriver
from datetime import datetime, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = ChromeOptions()
options.add_argument('disable-blink-features=AutomationControlled')
with webdriver.Chrome(options=options) as driver:
wait = WebDriverWait(driver,10)
driver.get("https://www1.nseindia.com/products/content/derivatives/equities/historical_fo.htm")
wait.until(EC.presence_of_element_located((By.ID, "instrumentType"))).send_keys('Index Options')
wait.until(EC.presence_of_element_located((By.ID, "symbol"))).send_keys('NIFTY 50')
wait.until(EC.presence_of_element_located((By.ID, "year"))).send_keys('2019')
select = Select(wait.until(EC.presence_of_element_located((By.ID, "expiryDate"))))
noOfExpiries = 13
select.select_by_index(noOfExpiries)
cycle = 'PE'
wait.until(EC.presence_of_element_located((By.ID, "optionType"))).send_keys(cycle)
item = wait.until(EC.presence_of_element_located((By.XPATH, "//input[#id='rdDateToDate']")))
driver.execute_script("arguments[0].click();",item)
fromDate = (datetime.strptime(select.options[noOfExpiries].text, '%d-%m-%Y') - timedelta(days=45)).strftime("%d-%b-%Y")
wait.until(EC.presence_of_element_located((By.ID, "fromDate"))).send_keys(fromDate)
toDate = datetime.strptime(select.options[noOfExpiries].text, '%d-%m-%Y').strftime("%d-%b-%Y")
wait.until(EC.presence_of_element_located((By.ID, "toDate"))).send_keys(toDate,Keys.ENTER)
print(fromDate, toDate)
search_button = wait.until(EC.presence_of_element_located((By.ID, "getButton")))
driver.execute_script("arguments[0].click();",search_button)
tabular_content = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#historicalData > table"))).get_attribute("innerHTML")
print(tabular_content)
I had posted in Stack Exchange earlier; however, did not get much response from that yet; hence, posting it here.
I am trying to scrape some data using the following code. When I run the code line by line, it works fine. However, when I want to run all code at one go, the dropdown options go blank and as a result, the last line returns error. Your help would be much appreciated. The code is below.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import os
path = os.path.join(r"D:\ScrapedData\TN\SocialAudit")
path_to_chromedriver = 'D:\ScrapedData/chromedriver'
options = webdriver.ChromeOptions()
prefs = {'download.default_directory' : path}
options.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(chrome_options=options ,executable_path=path_to_chromedriver)
url = "http://mnregaweb4.nic.in/netnrega/SocialAudit/StateList.aspx"
browser.get(url)
browser.set_page_load_timeout(45)
browser.maximize_window()
browser.find_element_by_link_text("BIHAR").click()
browser.implicitly_wait(5)
year=['2016-2017', '2017-2018', '2018-2019', '2019-2020']
elem2 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlFin")
elem2.send_keys(year[0])
browser.implicitly_wait(5)
select_dist = browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddldist")
options = [x for x in select_dist.find_elements_by_tag_name("option")]
dist=[]
for e in range(len(options)):
select_dist = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddldist"))
select_dist.select_by_index(e)
select_block = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlblock"))
options1 = select_block.options
for f in range(len(options1)):
select_block = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlblock"))
select_block.select_by_index(f)
select_gp = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlpanchayat"))
options2 = select_gp.options
for g in range(len(options2)):
select_gp = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlpanchayat"))
select_gp.select_by_index(g)
browser.find_element_by_css_selector("#ctl00_ContentPlaceHolder1_rbLoginLevel_1").click()
browser.implicitly_wait(10)
elem6 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$txtperiodFrom")
elem6.send_keys('01/04/2016')
browser.implicitly_wait(10)
elem7 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$txtperiodTo")
elem7.send_keys('31/03/2017')
browser.implicitly_wait(10)
browser.find_element_by_css_selector("#ctl00_ContentPlaceHolder1_login").click()
browser.implicitly_wait(10)
browser.find_element_by_link_text("Download All Reports").click()
Besides that the target page is slower than an aged snail, and those 10 second waits are barely enough for anything, there are two things you missed an those caused your troubles:
you did not take account that the first element of the select options are "select an option" types. So if you try to cycle trough all of them, you must ignore the option at the first index, else it will look like "nothing is selected"
wait for that spinner. After the spinner is gone, page will be refreshed. Do not grab the elements before page refresh is complete, wait until the spinner is gone.
With these two helper functions it is possible to press the "Get Reports" button without issues:
def is_spinner_gone(arg):
loaded_spinner = browser.find_element_by_xpath('//div[//div[#class="loader"]]')
if loaded_spinner:
return loaded_spinner.get_attribute('style') == 'display: none;'
return True
def wait_for_element(xpath):
# this is necessary because the spinner does not pop up instantly
time.sleep(1)
no_spinner = WebDriverWait(browser, 500).until(is_spinner_gone)
element = WebDriverWait(browser, 500).until(
EC.element_to_be_clickable((By.XPATH, xpath)))
return element
If you get your elements via the wait_for_element call then you'll be able to interact with them without error. I guess you know that pressing that button is not the end of the road yet, you'll have to choose the report format and who knows what later on.
Adjusted code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import os
import time
path = os.path.join(r"D:\ScrapedData\TN\SocialAudit")
path_to_chromedriver = 'D:\ScrapedData/chromedriver'
options = webdriver.ChromeOptions()
prefs = {'download.default_directory' : path}
options.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(chrome_options=options ,executable_path=path_to_chromedriver)
start = time.time()
url = "http://mnregaweb4.nic.in/netnrega/SocialAudit/StateList.aspx"
browser.get(url)
browser.set_page_load_timeout(45)
browser.maximize_window()
loaded = time.time()
print(f'PAGE LOADED IN {loaded-start} seconds')
browser.find_element_by_link_text("BIHAR").click()
def is_spinner_gone(arg):
loaded_spinner = browser.find_element_by_xpath('//div[//div[#class="loader"]]')
if loaded_spinner:
return loaded_spinner.get_attribute('style') == 'display: none;'
return True
def wait_for_element(xpath):
# this is necessary because the spinner does not pop up instantly
time.sleep(1)
no_spinner = WebDriverWait(browser, 500).until(is_spinner_gone)
element = WebDriverWait(browser, 500).until(
EC.element_to_be_clickable((By.XPATH, xpath)))
return element
year=['2016-2017', '2017-2018', '2018-2019', '2019-2020']
elem2 = wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddlFin"]')
selector_page_loaded = time.time()
print(f'WORK AREA LOADED IN {selector_page_loaded-loaded} seconds')
elem2.send_keys(year[0])
select_dist = wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddldist"]')
options = [x for x in select_dist.find_elements_by_tag_name("option")]
dist=[]
# ISSUE: default fields are included in the options!
for e in range(1,len(options)):
select_dist = Select(wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddldist"]'))
select_dist.select_by_index(e)
select_block = Select(wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddlblock"]'))
options1 = select_block.options
for f in range(1, len(options1)):
select_block = Select(wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddlblock"]'))
select_block.select_by_index(f)
select_gp = Select(wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddlpanchayat"]'))
options2 = select_gp.options
for g in range(1, len(options2)):
select_gp = Select(wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddlpanchayat"]'))
select_gp.select_by_index(g)
wait_for_element('//*[#id="ctl00_ContentPlaceHolder1_rbLoginLevel_1"]').click()
elem6 = wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$txtperiodFrom"]')
elem6.send_keys('01/04/2016')
elem7 = wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$txtperiodTo"]')
elem7.send_keys('31/03/2017')
wait_for_element('//*[#value="Get Reports"]').click()
print(f'FIRST RUN IN {time.time()-selector_page_loaded}')
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd
class FindByXpathCss():
# Declaring variables
Reviews = [] # List to store final set of reviews
reviewText = [] # List to store reviews extracted from XPath
reviewFullText = []
# Chromedriver path
driver = webdriver.Chrome(executable_path=r"F:\Chrome-webdriver\chromedriver.exe")
driver.maximize_window()
baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
driver.get(baseUrl)
# driver.execute_script("scrollBy(0,300);")
# Scrolling down
for i in range(20):
driver.find_element_by_xpath('//*[#id="yDmH0d"]').send_keys(Keys.ARROW_DOWN, i)
time.sleep(0.5)
# To click on Show more button
#btnShowMore = driver.find_element_by_xpath('//*[#id="fcxH9b"]/div[4]/c-wiz/div/div[2]''/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span').click()
# Scrolling to top
for j in range(10):
driver.find_element_by_xpath('//*[#id="yDmH0d"]').send_keys(Keys.ARROW_UP, j)
#for i in range(10):
review_btn = driver.find_elements_by_xpath("//button[contains(#class,'')][contains(text(),'Full Review')]")
single_review_btn = driver.find_element_by_xpath("//button[contains(#class,'')][contains(text(),'Full Review')]")
#time.sleep(1)
The div html tag having 2 tags, one is having jsname as 'fbQN7e' which is there for holding the bigger reviews and those reviews will have button called "Full Review". Another one span within the same div html tag is 'bN97Pc' which is there to hold smaller reviews which wont have 'Full review' button at the end of this review. I couldn't get reviews of both types of span. Here I tried to write reviewFullText list directly to dataframe, but getting only element datatype, not text. I don't know why this too happening.
for btn in review_btn:
btn.click()
reviewFullText = driver.find_elements_by_css_selector("span[jsname='fbQN7e']")
#if(single_review_btn.is_enabled()==False):
#reviewText = driver.find_elements_by_css_selector("span[jsname=\"bN97Pc\"]")
##else:
#pass
# Iterating each reviews and appending into list Reviews
for txtreview in reviewText:
reviewFullText.append(txtreview.text)
print(len(reviewFullText))
# Writing the list values into csv file
df = pd.DataFrame(reviewFullText)
#df = pd.DataFrame({'Reviews': 'Reviews'}) #'Sentiment': 'null'})
df.to_csv('Reviews.csv', index=True, encoding='utf-8')
driver.close()
I have modified your solution to retrieve all review from the page.
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
class FindByXpathCss():
driver = webdriver.Chrome(executable_path=r"C:\New folder\chromedriver.exe")
driver.maximize_window()
baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
driver.get(baseUrl)
scrolls = 3
while True:
scrolls -= 1
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
time.sleep(3)
if scrolls < 0:
break
buttonClick = WebDriverWait(driver, 30).until(
EC.visibility_of_all_elements_located((By.XPATH, "//button[contains(#class,'')][contains(text(),'Full Review')]")))
for element in buttonClick:
driver.execute_script("arguments[0].click();", element)
reviewText = WebDriverWait(driver, 30).until(
EC.presence_of_all_elements_located((By.XPATH, "//*[#class='UD7Dzf']")))
for textreview in reviewText:
print textreview.text
reviewText = WebDriverWait(driver, 30).until(
EC.presence_of_all_elements_located((By.XPATH, "//*[#class='UD7Dzf']")))
# reviewText = driver.find_elements_by_xpath("//*[#class='UD7Dzf']")
for textreview in reviewText:
print textreview.text
Output:
I am not able to print the link of the final pdf which is opening after running the given code
from selenium import webdriver
from selenium.webdriver.support import ui
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
def page_is_loaded(driver):
return driver.find_element_by_tag_name("body")!= None
def check_exists_by_text(text):
try:
driver.find_element_by_link_text(text)
except NoSuchElementException:
return False
return True
driver = webdriver.Chrome("C:/Users/Roshan/Desktop/sbi/chromedriver")
driver.maximize_window()
driver.get("http://www.careratings.com/brief-rationale.aspx")
wait = ui.WebDriverWait(driver,10)
wait.until(page_is_loaded)
location_field = driver.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = driver.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
driver.find_element_by_xpath("//input[#name='btn_submit']").click()
if check_exists_by_text('Reliance Capital Limited'):
elm =driver.find_element_by_link_text('Reliance Capital Limited')
driver.implicitly_wait(5)
elm.click()
driver.implicitly_wait(50)
#time.sleep(5)
#driver.quit()
else :
print("Company is not rated in the given Date range")
I am expecting the actual output is the link of this pdf :
"http://www.careratings.com/upload/CompanyFiles/PR/Reliance%20Capital%20Ltd.-05-18-2019.pdf"
but I do not know how to print this link
You need to find all elements in table, then extract data from them.
from selenium import webdriver
import os
# setup path to chrome driver
chrome_driver = os.getcwd() + '/chromedriver'
# initialise chrome driver
browser = webdriver.Chrome(chrome_driver)
# load url
browser.get('http://www.careratings.com/brief-rationale.aspx')
# setup date range
location_field = browser.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = browser.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
browser.find_element_by_xpath("//input[#name='btn_submit']").click()
# get all data rows
content = browser.find_elements_by_xpath('//*[#id="divManagementSpeak"]/table/tbody/tr/td/a')
# get text and href link from each element
collected_data = []
for item in content:
url = item.get_attribute("href")
description = item.get_attribute("innerText")
collected_data.append((url, description ))
Output:
('http://www.careratings.com/upload/CompanyFiles/PR/Ashwini%20Frozen%20Foods-05-21-2019.pdf', 'Ashwini Frozen Foods')
('http://www.careratings.com/upload/CompanyFiles/PR/Vanita%20Cold%20Storage-05-21-2019.pdf', 'Vanita Cold Storage')
and so on
I would say you just need to put this line:
pdf_link = elm.get_attribute("href")
Just check out the below image. You have missed one important part to click on. When you enter some text in that inputbox, there is a dropdown projected downward displaying the search results available in their stock to choose from. Once you click on that, the rest are as it is.
Try the following script:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = "http://www.careratings.com/brief-rationale.aspx"
with webdriver.Chrome() as driver:
driver.get(url)
wait = WebDriverWait(driver,10)
location_field = wait.until(EC.presence_of_element_located((By.NAME, "txtfromdate")))
location_field.send_keys("2019-05-06")
last_date = wait.until(EC.presence_of_element_located((By.NAME, "txttodate")))
last_date.send_keys("2019-05-21")
input_search = wait.until(EC.presence_of_element_located((By.NAME, "txtSearchCompany_brief")))
input_search.send_keys('Reliance Capital Limited')
time.sleep(3) #could not get rid of this hardcoded delay to make the script work
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"[onclick*='Reliance Capital Limited']"))).click()
# time.sleep(2) #activate this line in case the script behaves otherwise
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"input[name='btn_submit']"))).click()
for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"table tr td > a[href$='.pdf']"))):
print(item.get_attribute("href"))