Extracting reviews from the Google Play Store app page - Python

import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd
class FindByXpathCss():
    # Declaring variables
    Reviews = []  # List to store the final set of reviews
    reviewText = []  # List to store reviews extracted from XPath
    reviewFullText = []

    # Chromedriver path
    driver = webdriver.Chrome(executable_path=r"F:\Chrome-webdriver\chromedriver.exe")
    driver.maximize_window()
    baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
    driver.get(baseUrl)
    # driver.execute_script("scrollBy(0,300);")

    # Scrolling down
    for i in range(20):
        driver.find_element_by_xpath('//*[@id="yDmH0d"]').send_keys(Keys.ARROW_DOWN, i)
        time.sleep(0.5)

    # To click on the Show more button
    # btnShowMore = driver.find_element_by_xpath('//*[@id="fcxH9b"]/div[4]/c-wiz/div/div[2]''/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span').click()

    # Scrolling to top
    for j in range(10):
        driver.find_element_by_xpath('//*[@id="yDmH0d"]').send_keys(Keys.ARROW_UP, j)

    # for i in range(10):
    review_btn = driver.find_elements_by_xpath("//button[contains(@class,'')][contains(text(),'Full Review')]")
    single_review_btn = driver.find_element_by_xpath("//button[contains(@class,'')][contains(text(),'Full Review')]")
    # time.sleep(1)
The review div holds two span tags: one with jsname 'fbQN7e', which holds the longer reviews that end with a "Full Review" button, and another span within the same div with jsname 'bN97Pc', which holds the shorter reviews that have no "Full Review" button at the end. I couldn't get the reviews from both types of span. I also tried writing the reviewFullText list directly to a dataframe, but I only get the element objects, not their text, and I don't know why this is happening either.
    for btn in review_btn:
        btn.click()
        reviewFullText = driver.find_elements_by_css_selector("span[jsname='fbQN7e']")
        # if single_review_btn.is_enabled() == False:
        #     reviewText = driver.find_elements_by_css_selector("span[jsname=\"bN97Pc\"]")
        # else:
        #     pass

    # Iterating over each review and appending it to the list Reviews
    for txtreview in reviewText:
        reviewFullText.append(txtreview.text)
    print(len(reviewFullText))

    # Writing the list values into a csv file
    df = pd.DataFrame(reviewFullText)
    # df = pd.DataFrame({'Reviews': 'Reviews'})  # 'Sentiment': 'null'})
    df.to_csv('Reviews.csv', index=True, encoding='utf-8')
    driver.close()
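For the two span types described above, a minimal sketch (reusing the jsname selectors from the question, which may have changed on the live Play Store page) is to collect both element lists and store their .text values; appending the WebElement objects themselves is why the dataframe ends up holding element references instead of review text:

long_reviews = driver.find_elements_by_css_selector("span[jsname='fbQN7e']")   # longer reviews with a 'Full Review' button
short_reviews = driver.find_elements_by_css_selector("span[jsname='bN97Pc']")  # shorter reviews without the button
review_texts = []
for el in long_reviews + short_reviews:
    text = el.text.strip()
    if text:  # skip hidden or empty duplicates
        review_texts.append(text)
df = pd.DataFrame({'Reviews': review_texts})
df.to_csv('Reviews.csv', index=False, encoding='utf-8')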

I have modified your solution to retrieve all reviews from the page.
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

class FindByXpathCss():
    driver = webdriver.Chrome(executable_path=r"C:\New folder\chromedriver.exe")
    driver.maximize_window()
    baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
    driver.get(baseUrl)

    scrolls = 3
    while True:
        scrolls -= 1
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(3)
        if scrolls < 0:
            break

    buttonClick = WebDriverWait(driver, 30).until(
        EC.visibility_of_all_elements_located((By.XPATH, "//button[contains(@class,'')][contains(text(),'Full Review')]")))
    for element in buttonClick:
        driver.execute_script("arguments[0].click();", element)

    reviewText = WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located((By.XPATH, "//*[@class='UD7Dzf']")))
    # reviewText = driver.find_elements_by_xpath("//*[@class='UD7Dzf']")
    for textreview in reviewText:
        print(textreview.text)

Reading weblink from dataframe throws "stale element reference: element is not attached to the page document" error

I have a dataframe that contains links to Google reviews of two restaurants. I wanted to load all reviews of the two restaurants (one by one) into the browser and then save them into a new dataframe. I wrote a script that reads and loads all reviews into the browser as follows:
from selenium import webdriver
import pandas as pd
import glob  # needed for glob.glob(filename) below
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
link_df = Link
0 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
1 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
i = 0
driver = webdriver.Chrome()
for index, i in link_df.iterrows():
    base_url = i['Link']  # link_df['Link'][i]
    driver.get(base_url)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()
    print('Restaurant number is ', index)
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    total_reviews_text = driver.find_element_by_xpath("//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
    time.sleep(2)
    total_reviews = len(all_reviews)
    while total_reviews < num_reviews:
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
        time.sleep(5)
        all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
        print(total_reviews)
        total_reviews += 1
    reviews_info = driver.find_elements_by_xpath("//div[@class='jxjCjc']")
    review_information = pd.DataFrame(columns=["Restaurant title", "Restaurant rating", "Total reviews", "Reviewer Name", "Rating", "Review"])
    name = ''
    rating = ''
    text = ''
    for index, review_info in enumerate(reviews_info):
        name = review_info.find_element_by_xpath("./div/div/a").text
        rating = review_info.find_element_by_xpath(".//div[@class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
        text = review_info.find_element_by_xpath(".//div[@class='Jtu6Td']//span").text
        review_information.at[len(review_information)] = [title, overall_rating, num_reviews, name, rating, text]
    filename = 'Google_reviews' + ' ' + pd.to_datetime("now").strftime("%Y_%m_%d") + '.csv'
    files_present = glob.glob(filename)
    if files_present:
        review_information.to_csv(filename, index=False, mode='a', header=False)
    else:
        review_information.to_csv(filename, index=False)
    driver.get('https://www.google.com')
    time.sleep(3)
The problem is that the script throws an error when it reaches the following line:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
It throws following error:
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=95.0.4638.69)
When I tried the same program without storing the Google links in a dataframe (i.e. no for loop, and instead of base_url = i['Link'] I wrote base_url = the Google review link), it works fine.
I am not sure where I am making the mistake. Any suggestion or help to fix the issue would be highly appreciated.
EDIT
You put the creation of the driver outside the for loop.
You can't launch the next URL with GPS data while the first review popup is still in front; if you launch it, the new page stays in the background. The easier way is to load a URL without GPS data (https://www.google.com) and wait about 3 seconds before continuing your loop.
Your count logic is not right, so I have changed your selector, changed how the total is computed, and commented out some lines:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
import time
link_df = ["https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binary = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
options = Options()
options.binary = binary
driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
# i have to launch one time to accept the cookies manually
#by setting a breakpoint after, but you dont have that i think
#driver.get(link_df[0])
print ("Headless Firefox Initialized")
print(link_df)
for url in link_df:
    base_url = url  # i['Link']  # link_df['Link'][i]
    print(base_url)
    driver.get(base_url)
    # 'Avis les plus récents' is the French UI label for 'Newest'
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Avis les plus récents']]"))).click()
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    total_reviews_text = driver.find_element_by_xpath(
        "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
    # time.sleep(2)
    total_reviews = 0
    while total_reviews < num_reviews:
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
        all_reviews = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
        total_reviews = len(all_reviews)
        print(total_reviews, len(all_reviews))
    driver.get('https://www.google.com')  # or driver.close() if no bugs
    time.sleep(3)
driver.close()
driver.quit()
It seems the solution for Chrome needs some fixes:
org.openqa.selenium.StaleElementReferenceException: stale element reference: element is not attached to the page document
The literal meaning is that the referenced element is out of date and no longer attached to the current page. Usually this happens because the page has been refreshed or navigated away from; the solution is to reuse findElement or findElements to locate the element again.
So it seems that with Chrome there is a refresh problem, so I suggest reloading the list of records before scrolling, to have a fresh copy of the DOM items, and I had to add a 1-second wait at the end of the while loop:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
#from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
import time
link_df = [
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binaryfirefox = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
binarychrome = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
options = Options()
#cap = DesiredCapabilities().CHROME
#cap["marionette"] = True
#cap = DesiredCapabilities().FIREFOX
#options.binary = binaryfirefox
#driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
options.binary_location = binarychrome
driver = webdriver.Chrome(options=options, executable_path="E:\\Téléchargement\\chromedriver.exe" )
# same reason tha Firefox i have to load one time
# an url to accept manually the cookies
#driver.get(link_df[0])
print(link_df)
for url in link_df:
    base_url = url  # i['Link']  # link_df['Link'][i]
    print(base_url)
    driver.get(base_url)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    total_reviews_text = driver.find_element_by_xpath(
        "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
    # time.sleep(2)
    total_reviews = 0
    while total_reviews < num_reviews:
        # reload to avoid the exception, or wrap the scroll in try/except, but that is more expensive
        all_reviews = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        total_reviews = len(all_reviews)
        print(total_reviews, len(all_reviews))
        time.sleep(1)
    driver.get('https://www.google.com')  # or driver.close() if no bugs
    time.sleep(3)
driver.close()
driver.quit()

How to click the parent class if the child href matches the requirement

Hi, I've tried to find the right Selenium code to click the main parent element if the following requirements exist in the class:
Parent class:
<div class="col-xs-2-4 shopee-search-item-result__item" data-sqe="item">
Child class:
<a data-sqe="link" href= any of the URLs printed in Python >
The child class contains this element:
<div class="_1gkBDw _2O43P5">
    <div class="_1HvBLA">
        <div class="_3ao649" data-sqe="ad">Ad</div>
Here is the code below:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import csv
import time
url = 'https://shopee.com.my/search?keyword=mattress'
driver = webdriver.Chrome(executable_path=r'E:/users/Francabicon/Desktop/Bots/others/chromedriver.exe')
driver.get(url)
time.sleep(0.8)
# select language
driver.find_element_by_xpath('//div[@class="language-selection__list"]/button').click()
time.sleep(3)
# scroll few times to load all items
for x in range(10):
    driver.execute_script("window.scrollBy(0,300)")
    time.sleep(0.1)
# get all links (without clicking)
all_items = driver.find_elements_by_xpath('//a[@data-sqe="link"]')
print('len:', len(all_items))
all_urls = []
j = 0
k = 45
for item in all_items:
    url = item.get_attribute('href')
    all_urls.append(url)
print(all_urls)
a = len(all_urls)
# now use the links
i = 0
while i <= 4:
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='col-xs-2-4 shopee-search-item-result__item' and @data-sqe='item']//a[@class='link' and @href= all_urls[i]]"))).click()
    i += 1
I've tried to locate:
- the div with the whole class
- the classes and the href individually
- clicking the first five columns
but it always fails.
Traceback (most recent call last):
  File "E:/Users/Asashin/Desktop/Bots/click test 7.py", line 52, in <module>
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='col-xs-2-4 shopee-search-item-result__item' and @data-sqe='item']//a[@class='link' and @href= all_urls[i]]"))).click()
  File "C:\Users\User\AppData\Local\Programs\Python\Python37-32\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
Can this be solved?
I have made a couple of changes.
When you fetch the href values you get the complete URL, not the relative URL you see in the DOM, so you need to remove the leading part in order to verify the href later.
In the last while loop, all_urls[i] is a variable; you need to pass it as a variable, not as part of the string literal.
Once you click each link you need to come back to the parent page again by using driver.back().
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
url = 'https://shopee.com.my/search?keyword=mattress'
driver = webdriver.Chrome(executable_path=r'E:/users/Francabicon/Desktop/Bots/others/chromedriver.exe')
driver.get(url)
# select language
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//div[@class="language-selection__list"]/button'))).click()
time.sleep(3)
# scroll a few times to load all items
for x in range(10):
    driver.execute_script("window.scrollBy(0,300)")
    time.sleep(0.1)
# get all links (without clicking)
all_items = driver.find_elements_by_xpath('//a[@data-sqe="link"]')
print('len:', len(all_items))
all_urls = []
j = 0
k = 45
for item in all_items:
    # This gives you the whole URL of the anchor tag
    url = item.get_attribute('href')
    # You need to remove the preceding part in order to verify the href later for clicking
    urlfinal = url.split('https://shopee.com.my')[1]
    all_urls.append(urlfinal)
print(all_urls)
a = len(all_urls)
# now use the links
i = 0
while i <= 4:
    # Identify the parent tag by its child tag using the following XPath.
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='col-xs-2-4 shopee-search-item-result__item' and @data-sqe='item'][.//a[@data-sqe='link' and @href='" + all_urls[i] + "']]"))).click()
    driver.back()
    i += 1
How to get the parent element:
child_element = driver.find_element_by_xpath('//a[@data-sqe="link"]')
parent_element = child_element.find_element_by_xpath('./ancestor::div[contains(@class, "shopee-search-item-result__item")][1]')
How to get an element with a specific child:
element = driver.find_element_by_xpath('//div[contains(@class, "shopee-search-item-result__item") and .//a[@data-sqe="link"]]')

Python Selenium webdriver iterate over form

I'm trying to automate a form assertion with a list of data; however, I'm struggling with when and how to use WebDriverWait or the driver's implicit wait. My list is 1000 strings. When I run a sample of 100, fewer than 100 are captured correctly. The code below catches ElementNotSelectableException/StaleElementReferenceException but doesn't handle them correctly.
import unittest
from selenium import webdriver
from selenium.common.exceptions import ElementNotVisibleException, ElementNotSelectableException, \
    ElementNotInteractableException, StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
driver = webdriver.Chrome()
oar_order_id = '#oar-order-id'
oar_zip_code = '#oar_zip'
#order confirmation list
order_id_list = ["WO-34284975","WO-50002804","WO-50004302","WO-34282964","WO-34214963"]
#zip code confirmation list
zip_code_list = ["23508","99338","62036","75074-7520","37763","98034","89406-4361"]
submit_button = 'body.sales-guest-form.page-layout-1column:nth-child(2) div.page-wrapper:nth-child(10) main.page-main:nth-child(4) div.columns:nth-child(4) div.column.main form.form.form-orders-search:nth-child(4) div.actions-toolbar div.primary > button.action.submit.primary'
# Enter orderid & Zip & click enter
found = []
notfound = []
length = len(order_id_list)
driver.implicitly_wait(10)
wait = WebDriverWait(driver, 15, poll_frequency=1,
                     ignored_exceptions=[ElementNotVisibleException, ElementNotSelectableException])
driver.get('https://www.ecklers.com/sales/orderlookup/index/')
# Sample of 10 from list
for i in range(10):
    try:
        wait  # note: a bare reference to the wait object does nothing
        element1 = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, oar_order_id)))
        element1.send_keys(order_id_list[i])
        element2 = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, oar_zip_code)))
        element2.send_keys(zip_code_list[i])
        element3 = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, submit_button)))
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, submit_button)))
        element3.click()
        wait
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "body.sales-guest-view.page-layout-1column:nth-child(2) div.page-wrapper:nth-child(10) main.page-main:nth-child(4) div.page-title-wrapper:nth-child(3) h1.page-title > span.base")))
        title = driver.title
        # driver.implicitly_wait(60)
    except StaleElementReferenceException:
        print("StaleElementReferenceException")
    except ElementNotInteractableException:
        print("ElementNotInteractableException")
    try:
        text = order_id_list[i]
        assert title.endswith(text[3:])
        found.append(text)
    except AssertionError:
        notfound.append(order_id_list[i])
    driver.get('https://www.ecklers.com/sales/orderlookup/index/')
    wait
print(found, notfound)
I found an answer that helped me: I used time.sleep(.500) after driver.get('https://www.ecklers.com/sales/orderlookup/index/').
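An alternative to the fixed sleep (a rough sketch only, reusing the selectors, lists, and the wait object from the question; the live site may differ, and the shorter 'h1.page-title > span.base' selector is assumed equivalent to the long one above) is to re-acquire the form fields with explicit waits after every driver.get() and to wait for the result page's title element before reading driver.title:

for i in range(len(order_id_list)):
    driver.get('https://www.ecklers.com/sales/orderlookup/index/')
    # Re-locate the fields on every iteration so no stale references are reused
    order_field = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, oar_order_id)))
    order_field.clear()
    order_field.send_keys(order_id_list[i])
    zip_field = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, oar_zip_code)))
    zip_field.clear()
    zip_field.send_keys(zip_code_list[i])
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, submit_button))).click()
    # Wait for the result page's title element so driver.title is not read from the old page
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.page-title > span.base')))
    if driver.title.endswith(order_id_list[i][3:]):
        found.append(order_id_list[i])
    else:
        notfound.append(order_id_list[i])
print(found, notfound)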

Unable to parse data inside h4 tag: Python3

I am facing an issue while parsing data from the 'Literature' tab of the third table. The steps I took to reach the table:
Go to ibl.mdanderson.org/fasmic/#!
Type and select AKT1 (3 mutations) (NOTE:'GO' button doesn't work, please click the option from the drop-down)
Click on the green button with the text 'MS', a new table will appear.
In this new table there will be a tab called 'Literature'; I need the literature text and the PMID.
I tried the following code, but it gives an empty list:
xyz = driver.find_element_by_xpath("//*[contains(text(),'Literature')]").click()
for elements in driver.find_elements_by_xpath('//div[@class="tab-pane ng-scope active"]'):
    soup = BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find('div', attrs={'id': "literature_div"})
    table_body = table.find('h4')
    rows = table.find_all('h4')
    for row in rows:
        cols = row.find_all('h4')
        # cols = [ele.text.strip() for ele in cols]
        litrature.append([ele for ele in cols if ele])  # Get rid of empty values
    print("Data from COLUMN 1:")
    print(litrature)
How can I resolve this?
UPDATE
When I try to click on the 'Next ' button under the 'literature' table, I get the following error:
"Message: The element reference of is stale; either the element is no longer attached to the DOM, it is not in the current frame context, or the document has been refreshed "
Following is the line I added to click on the "NEXT" button: driver.find_element_by_xpath('//a[@ng-click="selectPage(page + 1, $event)"]').click()
How can I resolve this?
You need to wait three times:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get('https://ibl.mdanderson.org/fasmic/#!/')
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//input')))
input = driver.find_element_by_xpath("//input")
input.send_keys("AKT1\n")
button = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, 'btn-tab-avail')))
button.click()
driver.find_element_by_xpath("//*[contains(text(),'Literature')]").click()
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#literature_div h4')))
rows = driver.find_elements_by_css_selector("#literature_div h4")
litrature = []
for item in rows:
    item = item.text
    litrature.append(item)
print("Data from COLUMN 1:")
print(item)
Like this? Someone with more knowledge of python waits can certainly improve on my wait lines.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
url = "https://ibl.mdanderson.org/fasmic/#!/"
d = webdriver.Chrome()
wait = WebDriverWait(d, 10)
d.get(url)
d.find_element_by_css_selector('[type=text]').send_keys('AKT1 (3 mutations)')
d.find_element_by_css_selector("input[type='text']").send_keys(Keys.RETURN)
btn = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".btn.btn-default.btn-tab-avail")))
btn.click()
d.find_element_by_css_selector("[heading=Literature]").click()
ele = wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#literature_div [ng-repeat]"), "PMID"))
eles = d.find_elements_by_css_selector("#literature_div [ng-repeat]")
for item in eles:
    print(item.text, "\n")
d.quit()
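For the "NEXT" button issue in the update above, one possible approach (a sketch only, run before d.quit(), assuming the d and wait handles from the snippet above and the ng-click selector from the question; the page's Angular pagination is not tested here) is to re-locate the button on every page and wait for the current rows to detach before reading the next batch:

from selenium.common.exceptions import NoSuchElementException, TimeoutException

litrature = []
while True:
    rows = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#literature_div h4')))
    litrature.extend(r.text for r in rows)
    try:
        # Re-find the Next button each time so a stale reference is never reused
        next_btn = d.find_element_by_xpath('//a[@ng-click="selectPage(page + 1, $event)"]')
        next_btn.click()
        # Wait until the first row of the old page detaches before continuing
        wait.until(EC.staleness_of(rows[0]))
    except (NoSuchElementException, TimeoutException):
        # No more pages (or the rows did not refresh) - stop
        break
print(litrature)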

Parsing a site where URL doesn't change with Selenium Python

I'm trying to scrape this site; its URL doesn't change when the next page is clicked. So I used Selenium to click on the next page, but doing that doesn't help, as my driver keeps getting the old page even after the next page is clicked. Is there any other way to get to the next page and scrape it?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
driver = webdriver.Safari()
store_pages = []
#10306 is total number of pages.
for i in range(10306):
    Starting_url = 'site'
    driver.get(Starting_url)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    print(store_pages.append(i))
    timeout = 20
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_lblDisclaimerMsg']")))
    except TimeoutException:
        print("Timed out waiting for page to load")
        driver.quit()
    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    timeout = 20
    wait = WebDriverWait(driver, 10).until(EC.text_to_be_present_in_element_value((By.CSS_SELECTOR, '#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a > div.act_search_results > div.act_search_header'), "206113 Record(s) | Page [2 of 10306]"))
    NGO_element = driver.find_element_by_class_name("faq-sub-content exempted-result")
    NGO_name = NGO_element.find_elements_by_tag_name("h1")
    NGO_name_pancard = driver.find_elements_by_class_name("pan-id")
    NGO_data = NGO_element.find_elements_by_tag_name("ul")
    NGO_sub_data = NGO_element.find_elements_by_tag_name("li")
    for i, p, t in zip(NGO_name, NGO_name_pancard, NGO_data):
        n_name = i.text.replace(p.text, '')
        n_data = t.text
        n_pan = p.text
        print("Name of NGO:", n_name, "Fields of NGO:", n_data, "Pancard number:", n_pan)
    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    # timeout = 2
You need to make sure that when you reach the next page, the content of the earlier page has become stale; otherwise you will get a stale element error or scrape the same thing repeatedly. Try the approach below; it should get you there. The rest you can modify yourself.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("http://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")
while True:
    for elem in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^='arrowex']"))):
        print(elem.text)
    try:
        wait.until(EC.presence_of_element_located((By.ID, "ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_imgbtnNext"))).click()
        wait.until(EC.staleness_of(elem))
    except:
        break
driver.quit()
