Python Selenium: extraction of rating given by individual reviewer

Python Selenium: extraction of rating given by individual reviewer - python

I am trying to extract the Google reviews of a restaurant using Python Selenium. I tried to extract the reviews posted by each reviewer. Here is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
driver = webdriver.Chrome('')
base_url = 'https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=AOaemvJFjYToqQmQGGnZUovsXC1CObNK1g:1633336974491&q=10+famous+restaurants+in+Dunedin&rflfq=1&num=10&sa=X&ved=2ahUKEwiTsqaxrrDzAhXe4zgGHZPODcoQjGp6BAgKEGo&biw=1280&bih=557&dpr=2#lrd=0xa82eac0dc8bdbb4b:0x4fc9070ad0f2ac70,1,,,&rlfi=hd:;si:5749134142351780976,l,CiAxMCBmYW1vdXMgcmVzdGF1cmFudHMgaW4gRHVuZWRpbiJDUjEvZ2VvL3R5cGUvZXN0YWJsaXNobWVudF9wb2kvcG9wdWxhcl93aXRoX3RvdXJpc3Rz2gENCgcI5Q8QChgFEgIIFkiDlJ7y7YCAgAhaMhAAEAEQAhgCGAQiIDEwIGZhbW91cyByZXN0YXVyYW50cyBpbiBkdW5lZGluKgQIAxACkgESaXRhbGlhbl9yZXN0YXVyYW50mgEkQ2hkRFNVaE5NRzluUzBWSlEwRm5TVU56ZW5WaFVsOUJSUkFCqgEMEAEqCCIEZm9vZCgA,y,2qOYUvKQ1C8;mv:[[-45.8349553,170.6616387],[-45.9156414,170.4803685]]'
# Load the review panel and click the "Newest" sort option.
driver.get(base_url)
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))
).click()

# The first whitespace-separated token of the header text is parsed as the
# total number of reviews.
total_reviews_text = driver.find_element_by_xpath(
    "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']"
).text
num_reviews = int(total_reviews_text.split()[0])

all_reviews = WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, 'div.gws-localreviews__google-review')
    )
)
time.sleep(2)
total_reviews = len(all_reviews)

# Keep scrolling the last loaded review into view until the running counter
# reaches the advertised total.
while total_reviews < num_reviews:
    driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
    # Wait for the loading indicator to disappear before re-reading the list.
    WebDriverWait(driver, 5, 0.25).until_not(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div[class$="activityIndicator"]')
        )
    )
    time.sleep(5)
    all_reviews = WebDriverWait(driver, 5).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'div.gws-localreviews__google-review')
        )
    )
    print(total_reviews)
    total_reviews += 5

# One container per reviewer; the loop must iterate the same name that the
# lookup result is bound to.
person_infos = driver.find_elements_by_xpath("//div[@class='PuaHbe']")
for person in person_infos:
    # "./span" is the locator this question is asking about.
    rating = person.find_element_by_xpath("./span").get_attribute('aria-label')
    print(rating)
However, the above code prints 'None' for every rating. I am not sure where I made the mistake. Any help fixing the issue would be appreciated.

You are using a wrong XPath locator.
Instead of
rating = person.find_element_by_xpath("./span").get_attribute('aria-label')
Try using
rating = person.find_element_by_xpath("./g-review-stars/span").get_attribute('aria-label')

Related

How to find 'Text' after node?

driver = webdriver.Chrome()
# driver.get() takes a single URL string, not a list.
URL = 'https://makemyhomevn.com/collections/ghe-an-cafe/products/ghe-go-tron'
driver.get(URL)
sleep(1)
# Attribute tests use "@", and the axis name is "following-sibling".
# The text() lookup itself is the part this question is asking about.
des = driver.find_element_by_xpath(
    '//div[@class="product-item-description"]//strong/following-sibling::text()[1]'
)
print(des)
I expect the result to be 'Gỗ tự nhiên'. I have tried many ways but couldn't get the text that follows 'Chất liệu:'.

You can take the entire span text using .get_attribute('innerText') and then use the split function from Python like below:
driver.maximize_window()
wait = WebDriverWait(driver, 20)
driver.get("https://makemyhomevn.com/collections/ghe-an-cafe/products/ghe-go-tron")
time.sleep(1)
# Locate the parent of the <strong> label so its innerText holds both the
# label and the value that follows it.
entire_span = wait.until(
    EC.visibility_of_element_located((By.XPATH, "//strong[text()='Chất liệu:']/.."))
)
# Split on the first colon only, so a value that itself contains ':' is not
# truncated, and strip the whitespace that follows the label.
entire_span_splitted = entire_span.get_attribute('innerText').split(":", 1)
print(entire_span_splitted[1].strip())
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Output:
Gỗ tự nhiên.

How to scrap each product page (comments and custumer country)

I am trying to scrape each product page from the AliExpress website in order to get the number of comments, the number of photos published by customers, and also the customer country, and put them into a dataframe.
I have written code that scrapes the customer country, but I don't know how to get the number of customer comments and the number of images.
This is my code :
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
wait = WebDriverWait(driver, 10)

# Scroll the tab area into view, then navigate to the URL held in the
# #product-evaluation element's "src" attribute.
driver.execute_script(
    "arguments[0].scrollIntoView();",
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))),
)
driver.get(
    wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))
    ).get_attribute('src')
)

data = []
while True:
    # Collect the country of every feedback item on the current page.
    for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
        try:
            country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
        except Exception:
            # Best effort: keep the row even when no country is shown.
            country = None
        data.append({
            'country': country,
        })
    # Go to the next page; stop when the "next" link cannot be clicked.
    try:
        wait.until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next')
            )
        ).click()
    except Exception:
        break

pd.DataFrame(data).to_csv('filename.csv', index=False)
I would appreciate any help from you! Thank you !

If you want numbers of comments / reviews, you can just check the value in this section :
driver.find_element(By.XPATH, 'XPATH_OF_ELEMENT_TO_SCRAP')
To do so in your example, do this outside your loop:
# XPath attribute tests use "@"; read .text to get the displayed value
# instead of the WebElement object.
number_feedbacks = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]/div[1]').text
number_images = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]//label[1]/em').text
If you don't understand or don't know this function, please feel free to ask and I will explain where I found these XPaths. We can also use the find-by-id function.
In your code it would be :
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
wait = WebDriverWait(driver, 10)

# Scroll the tab area into view, then navigate to the URL held in the
# #product-evaluation element's "src" attribute.
driver.execute_script(
    "arguments[0].scrollIntoView();",
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))),
)
driver.get(
    wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))
    ).get_attribute('src')
)

data = []
# Read the summary values once, before paging. Use .text so the printed
# values are the displayed strings rather than WebElement representations.
number_feedbacks = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]/div[1]').text
number_images = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]//label[1]/em').text
print(f'number_feedbacks = {number_feedbacks}\nnumber_images = {number_images}')

while True:
    # Collect the country of every feedback item on the current page.
    for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
        try:
            country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
        except Exception:
            # Best effort: keep the row even when no country is shown.
            country = None
        data.append({
            'country': country,
        })
    # Go to the next page; stop when the "next" link cannot be clicked.
    try:
        wait.until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next')
            )
        ).click()
    except Exception:
        break

pd.DataFrame(data).to_csv('filename.csv', index=False)

Reading weblink from dataframe throws "stale element reference: element is not attached to the page document" error

I got a dataframe that contains links to google reviews of two restaurants. I wanted to load all reviews of two restaurants (one by one) into the browser and then save them into a new data frame. I wrote a script that reads and load all reviews into the browser as follow:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
link_df = Link
0 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
1 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
import glob  # used below to check whether today's CSV already exists

driver = webdriver.Chrome()
for index, i in link_df.iterrows():
    base_url = i['Link']
    driver.get(base_url)
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))
    ).click()
    print('Restaurant number is ', index)

    # Restaurant-level details shown in the review panel header.
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath(
        "//div[@class='review-score-container']//span[@class='Aq14fc']"
    ).text
    total_reviews_text = driver.find_element_by_xpath(
        "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']"
    ).text
    # The first whitespace-separated token is parsed as the review count.
    num_reviews = int(total_reviews_text.split()[0])

    all_reviews = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'div.gws-localreviews__google-review')
        )
    )
    time.sleep(2)
    total_reviews = len(all_reviews)
    while total_reviews < num_reviews:
        # This is the statement reported to raise StaleElementReferenceException.
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        WebDriverWait(driver, 5, 0.25).until_not(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[class$="activityIndicator"]')
            )
        )
        time.sleep(5)
        all_reviews = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'div.gws-localreviews__google-review')
            )
        )
        print(total_reviews)
        total_reviews += 1

    reviews_info = driver.find_elements_by_xpath("//div[@class='jxjCjc']")
    review_information = pd.DataFrame(
        columns=["Restaurant title", "Restaurant rating", "Total reviews",
                 "Reviewer Name", "Rating", "Review"]
    )
    for review_info in reviews_info:
        name = review_info.find_element_by_xpath("./div/div/a").text
        rating = review_info.find_element_by_xpath(
            ".//div[@class='PuaHbe']//g-review-stars//span"
        ).get_attribute('aria-label')
        text = review_info.find_element_by_xpath(".//div[@class='Jtu6Td']//span").text
        # .loc appends a whole row; .at only addresses a single cell.
        review_information.loc[len(review_information)] = [
            title, overall_rating, num_reviews, name, rating, text
        ]

    # Append to today's file if it exists, otherwise create it with a header.
    filename = 'Google_reviews' + ' ' + pd.to_datetime("now").strftime("%Y_%m_%d") + '.csv'
    files_present = glob.glob(filename)
    if files_present:
        review_information.to_csv(filename, index=False, mode='a', header=False)
    else:
        review_information.to_csv(filename, index=False)

    # Navigate to a neutral page before loading the next restaurant link.
    driver.get('https://www.google.com')
    time.sleep(3)
The problem is that script throws an error when it reaches the following line.
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
It throws following error:
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=95.0.4638.69)
When I tried the same program without storing google links in dataframe (i.e. no for loop and instead of base_url = i['Link'], I wrote base_url = google review link) it works fine.
I am not sure where I am making the mistake. Any suggestion or help to fix the issue would be highly appreciated?

EDIT
Put the creation of the driver outside the for loop.
You can't load the next URL with GPS data while the first review popup is still in front: if you do, the new page stays in the background. The easiest fix is to load a URL without GPS data (https://www.google.com) and wait 3 seconds before continuing your loop.
Your count is not right: I have changed your selector, changed how the total is computed, and commented out some lines.
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
import time
link_df = ["https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
binary = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
options = Options()
options.binary = binary
driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
# The author had to load one URL first (driver.get(link_df[0])) and accept the
# cookie prompt manually, pausing at a breakpoint; you may not need this.
print ("Headless Firefox Initialized")
print(link_df)
try:
    for url in link_df:
        base_url = url
        print(base_url)
        driver.get(base_url)
        # Sort-button label as shown in the author's French-language browser;
        # the English UI shows 'Newest' here.
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.XPATH, "//div[./span[text()='Avis les plus récents']]")
            )
        ).click()
        title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
        address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
        overall_rating = driver.find_element_by_xpath(
            "//div[@class='review-score-container']//span[@class='Aq14fc']"
        ).text
        total_reviews_text = driver.find_element_by_xpath(
            "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']"
        ).text
        num_reviews = int(total_reviews_text.split()[0])
        all_reviews = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')
            )
        )
        # Count what is actually loaded instead of incrementing a guess.
        total_reviews = 0
        while total_reviews < num_reviews:
            driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
            WebDriverWait(driver, 5, 0.25).until_not(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div[class$="activityIndicator"]')
                )
            )
            all_reviews = WebDriverWait(driver, 5).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')
                )
            )
            total_reviews = len(all_reviews)
            print(total_reviews, len(all_reviews))
        # Load a neutral page so the next restaurant link opens in front.
        driver.get('https://www.google.com')
        time.sleep(3)
finally:
    # Always release the browser, even if a lookup above fails.
    driver.quit()
it seems the solution for chrome needs some fixes:
org.openqa.selenium.StaleElementReferenceException: stale element reference: element is not attached to the page document
Literally, this means the referenced element is out of date: it is no longer attached to the current page. Usually this happens because the page has been refreshed or navigated away from. The solution is to call findElement or findElements again to re-locate the element.
So it seems that Chrome has a refresh problem. I suggest re-fetching the list of reviews before scrolling, to get a fresh copy of the DOM items, and I had to add a 1-second wait at the end of the while loop.
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
#from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
import time
link_df = [
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
binarychrome = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
options = Options()
options.binary_location = binarychrome
driver = webdriver.Chrome(options=options, executable_path="E:\\Téléchargement\\chromedriver.exe")
# As with Firefox, the author had to load one URL first
# (driver.get(link_df[0])) and accept the cookie prompt manually.
print(link_df)
try:
    for url in link_df:
        base_url = url
        print(base_url)
        driver.get(base_url)
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))
        ).click()
        title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
        address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
        overall_rating = driver.find_element_by_xpath(
            "//div[@class='review-score-container']//span[@class='Aq14fc']"
        ).text
        total_reviews_text = driver.find_element_by_xpath(
            "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']"
        ).text
        num_reviews = int(total_reviews_text.split()[0])
        all_reviews = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')
            )
        )
        total_reviews = 0
        while total_reviews < num_reviews:
            # Re-locate the reviews before every scroll so the element handed
            # to scrollIntoView is never a stale reference.
            all_reviews = WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')
                )
            )
            driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
            total_reviews = len(all_reviews)
            print(total_reviews, len(all_reviews))
            # Give the page a moment to load the next batch.
            time.sleep(1)
        # Load a neutral page so the next restaurant link opens in front.
        driver.get('https://www.google.com')
        time.sleep(3)
finally:
    # Always release the browser, even if a lookup above fails.
    driver.quit()

how to get product price from different size selenium python

I want to scrape product information from this page. The product comes in three different sizes, and the price changes when I select a different size from the drop-down. Right now my scraper can only scrape the default price shown when the page first loads, which is 35 for 1kg. How can I scrape the prices for 500g and 250g? Here is my code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Start Chrome in incognito mode.
option = Options()
option.add_argument("--incognito")
browser = webdriver.Chrome(options=option)
try:
    browser.get("https://boutique.cafebarista.ca/products/cremone?variant=18033418797121")
    # XPath attribute tests use "@".
    product_title = browser.find_element_by_xpath('//h1[@class="product-name"]')
    long_description = browser.find_element_by_xpath('//div[@class="product-landing-container"]')
    # Only the price shown when the page first loads is read here.
    price = browser.find_element_by_xpath('//div[@class="product-btn-price ProductPrice"]')
    print(product_title.text, long_description.text, price.text)
finally:
    # Close the browser even if one of the lookups fails.
    browser.quit()

With .find_elements_by_css_selector you can get each text without clicking the weight drop down first, this is the selector I mean:
nav[id="w-dropdown-list-16"] > a > div
And you can also click on each of these elements using .execute_script
Try following code:
driver.get('https://boutique.cafebarista.ca/products/autentico?variant=18033459331137')
# Every size entry in the weight drop-down, located without opening it.
weight_list = driver.find_elements_by_css_selector('nav[id="w-dropdown-list-16"] > a > div')
for size_option in weight_list:
    # Click through JavaScript so the option need not be visible.
    driver.execute_script('arguments[0].click();', size_option)
    current_price = driver.find_element_by_id('ProductPrice').text
    size_label = size_option.get_attribute('innerHTML')
    print(f"{size_label} {current_price}")

Try below solution
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
browser = webdriver.Chrome(executable_path=r"C:\New folder\chromedriver.exe")
try:
    browser.maximize_window()
    wait = WebDriverWait(browser, 20)
    browser.get("https://boutique.cafebarista.ca/products/cremone?variant=18033418797121")
    # Same title locator as in the question; it is printed for every size.
    product_title = browser.find_element_by_xpath("//h1[@class='product-name']")
    kg_button = browser.find_element_by_xpath("//div[@id='w-dropdown-toggle-16']")
    # Open the drop-down once to collect its entries, then close it again.
    kg_button.click()
    size_links = wait.until(
        EC.presence_of_all_elements_located(
            (By.XPATH, "//nav[@id='w-dropdown-list-16']//a")
        )
    )
    kg_button.click()
    for element in size_links:
        # Re-open the drop-down and pick this size with a real mouse click.
        kg_button.click()
        actionChains = ActionChains(browser)
        actionChains.move_to_element(element).click().perform()
        price = browser.find_element_by_xpath("//div[@id='ProductPrice']")
        print(product_title.text)
        print(element.text)
        print(price.text)
finally:
    # Close the browser even if a step above fails.
    browser.quit()

How do I extract a list of URLs off a page with selenium?

I am attempting to extract all urls that https://shop.freedommobile.ca/devices has when you click the 'see options' button under each phone and place them into a list of strings.
I am using python with Selenium and wait libraries.
I've already tried using .text in my parameters. However, I keep running into an error that states:
TypeError: 'str' object is not callable
line 17 is the issue.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
class phoneCost:
# Loads the device listing page and tries to collect and print the phone links.
driver.get("https://shop.freedommobile.ca/devices")
# extract the names of the phones
wait = WebDriverWait(driver, 20) # wait up to 20 seconds
XPathLocation = """B//*[#id="skip-navigation"]/div/div/div[1]/div/div[2]/a'"""
# By.XPATH is called like a function here; this is the statement that reports
# "'str' object is not callable". The locator is normally passed as the
# tuple (By.XPATH, XPathLocation).
phonePlanLinksRaw = wait.until(EC.presence_of_all_elements_located(By.XPATH(XPathLocation)))
phonePlanLinks = []
# NOTE(review): `element` is the integer index from range(), so this stores
# index strings rather than the elements' href values.
for element in range(len(phonePlanLinksRaw)):
link = element
phonePlanLinks.append(str(link))
# Print each collected entry with a 1-based counter.
numLink = 1
for element in range(len(phonePlanLinks)):
print("phone " + str(numLink) + " : " + phonePlanLinks[element])
numLink += 1
should return a list of urls in string format:
[https://shop.freedommobile.ca/devices/Apple/iPhone_XS_Max?sku=190198786135&planSku=Freedom%20Big%20Gig%2015GB
,
https://shop.freedommobile.ca/devices/Apple/iPhone_XS?sku=190198790569&planSku=Freedom%20Big%20Gig%
,
https://shop.freedommobile.ca/devices/Apple/iPhone_XR?sku=190198776631&planSku=Freedom%20Big%20Gig%2015GB]
Any help is appreciated
Thank you

Here is the logic that you should use.
# Wait for the device links and reuse the elements the wait returns rather
# than locating them a second time. XPath attribute tests use "@".
mblOptions = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, "//div[starts-with(@class,'deviceListItem')]/a")
    )
)
mblUrls = [mblOption.get_attribute('href') for mblOption in mblOptions]
print(mblUrls)
output:
['https://shop.freedommobile.ca/devices/Apple/iPhone_XS_Max?sku=190198786135&planSku=Freedom%20Big%20Gig%2015GB', 'https://shop.freedommobile.ca/devices/Apple/iPhone_XS?sku=190198790569&planSku=Freedom%20Big%20Gig%2015GB', 'https://shop.freedommobile.ca/devices/Apple/iPhone_XR?sku=190198776631&planSku=Freedom%20Big%20Gig%2015GB', 'https://shop.freedommobile.ca/devices/Apple/iPhone_8_Plus?sku=190198454249&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Apple/iPhone_8?sku=190198450944&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Samsung/Galaxy_S10+?sku=887276301570&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Samsung/Galaxy_S10?sku=887276312163&planSku=Freedom%20Big%20Gig%20%2B%20Talk%2015GB', 'https://shop.freedommobile.ca/devices/Samsung/Galaxy_S10e?sku=887276313870&planSku=Freedom%20Big%20Gig%2015GB', 'https://shop.freedommobile.ca/devices/Samsung/Galaxy_Tab_A_8_LTE?sku=887276299440&planSku=Promo%20Tablet%2015', 'https://shop.freedommobile.ca/devices/Samsung/Galaxy_Note9?sku=887276279916&planSku=Freedom%20Big%20Gig%2015GB', 'https://shop.freedommobile.ca/devices/Samsung/Galaxy_S9?sku=887276250861&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Motorola/G7_Power?sku=723755134249&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Motorola/Moto_E5_Play?sku=723755125940&planSku=Freedom%20LTE%2B3G%209.5GB%20Promo', 'https://shop.freedommobile.ca/devices/Google/Pixel_3a?sku=842776111326&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Google/Pixel_3?sku=842776109798&planSku=Freedom%20Big%20Gig%20%2B%20Talk%2010GB', 'https://shop.freedommobile.ca/devices/Google/Pixel_3_XL?sku=842776109828&planSku=Freedom%20Big%20Gig%20%2B%20Talk%2010GB', 'https://shop.freedommobile.ca/devices/ZTE/Z557?sku=885913107448&planSku=Freedom%20500MB', 
'https://shop.freedommobile.ca/devices/LG/G7_ThinQ?sku=652810830737&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Huawei/P30_lite?sku=886598061131&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Huawei/Mate_20_Pro?sku=886598058964&planSku=Freedom%20Big%20Gig%20%2B%20Talk%2010GB', 'https://shop.freedommobile.ca/devices/LG/X_Power_3?sku=652810831130&planSku=Freedom%20LTE%2B3G%209.5GB%20Promo', 'https://shop.freedommobile.ca/devices/LG/G8_ThinQ?sku=652810832434&planSku=Freedom%20Big%20Gig%20%2B%20Talk%2010GB', 'https://shop.freedommobile.ca/devices/LG/Q_Stylo_+?sku=652810831222&planSku=Freedom%202GB', 'https://shop.freedommobile.ca/devices/Alcatel/GoFLIP?sku=889063504010&planSku=Freedom%20500MB', 'https://shop.freedommobile.ca/devices/Bring_Your/Own_Device?sku=byod']

Try using a list comprehension to achieve the result. Take a look at the portion (By.XPATH(XPathLocation)) that you used: the locator must be passed as a tuple, as in wait.until(EC.visibility_of_all_elements_located((By.XPATH, "some_xpath"))).
Rectified one is more like:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# The context manager quits the browser when the block exits.
with webdriver.Chrome() as driver:
    wait = WebDriverWait(driver, 10)
    driver.get("https://shop.freedommobile.ca/devices")
    # XPath attribute tests use "@"; collect the href of every visible link.
    item_links = [
        item.get_attribute("href")
        for item in wait.until(
            EC.visibility_of_all_elements_located(
                (By.XPATH, "//a[contains(@class,'__DeviceDetailsButton')]")
            )
        )
    ]
    print(item_links)

To extract all urls that https://shop.freedommobile.ca/devices has using Selenium you have to induce WebDriverWait for the visibility_of_all_elements_located() and you can use the following Locator Strategy:
Code Block:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
# Pass the options through the `options` keyword, as the other snippets on
# this page do, rather than the legacy `chrome_options` keyword.
driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
try:
    driver.get("https://shop.freedommobile.ca/devices")
    # Wait until every "See Options" link is visible, then print their hrefs.
    see_options_links = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located((By.XPATH, "//a[text()='See Options']"))
    )
    print([my_elem.get_attribute("href") for my_elem in see_options_links])
finally:
    # Close the browser even if the wait times out.
    driver.quit()
Console Output:
['https://shop.freedommobile.ca/devices/Apple/iPhone_XS_Max?sku=190198786135&planSku=Freedom%20Big%20Gig%2015GB', 'https://shop.freedommobile.ca/devices/Apple/iPhone_XS?sku=190198790569&planSku=Freedom%20Big%20Gig%2015GB', 'https://shop.freedommobile.ca/devices/Apple/iPhone_XR?sku=190198776631&planSku=Freedom%20Big%20Gig%2015GB', 'https://shop.freedommobile.ca/devices/Apple/iPhone_8_Plus?sku=190198454249&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Apple/iPhone_8?sku=190198450944&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Samsung/Galaxy_S10+?sku=887276301570&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Samsung/Galaxy_S10?sku=887276312163&planSku=Freedom%20Big%20Gig%20%2B%20Talk%2015GB', 'https://shop.freedommobile.ca/devices/Samsung/Galaxy_S10e?sku=887276313870&planSku=Freedom%20Big%20Gig%2015GB', 'https://shop.freedommobile.ca/devices/Samsung/Galaxy_Tab_A_8_LTE?sku=887276299440&planSku=Promo%20Tablet%2015', 'https://shop.freedommobile.ca/devices/Samsung/Galaxy_Note9?sku=887276279916&planSku=Freedom%20Big%20Gig%2015GB', 'https://shop.freedommobile.ca/devices/Samsung/Galaxy_S9?sku=887276250861&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Motorola/G7_Power?sku=723755134249&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Motorola/Moto_E5_Play?sku=723755125940&planSku=Freedom%20LTE%2B3G%209.5GB%20Promo', 'https://shop.freedommobile.ca/devices/Google/Pixel_3a?sku=842776111326&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Google/Pixel_3?sku=842776109798&planSku=Freedom%20Big%20Gig%20%2B%20Talk%2010GB', 'https://shop.freedommobile.ca/devices/Google/Pixel_3_XL?sku=842776109828&planSku=Freedom%20Big%20Gig%20%2B%20Talk%2010GB', 'https://shop.freedommobile.ca/devices/ZTE/Z557?sku=885913107448&planSku=Freedom%20500MB', 
'https://shop.freedommobile.ca/devices/LG/G7_ThinQ?sku=652810830737&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Huawei/P30_lite?sku=886598061131&planSku=Freedom%20Big%20Gig%20%2B%20Talk%205GB', 'https://shop.freedommobile.ca/devices/Huawei/Mate_20_Pro?sku=886598058964&planSku=Freedom%20Big%20Gig%20%2B%20Talk%2010GB', 'https://shop.freedommobile.ca/devices/LG/X_Power_3?sku=652810831130&planSku=Freedom%20LTE%2B3G%209.5GB%20Promo', 'https://shop.freedommobile.ca/devices/LG/G8_ThinQ?sku=652810832434&planSku=Freedom%20Big%20Gig%20%2B%20Talk%2010GB', 'https://shop.freedommobile.ca/devices/LG/Q_Stylo_+?sku=652810831222&planSku=Freedom%202GB', 'https://shop.freedommobile.ca/devices/Alcatel/GoFLIP?sku=889063504010&planSku=Freedom%20500MB', 'https://shop.freedommobile.ca/devices/Bring_Your/Own_Device?sku=byod']

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python Selenium: extraction of rating given by individual reviewer - python

You are using a wrong XPath locator. Instead of rating = person.find_element_by_xpath("./span").get_attribute('aria-label') Try using rating = person.find_element_by_xpath("./g-review-stars/span").get_attribute('aria-label')

Related

How to find 'Text' after node?

How to scrap each product page (comments and custumer country)

Reading weblink from dataframe throws "stale element reference: element is not attached to the page document" error

how to get product price from different size selenium python

How do I extract a list of URLs off a page with selenium?

Categories

Resources