How to scrap each product page (comments and custumer country) - python

I am trying to scrape each product page from aliexpress website in order to get number of comments, number of photos published by the custumer and also the custumer country and put it to a dataframe.
I have written a code that scrape custumer country but I don't know how to get the number of custumer comments and the number of images.
This is my code :
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
wait = WebDriverWait(driver, 10)
driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))
data=[]
while True:
for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
try:
country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
except:
country = None
data.append({
'country':country,
})
try:
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
except:
break
pd.DataFrame(data).to_csv('filename.csv',index=False)
I would appreciate any help from you! Thank you !

If you want numbers of comments / reviews, you can just check the value in this section :
driver.find_element(By.XPATH, 'XPATH_OF_ELEMENT_TO_SCRAP')
To do so in your exemple lets do this outside your loop :
number_feedbacks = driver.find_element(By.XPATH, '//*[#id="transction-feedback"]/div[1]')
number_images = driver.find_element(By.XPATH, '//*[#id="transction-feedback"]//label[1]/em')
If you dont understand or know this function, please feal free to ask and I will explain where I found theses XPATH.We also can use find by id function.
In your code it would be :
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
wait = WebDriverWait(driver, 10)
driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))
data=[]
number_feedbacks = driver.find_element(By.XPATH, '//*[#id="transction-feedback"]/div[1]')
number_images = driver.find_element(By.XPATH, '//*[#id="transction-feedback"]//label[1]/em')
print(f'number_feedbacks = {number_feedbacks}\nnumber_images = {number_images}')
while True:
for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
try:
country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
except:
country = None
data.append({
'country':country,
})
try:
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
except:
break
pd.DataFrame(data).to_csv('filename.csv',index=False)

Related

How to get Instagram Number of Post, Number of Followers, and Number of Following using Selenium?

Firstly I'm sorry for my poor Englih. I'm kinda new to Python. So, I would like to know on how to scrape instagram number of post, number of followers, and number of following for certain account (I try to loop at it) and store the data in CSV files.
I've been trying to figure it out the XPATH, but I thought that my XPATH already correct, so what did I miss??
Here are my code:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome import service
from selenium.webdriver.common.keys import Keys
import time
import wget
import os
import pandas as pd
import matplotlib.pyplot as plt
from selenium.webdriver.chrome.service import Service
urls = [
'https://www.instagram.com/acc_1/',
'https://www.instagram.com/acc_2/',
'https://www.instagram.com/acc_3/',
'https://www.instagram.com/acc_4/',
'https://www.instagram.com/acc_5/',
'https://www.instagram.com/acc_6/',
'https://www.instagram.com/acc_7/',
'https://www.instagram.com/acc_8/',
'https://www.instagram.com/acc_9/',
'https://www.instagram.com/acc_10/',
'https://www.instagram.com/acc_11/',
'https://www.instagram.com/acc_12/',
'https://www.instagram.com/acc_13/',
'https://www.instagram.com/acc_14/'
]
username_channel = []
number_of_post_chan = []
followers_chan = []
followings_chan = []
description_chan = []
#langsung buka
#collecting_data
for url in urls:
PATH = 'C:\webdrivers\chromedriver.exe.'
driver = webdriver.Chrome(PATH)
driver.get(url)
#driver.maximize_window()
driver.implicitly_wait(10)
#log-in
login = driver.find_element(By.XPATH, "//input[#name='username']")
login.clear()
login.send_keys('xxxxx')
driver.implicitly_wait(5)
login_pass = driver.find_element(By.XPATH, "//input[#name='password']")
login_pass.clear()
login_pass.send_keys('xxxxx')
driver.implicitly_wait(5)
button_login = driver.find_element(By.XPATH, "//form[#id='loginForm']/div/div[3]/button/div")
button_login.click()
time.sleep(3)
#Save Your Login info?
login_info = driver.find_element(By.XPATH, "//div[#class='cmbtv']/button")
login_info.click()
time.sleep(10)
driver.implicitly_wait(5)
usernameChan = driver.find_element(By.XPATH, "//h2[#class='_aacl _aacs _aact _aacx _aada']").text
numb_of_post = driver.find_element(By.CSS_SELECTOR, "//ul[#class=' _aa_8']/li[1]/div/span").text
followers = driver.find_element(By.XPATH, "//ul[#class=' _aa_8']/li[2]/a/div/span").get_attribute('title')
followings = driver.find_element(By.XPATH, "//ul[#class=' _aa_8']/li[3]/a/div/span").text
description = driver.find_element(By.XPATH, "//div[#class='_aa_c']/div").text
#username_channel.append(usernameChan)
#number_of_post_chan.append(numb_of_post)
#followers_chan.append(followers)
#followings_chan.append(followings)
#description_chan.append(description)
print(username_channel, number_of_post_chan, followers_chan, followings_chan, description_chan)
account_items = {
"username_ig" : username_channel,
"jumlah_posting" : number_of_post_chan,
"followers" : followers_chan,
"followings" : followings_chan,
"deskripsi" : description_chan
}
driver.quit()
df = pd.DataFrame(account_items, columns=["username_ig", "jumlah_posting", "followers", "followings", "deskripsi"])
print(df)
Is there any way better to express the element? Heeelp.
Thank you in advance.
To get the username, number of posts, followers, following and description you can select the element using CSS_SELECTOR.
In your code after the third driver.implicitly_wait(5) statement, instead of the next 5lines you can add the following.
usernameChan = driver.find_element(By.CSS_SELECTOR,"h2._aacl._aacs._aact._aacx._aada").text
details = driver.find_elements(By.CSS_SELECTOR, "span._ac2a._ac2b")
numb_of_post = details[0].text
followers = details[1].text
followings = details[2].text
description = driver.find_element(By.CSS_SELECTOR, "div._aacl._aaco._aacu._aacx._aad6._aade").text
EDIT : As you said, you got error while fetching details above IndexError: list index out of range. This probably is because the element might not have loaded until now. With the below imports replace the line where we are fetching details with the details in below code.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
details = WebDriverWait(browser, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "span._ac2a._ac2b")))
The problem there is that the selector depends on whether the window is expanded or not

Python Selenium: extraction of rating given by individual reviewer

I am trying to extract google reviews of a resturant using Python Selenium. I tried to extract the reviews posted by each reviewers. Here is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
driver = webdriver.Chrome('')
base_url = 'https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=AOaemvJFjYToqQmQGGnZUovsXC1CObNK1g:1633336974491&q=10+famous+restaurants+in+Dunedin&rflfq=1&num=10&sa=X&ved=2ahUKEwiTsqaxrrDzAhXe4zgGHZPODcoQjGp6BAgKEGo&biw=1280&bih=557&dpr=2#lrd=0xa82eac0dc8bdbb4b:0x4fc9070ad0f2ac70,1,,,&rlfi=hd:;si:5749134142351780976,l,CiAxMCBmYW1vdXMgcmVzdGF1cmFudHMgaW4gRHVuZWRpbiJDUjEvZ2VvL3R5cGUvZXN0YWJsaXNobWVudF9wb2kvcG9wdWxhcl93aXRoX3RvdXJpc3Rz2gENCgcI5Q8QChgFEgIIFkiDlJ7y7YCAgAhaMhAAEAEQAhgCGAQiIDEwIGZhbW91cyByZXN0YXVyYW50cyBpbiBkdW5lZGluKgQIAxACkgESaXRhbGlhbl9yZXN0YXVyYW50mgEkQ2hkRFNVaE5NRzluUzBWSlEwRm5TVU56ZW5WaFVsOUJSUkFCqgEMEAEqCCIEZm9vZCgA,y,2qOYUvKQ1C8;mv:[[-45.8349553,170.6616387],[-45.9156414,170.4803685]]'
driver.get(base_url)
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//div[./span[text()='Newest']]"))).click()
total_reviews_text =driver.find_element_by_xpath("//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int (total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
time.sleep(2)
total_reviews = len(all_reviews)
while total_reviews < num_reviews:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
#all_reviews = driver.find_elements_by_css_selector('div.gws-localreviews__google-review')
time.sleep(5)
all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
print(total_reviews)
total_reviews +=5
review_info = driver.find_elements_by_xpath("//div[#class='PuaHbe']")
for person in person_infos:
rating = person.find_element_by_xpath("./span").get_attribute('aria-label')
print(rating)
However, the above code produces/print 'none'. I am not sure where I made the mistake. Any help to fix the issue would be appreciated.
You are using a wrong XPath locator.
Instead of
rating = person.find_element_by_xpath("./span").get_attribute('aria-label')
Try using
rating = person.find_element_by_xpath("./g-review-stars/span").get_attribute('aria-label')

Extracting reviews from Google play store app website

import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd
class FindByXpathCss():
# Declaring variables
Reviews = [] # List to store final set of reviews
reviewText = [] # List to store reviews extracted from XPath
reviewFullText = []
# Chromedriver path
driver = webdriver.Chrome(executable_path=r"F:\Chrome-webdriver\chromedriver.exe")
driver.maximize_window()
baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
driver.get(baseUrl)
# driver.execute_script("scrollBy(0,300);")
# Scrolling down
for i in range(20):
driver.find_element_by_xpath('//*[#id="yDmH0d"]').send_keys(Keys.ARROW_DOWN, i)
time.sleep(0.5)
# To click on Show more button
#btnShowMore = driver.find_element_by_xpath('//*[#id="fcxH9b"]/div[4]/c-wiz/div/div[2]''/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span').click()
# Scrolling to top
for j in range(10):
driver.find_element_by_xpath('//*[#id="yDmH0d"]').send_keys(Keys.ARROW_UP, j)
#for i in range(10):
review_btn = driver.find_elements_by_xpath("//button[contains(#class,'')][contains(text(),'Full Review')]")
single_review_btn = driver.find_element_by_xpath("//button[contains(#class,'')][contains(text(),'Full Review')]")
#time.sleep(1)
The div html tag having 2 tags, one is having jsname as 'fbQN7e' which is there for holding the bigger reviews and those reviews will have button called "Full Review". Another one span within the same div html tag is 'bN97Pc' which is there to hold smaller reviews which wont have 'Full review' button at the end of this review. I couldn't get reviews of both types of span. Here I tried to write reviewFullText list directly to dataframe, but getting only element datatype, not text. I don't know why this too happening.
for btn in review_btn:
btn.click()
reviewFullText = driver.find_elements_by_css_selector("span[jsname='fbQN7e']")
#if(single_review_btn.is_enabled()==False):
#reviewText = driver.find_elements_by_css_selector("span[jsname=\"bN97Pc\"]")
##else:
#pass
# Iterating each reviews and appending into list Reviews
for txtreview in reviewText:
reviewFullText.append(txtreview.text)
print(len(reviewFullText))
# Writing the list values into csv file
df = pd.DataFrame(reviewFullText)
#df = pd.DataFrame({'Reviews': 'Reviews'}) #'Sentiment': 'null'})
df.to_csv('Reviews.csv', index=True, encoding='utf-8')
driver.close()
I have modified your solution to retrieve all review from the page.
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
class FindByXpathCss():
driver = webdriver.Chrome(executable_path=r"C:\New folder\chromedriver.exe")
driver.maximize_window()
baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
driver.get(baseUrl)
scrolls = 3
while True:
scrolls -= 1
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
time.sleep(3)
if scrolls < 0:
break
buttonClick = WebDriverWait(driver, 30).until(
EC.visibility_of_all_elements_located((By.XPATH, "//button[contains(#class,'')][contains(text(),'Full Review')]")))
for element in buttonClick:
driver.execute_script("arguments[0].click();", element)
reviewText = WebDriverWait(driver, 30).until(
EC.presence_of_all_elements_located((By.XPATH, "//*[#class='UD7Dzf']")))
for textreview in reviewText:
print textreview.text
reviewText = WebDriverWait(driver, 30).until(
EC.presence_of_all_elements_located((By.XPATH, "//*[#class='UD7Dzf']")))
# reviewText = driver.find_elements_by_xpath("//*[#class='UD7Dzf']")
for textreview in reviewText:
print textreview.text
Output:

Python Selenium wedriver iterate over form

I'm trying to automate a form assertion with a list of data, however I'm struggling on when and how to use "WebDriverWait" or driver implict wait. My list is 1000 strings. When I run a sample of a 100, less than 100 are captured correctly. The code below catches ElementNotSelectableException\StaleElementReferenceException but doesn't address them correctly.
import unittest
from selenium.common.exceptions import ElementNotVisibleException, ElementNotSelectableException, \
ElementNotInteractableException, StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
driver = webdriver.Chrome()
oar_order_id = '#oar-order-id'
oar_zip_code = '#oar_zip'
#order confirmation list
order_id_list = ["WO-34284975","WO-50002804","WO-50004302","WO-34282964","WO-34214963"]
#zip code confirmation list
zip_code_list = ["23508","99338","62036","75074-7520","37763","98034","89406-4361"]
submit_button = 'body.sales-guest-form.page-layout-1column:nth-child(2) div.page-wrapper:nth-child(10) main.page-main:nth-child(4) div.columns:nth-child(4) div.column.main form.form.form-orders-search:nth-child(4) div.actions-toolbar div.primary > button.action.submit.primary'
# Enter orderid & Zip & click enter
found = []
notfound = []
length = len(order_id_list)
driver.implicitly_wait(10)
wait = WebDriverWait(driver, 15, poll_frequency=1,
ignored_exceptions=[ElementNotVisibleException ,ElementNotSelectableException])
driver.get('https:/www.ecklers.com/sales/orderlookup/index/')
# Sample of 10 from list
for i in range(10):
try:
wait
element1 = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, oar_order_id)))
element1.send_keys(order_id_list[i])
element2 = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, oar_zip_code)))
element2.send_keys(zip_code_list[i])
element3 = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, submit_button)))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, submit_button)))
element3.click()
wait
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"body.sales-guest-view.page-layout-1column:nth-child(2) div.page-wrapper:nth-child(10) main.page-main:nth-child(4) div.page-title-wrapper:nth-child(3) h1.page-title > span.base")))
title = driver.title
#driver.implicitly_wait(60)
except(StaleElementReferenceException):
print("StaleElementReferenceException")
except(ElementNotInteractableException):
print("ElementNotInteractableException")
try:
text = order_id_list[i]
assert(title.endswith(text[3:]))
found.append(text)
except(AssertionError):
notfound.append((order_id_list[i]))
driver.get('https:/www.ecklers.com/sales/orderlookup/index/')
wait
print(found,notfound)```
I found an answer that helped me. I used: time.sleep(.500) after driver.get('https:/www.ecklers.com/sales/orderlookup/index/'

Create a loop for web scraping with selenium in python

I want to create a loop so that I can scrape the individual time figures for each horse on all eight races from the at the races website.
Below is an example of the first race (17:15) of eight:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
url = 'http://www.attheraces.com/racecard/Wolverhampton/6-October-2018/1715'
driver = webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(2)
driver.find_element_by_xpath('//*[#id="racecard-tabs 1061960"]/div[1]/div/div[1]/ul/li[2]/a').click()
WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[#id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
The next race (17:45) would have the following url:
url = 'http://www.attheraces.com/racecard/Wolverhampton/6-October-2018/1745'
And the id in the following code keeps changing with the url
driver.find_element_by_xpath('//*[#id="racecard-tabs 1061961"]/div[1]/div/div[1]/ul/li[2]/a').click()
So for the 17:15, the racecard-tabs becomes 1061960
for the 17:45, the racecard-tabs becomes 1061961
the 18:15, raecard-tabs becomes 1061963, and so on.
Any help or advice is greatly appreciated.
This will work. You can even change the date and the rest will be automatically be executed for you!
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
def races(main_url):
driver = webdriver.Chrome()
driver.get(main_url)
driver.implicitly_wait(2)
races = driver.find_elements_by_class_name('time-location')
races = [race.text[:5] for race in races]
races = [race.replace(':', '') for race in races]
driver.close()
return races
def scrape(url):
driver = webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(2)
driver.find_elements_by_class_name('racecard-ajax-link')[1].click()
WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[#id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
for horse in driver.find_elements_by_class_name('card-item'):
horseName = horse.find_element_by_class_name('form-link').text
times = horse.find_elements_by_class_name('sectionals-time')
times = [time.text for time in times]
print('{}: {}'.format(horseName, times))
print()
driver.close()
def main():
date = '6-October-2018'
main_url = 'http://www.attheraces.com/racecard/Wolverhampton/' + date
for race in races(main_url):
url = main_url + '/' + race
print(url)
scrape(url)
if __name__ == '__main__':
main()

Categories