Scraping all Google Play app reviews using Selenium and Python

I want to scrape all reviews from the Google Play Store for a particular app. I have prepared the following script:
# App Reviews Scraper
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

url = "https://play.google.com/store/apps/details?id=com.android.chrome&hl=en&showAllReviews=true"

# make request
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(url)

SCROLL_PAUSE_TIME = 5

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
time.sleep(SCROLL_PAUSE_TIME)

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Get everything inside <html> tag including JavaScript
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, 'html.parser')

reviewer = []
date = []

# review text
for span in soup.find_all("span", class_="X43Kjb"):
    reviewer.append(span.text)

# review date
for span in soup.find_all("span", class_="p2TkOb"):
    date.append(span.text)

print(len(reviewer))
print(len(date))
However, it always shows only 203 reviews. The app has 35,474,218 reviews in total, so how can I download all of them?

wait = WebDriverWait(driver, 1)
try:
    wait.until(EC.element_to_be_clickable((By.XPATH, "//span[text()='Show More']"))).click()
except:
    continue
Just add this inside your infinite scroll loop to check for the Show More element.
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
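For context, one way to place that check is inside the question's scroll loop, right where the height stops changing. This is only a sketch; the "Show More" XPath is the one from this answer and may break if Google changes the markup:
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        wait = WebDriverWait(driver, 1)
        try:
            # click "Show More" if it appeared, then keep scrolling
            wait.until(EC.element_to_be_clickable((By.XPATH, "//span[text()='Show More']"))).click()
        except Exception:
            break  # no button left, we have reached the end
    last_height = new_height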

An easier method to scrape app data from the Play Store
!pip install google_play_scraper

from google_play_scraper import app
from google_play_scraper import Sort, reviews_all

# US market Google Play Store reviews
us_reviews = reviews_all(
    'add the app id here',  # use the string mentioned after the id value in your code, i.e. from the Play Store hyperlink that you used above
    sleep_milliseconds=0,   # defaults to 0
    lang='en',              # defaults to 'en', can change to other languages as well
    country='us',           # defaults to 'us'
    sort=Sort.NEWEST,       # defaults to Sort.MOST_RELEVANT
)
Convert into a data frame:
import pandas as pd
import numpy as np

df = pd.DataFrame(np.array(us_reviews), columns=['review'])
df = df.join(pd.DataFrame(df.pop('review').tolist()))
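For the app from the question, a minimal end-to-end sketch might look like this (assumptions: com.android.chrome is the id taken from the question's Play Store URL, and the CSV filename is arbitrary; for an app with tens of millions of reviews this call can run for a very long time):
import pandas as pd
from google_play_scraper import Sort, reviews_all

chrome_reviews = reviews_all(
    'com.android.chrome',  # app id from the question's Play Store URL
    lang='en',
    country='us',
    sort=Sort.NEWEST,
)

df = pd.DataFrame(chrome_reviews)  # each review dict becomes a row
print(len(df))                     # how many reviews were actually returned
df.to_csv('chrome_reviews.csv', index=False)  # example filename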

I think there's no way to extract all reviews due to Google's limit. For example, the com.collectorz.javamobile.android.books app has 2470 reviews, but only 879 are actually shown after scrolling to the very end of the reviews, which is a 64.41% decrease.
Calculation example:
(879 - 2470)/2470 = -64.41% (64.41% decrease)
In the Chrome dev tools after scrolling to the very end of reviews:
$$(".X5PpBb")
[0 … 99]
[100 … 199]
[200 … 299]
[300 … 399]
[400 … 499]
[500 … 599]
[600 … 699]
[700 … 799]
[800 … 878]
length: 879 👈👈👈
In the new UI there's a Show More button that can appear, and execution could stop, get stuck, or throw an error there, resulting in fewer reviews.
To extract all available data, you need to check if the See all reviews button is present. The button may be absent if the app has few or no reviews. If the button is present, then you need to click on it and wait until the data is loaded:
# if "See all reviews" button present
if driver.find_element(By.CSS_SELECTOR, ".Jwxk6d .u4ICaf button"):
# clicking on the button
button = driver.find_element(By.CSS_SELECTOR, ".Jwxk6d .u4ICaf button")
driver.execute_script("arguments[0].click();", button)
# waiting a few sec to load comments
time.sleep(4)
When the data has loaded, you need to scroll the page. You can make a small change to your page scrolling algorithm: if the variables new_height and old_height are equal, the program looks for the Show More button selector. If this button exists, the program clicks it and proceeds to the next iteration:
if new_height == old_height:
    try:
        show_more = driver.find_element(By.XPATH, "//span[text()='Show More']")
        driver.execute_script("arguments[0].click();", show_more)
        time.sleep(1)
    except:
        break
Code and full example in online IDE:
import time, lxml, re, json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
URL = "https://play.google.com/store/apps/details?id=com.collectorz.javamobile.android.books&hl=en"
service = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--lang=en")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=service, options=options)
driver.get(URL)
# if "See all reviews" button present
if driver.find_element(By.CSS_SELECTOR, ".Jwxk6d .u4ICaf button"):
# clicking on the button
button = driver.find_element(By.CSS_SELECTOR, ".Jwxk6d .u4ICaf button")
driver.execute_script("arguments[0].click();", button)
# waiting a few sec to load comments
time.sleep(4)
old_height = driver.execute_script("""
function getHeight() {
return document.querySelector('.fysCi').scrollHeight;
}
return getHeight();
""")
# scrolling
while True:
driver.execute_script("document.querySelector('.fysCi').scrollTo(0, document.querySelector('.fysCi').scrollHeight)")
time.sleep(1)
new_height = driver.execute_script("""
function getHeight() {
return document.querySelector('.fysCi').scrollHeight;
}
return getHeight();
""")
if new_height == old_height:
try:
# if "Show More" button present
show_more = driver.find_element(By.XPATH, "//span[text()='Show More']")
driver.execute_script("arguments[0].click();", show_more)
time.sleep(1)
except:
break
old_height = new_height
# done scrolling
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()

user_comments = []

# extracting comments
for index, comment in enumerate(soup.select(".RHo1pe"), start=1):
    comment_likes = comment.select_one(".AJTPZc")
    user_comments.append({
        "position": index,
        "user_name": comment.select_one(".X5PpBb").text,
        "user_avatar": comment.select_one(".gSGphe img").get("srcset").replace(" 2x", ""),
        "user_comment": comment.select_one(".h3YV2d").text,
        "comment_likes": comment_likes.text.split("people")[0].strip() if comment_likes else None,
        "app_rating": re.search(r"\d+", comment.select_one(".iXRFPc").get("aria-label")).group(),
        "comment_date": comment.select_one(".bp9Aid").text
    })

print(json.dumps(user_comments, indent=2, ensure_ascii=False))
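If the goal is to download the reviews rather than just print them, the list of dicts converts straight into a DataFrame; this is a small optional step, and the filename below is just an example:
import pandas as pd

df = pd.DataFrame(user_comments)
df.to_csv('collectorz_reviews.csv', index=False)  # example filename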
If you want to extract reviews much faster, you can use the Google Play Product Reviews API from SerpApi. It bypasses blocks from search engines, and you don't have to create the parser from scratch and maintain it.
Code example that paginates through all pages and extracts reviews:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import os, json
params = {
    # https://docs.python.org/3/library/os.html#os.getenv
    'api_key': os.getenv('API_KEY'),  # your serpapi api key
    "engine": "google_play_product",  # serpapi parsing engine
    "store": "apps",                  # app results
    "gl": "us",                       # country of the search
    "hl": "en",                       # language of the search
    "product_id": "com.collectorz.javamobile.android.books"  # app id
}

search = GoogleSearch(params)  # where data extraction happens on the backend

reviews = []

while True:
    results = search.get_dict()  # JSON -> Python dict
    for review in results["reviews"]:
        reviews.append({
            "title": review.get("title"),
            "avatar": review.get("avatar"),
            "rating": review.get("rating"),
            "likes": review.get("likes"),
            "date": review.get("date"),
            "snippet": review.get("snippet"),
            "response": review.get("response")
        })
    # pagination
    if "next" in results.get("serpapi_pagination", {}):
        search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination", {}).get("next")).query)))
    else:
        break

print(json.dumps(reviews, indent=2, ensure_ascii=False))
There's a Scrape All Google Play App Reviews in Python blog post which shows in detail how to extract all reviews.
Disclaimer: I work for SerpApi.

Related

Reddit Community List using Python

I am trying to scrape Reddit data using Python. The result I got is only a single subreddit's information, not the complete list.
What I Tried:
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd

driver = webdriver.Chrome(r"C:\Users\MSZ\Reddit-scrap\chromedriver")
url = "https://www.reddit.com/"
driver.get(url)

Communities = []
#content = driver.page_source
time.sleep(15)
driver.find_element("id", "header-search-bar").send_keys("BTC")
time.sleep(5)
driver.find_element("id", "header-search-bar").send_keys(Keys.ENTER)
time.sleep(5)
community = driver.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[1]/div/div[1]/a[3]/button')
community.click()
time.sleep(10)
colist = driver.find_elements('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/h6')
for comunity in colist:
    # getting all the Communities
    Name = comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/h6')
    Members = comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/p/span')
    Description = comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/p')
    # Saving community info
    community_info = [Name.text, Members.text, Description.text]
    Communities.append(community_info)

driver.quit()
communitydf = pd.DataFrame(Communities)
communitydf.columns = ['Community', 'Members', 'Description']
communitydf.to_csv('community_details.csv', index=False)
time.sleep(5)
What I Want:
The above code only fetches the first record, but I want to access all the subreddits returned by the search query. I am new to Python and I think I mixed up the logic.
Any help will be appreciated.
Firstly, you do not wait for all the communities to load; for this you need to scroll the page to the end. Secondly, you are looking for the same xpath, which will always point to one specific element:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

def ger_communities(name: str):
    driver = webdriver.Chrome(r"C:\Users\MSZ\Reddit-scrap\chromedriver")
    url = f"https://www.reddit.com/search/?q={name}&type=sr"
    driver.get(url)
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    communities = []
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for x in soup.find('div', {'data-testid': 'communities-list'}).find_all('a', {'data-testid': 'subreddit-link'}):
        communities.append({
            'Name': x.find('h6').get_text(),
            'Members': x.find('span').get_text(),
            'Description': x.find_all('p')[-1].get_text()
        })
    return communities

df = pd.DataFrame(ger_communities('BTC'))
df.to_csv('community_details.csv', index=False)
But I recommend using the Reddit API.
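For example, with PRAW (the Reddit API wrapper) the same community search becomes a few lines. This is only a sketch: it assumes you have registered a script app at reddit.com/prefs/apps and substitute your own client_id, client_secret, and user_agent.
import praw
import pandas as pd

# read-only client; the credential values here are placeholders
reddit = praw.Reddit(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET",
    user_agent="community-scraper by u/yourname",
)

communities = []
for sub in reddit.subreddits.search("BTC", limit=None):
    communities.append({
        'Name': sub.display_name,
        'Members': sub.subscribers,
        'Description': sub.public_description,
    })

pd.DataFrame(communities).to_csv('community_details.csv', index=False)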

Web Scraping shopee.sg with selenium and BeautifulSoup in python

Whenever I try to scrape shopee.sg using Selenium and BeautifulSoup, I am not able to extract all the data from a single page.
Example: for a search result consisting of 50 products, information on the first 15 gets extracted while the remaining products give null values.
Now, I know this has something to do with the scroller, but I have no idea how to make it work. Any idea how to fix this?
Code as of now
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from time import sleep
import csv

# create object for chrome options
chrome_options = Options()
#base_url = 'https://shopee.sg/search?keyword=disinfectant'
# set chrome driver options to disable any popups from the website
# to find local path for chrome profile, open chrome browser
# and in the address bar type, "chrome://version"
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument('start-maximized')
#chrome_options.add_argument('user-data-dir=C:\\Users\\username\\AppData\\Local\\Google\\Chrome\\User Data\\Default')
# To disable the message, "Chrome is being controlled by automated test software"
chrome_options.add_argument("disable-infobars")
# Pass the argument 1 to allow and 2 to block
chrome_options.add_experimental_option("prefs", {
    "profile.default_content_setting_values.notifications": 2
})

def get_url(search_term):
    """Generate an url from the search term"""
    template = "https://www.shopee.sg/search?keyword={}"
    search_term = search_term.replace(' ', '+')
    # add term query to url
    url = template.format(search_term)
    # add page query placeholder
    url += '&page={}'
    return url

def main(search_term):
    # invoke the webdriver
    driver = webdriver.Chrome(options=chrome_options)
    item_cost = []
    item_name = []
    url = get_url(search_term)
    for page in range(0, 3):
        driver.get(url.format(page))
        delay = 5  # seconds
        try:
            WebDriverWait(driver, delay)
            print("Page is ready")
            sleep(5)
            html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
            #print(html)
            soup = BeautifulSoup(html, "html.parser")
            # find the product description
            for item_n in soup.find_all('div', {'class': 'col-xs-2-4 shopee-search-item-result__item'}):
                try:
                    description_soup = item_n.find('div', {'class': 'yQmmFK _1POlWt _36CEnF'})
                    name = description_soup.text.strip()
                except AttributeError:
                    name = ''
                print(name)
                item_name.append(name)
            # find the price of items
            for item_c in soup.find_all('div', {'class': 'col-xs-2-4 shopee-search-item-result__item'}):
                try:
                    price_soup = item_c.find('div', {'class': 'WTFwws _1lK1eK _5W0f35'})
                    price_final = price_soup.find('span', {'class': '_29R_un'})
                    price = price_final.text.strip()
                except AttributeError:
                    price = ''
                print(price)
                item_cost.append(price)
        except TimeoutException:
            print("Loading took too much time! - Try again")
            sleep(5)
    rows = zip(item_name, item_cost)
    with open('shopee_item_list.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Product Description', 'Price'])
        writer.writerows(rows)
The issue was that the products you were trying to scrape load dynamically as you scroll down the page. There may be more elegant solutions than mine, but I implemented a simple JavaScript scroller using driver.execute_script (additional resource: https://www.geeksforgeeks.org/execute_script-driver-method-selenium-python).
Scroller
It scrolls a tenth of the page's height at a time, pauses for 500 milliseconds, and then continues:
driver.execute_script("""
var scroll = document.body.scrollHeight / 10;
var i = 0;
function scrollit(i) {
window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
i++;
if (i < 10) {
setTimeout(scrollit, 500, i);
}
}
scrollit(i);
""")
Additionally, you had two for loops, for item_n in soup.find_all(...) and for item_c in soup.find_all(...), that were iterating over divs of the same class. I fixed that in my code so that you can get both the price and the name of each item using only one for loop.
You also had try-except statements (in case there was an AttributeError, i.e. if the items you were finding in soup.find_all were NoneType). I simplified those into if statements, like this one:
name = item.find('div', {'class': 'yQmmFK _1POlWt _36CEnF'})
if name is not None:
    name = name.text.strip()
else:
    name = ''
And finally, you were using zip for two different lists (names and prices), to add to a csv file. I combined those individual lists into a nested list in the for loop, instead of appending to two separate lists and zipping at the end. This saves a step, though it is optional and may not be what you need.
Full (updated) code
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import csv
from time import sleep

# create object for chrome options
chrome_options = Options()
# base_url = 'https://shopee.sg/search?keyword=disinfectant'
# set chrome driver options to disable any popups from the website
# to find local path for chrome profile, open chrome browser
# and in the address bar type, "chrome://version"
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument('start-maximized')
# chrome_options.add_argument('user-data-dir=C:\\Users\\username\\AppData\\Local\\Google\\Chrome\\User Data\\Default')
# To disable the message, "Chrome is being controlled by automated test software"
chrome_options.add_argument("disable-infobars")
# Pass the argument 1 to allow and 2 to block
chrome_options.add_experimental_option("prefs", {
    "profile.default_content_setting_values.notifications": 2
})

def get_url(search_term):
    """Generate an url from the search term"""
    template = "https://www.shopee.sg/search?keyword={}"
    search_term = search_term.replace(' ', '+')
    # add term query to url
    url = template.format(search_term)
    # add page query placeholder
    url += '&page={}'
    return url

def main(search_term):
    # invoke the webdriver
    driver = webdriver.Chrome(options=chrome_options)
    rows = []
    url = get_url(search_term)
    for page in range(0, 3):
        driver.get(url.format(page))
        WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))
        driver.execute_script("""
            var scroll = document.body.scrollHeight / 10;
            var i = 0;
            function scrollit(i) {
                window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
                i++;
                if (i < 10) {
                    setTimeout(scrollit, 500, i);
                }
            }
            scrollit(i);
        """)
        sleep(5)
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', {'class': 'col-xs-2-4 shopee-search-item-result__item'}):
            name = item.find('div', {'class': 'yQmmFK _1POlWt _36CEnF'})
            if name is not None:
                name = name.text.strip()
            else:
                name = ''
            price = item.find('div', {'class': 'WTFwws _1lK1eK _5W0f35'})
            if price is not None:
                price = price.find('span', {'class': '_29R_un'}).text.strip()
            else:
                price = ''
            print([name, price])
            rows.append([name, price])
    with open('shopee_item_list.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Product Description', 'Price'])
        writer.writerows(rows)
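One usage note: main() is only defined above, never called, so to actually run the scrape you would invoke it with a search term, e.g. the keyword from the commented-out base_url:
if __name__ == '__main__':
    main('disinfectant')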

Beautiful soup parsing of breakdown list

I need to get data/strings from Yahoo Finance. However, the relevant information is "hidden" under a breakdown list.
As you see, I can access other data, e.g. Total Revenue and Cost of Revenue. The problem occurs when I try to access data hidden under the breakdown list: Current Assets and Inventory (which sit under the Total Assets and Current Assets sections).
Python raises an AttributeError: 'NoneType' object has no attribute 'find_next' error, which I do not find illustrative.
P.S. I found that the problem is these elements by commenting out each line.
import urllib.request as url
from bs4 import BeautifulSoup
company = input('enter companies abbreviation')
income_page = 'https://finance.yahoo.com/quote/' + company + '/financials/'
balance_page = 'https://finance.yahoo.com/quote/' + company + '/balance-sheet/'
set_income_page = url.urlopen(income_page).read()
set_balance_page = url.urlopen(balance_page).read()
soup_income = BeautifulSoup(set_income_page, 'html.parser')
soup_balance = BeautifulSoup(set_balance_page, 'html.parser')
revenue_element = soup_income.find('span', string='Total Revenue').find_next('span').text
cogs_element = soup_income.find('span', string='Cost of Revenue').find_next('span').text
ebit_element = soup_income.find('span', string='Operating Income').find_next('span').text
net_element = soup_income.find('span', string='Pretax Income').find_next('span').text
short_assets_element = soup_balance.find('span', string='Current Assets').find_next('span').text
inventory_element = soup_balance.find('span', string='Inventory').find_next('span').text
Here is an example of parsing this web page using Selenium. It emulates user behavior: waiting till the page is loaded, closing the pop-up, expanding a tree node by clicking it, and extracting some information from it.
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
company = input('enter companies abbreviation: ')
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
wd = webdriver.Chrome('<<PATH_TO_CHROMEDRIVER>>', options=chrome_options)
# delay (how long selenium waits for element to be loaded)
DELAY = 30
# maximize browser window
wd.maximize_window()
# load page via selenium
wd.get('https://finance.yahoo.com/quote/' + company + '/financials/')
# check for popup, close it
try:
    btn = WebDriverWait(wd, DELAY).until(EC.presence_of_element_located((By.XPATH, '//button[text()="I agree"]')))
    wd.execute_script("arguments[0].scrollIntoView();", btn)
    wd.execute_script("arguments[0].click();", btn)
except:
    pass
# wait for page to load
results = WebDriverWait(wd, DELAY).until(EC.presence_of_element_located((By.ID, 'Col1-1-Financials-Proxy')))
# parse content
soup_income = BeautifulSoup(results.get_attribute('innerHTML'), 'html.parser')
# extract values
revenue_element = soup_income.find('span', string='Total Revenue').find_next('span').text
cogs_element = soup_income.find('span', string='Cost of Revenue').find_next('span').text
ebit_element = soup_income.find('span', string='Operating Income').find_next('span').text
net_element = soup_income.find('span', string='Pretax Income').find_next('span').text
# load page via selenium
wd.get('https://finance.yahoo.com/quote/' + company + '/balance-sheet/')
# wait for page to load
results = WebDriverWait(wd, DELAY).until(EC.presence_of_element_located((By.ID, 'Col1-1-Financials-Proxy')))
# expand total assets
btn = WebDriverWait(wd, DELAY).until(EC.element_to_be_clickable((By.XPATH, '//span[text()="Total Assets"]/preceding-sibling::button')))
wd.execute_script("arguments[0].scrollIntoView();", btn)
wd.execute_script("arguments[0].click();", btn)
# expand inventory
btn = WebDriverWait(wd, DELAY).until(EC.element_to_be_clickable((By.XPATH, '//span[text()="Current Assets"]/preceding-sibling::button')))
wd.execute_script("arguments[0].scrollIntoView();", btn)
wd.execute_script("arguments[0].click();", btn)
# parse content
soup_balance = BeautifulSoup(results.get_attribute('innerHTML'), 'html.parser')
# extract values
short_assets_element = soup_balance.find('span', string='Current Assets').find_next('span').text
inventory_element = soup_balance.find('span', string='Inventory').find_next('span').text
# close webdriver
wd.quit()
print(revenue_element)
print(cogs_element)
print(ebit_element)
print(net_element)
print(short_assets_element)
print(inventory_element)
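As a side note on the original AttributeError: it simply means that soup_balance.find(...) returned None, because the collapsed rows (Current Assets, Inventory) are not present in the HTML that urllib downloads, so there is no span to chain .find_next() on. A small helper (hypothetical, not part of the code above) makes that failure explicit instead of raising:
def safe_value(soup, label):
    """Return the value next to a row label, or None if the row isn't in the static HTML."""
    cell = soup.find('span', string=label)
    if cell is None:
        return None  # row is collapsed or absent in the downloaded page
    return cell.find_next('span').text

# returns None instead of raising when the row needs JavaScript to appear
inventory_element = safe_value(soup_balance, 'Inventory')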

Next Page Iteration in Selenium/BeautfulSoup for Scraping E-Commerce Website

I'm scraping an e-commerce website, Lazada, using Selenium and bs4. I managed to scrape the 1st page, but I am unable to iterate to the next page. What I'm trying to achieve is to scrape all the pages of the categories I've selected.
Here is what I've tried:
# Run the argument with incognito
option = webdriver.ChromeOptions()
option.add_argument('--incognito')
driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)
driver.get('https://www.lazada.com.my/')
driver.maximize_window()

# Select category item #
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()

t = 10
try:
    WebDriverWait(driver, t).until(EC.visibility_of_element_located((By.ID, "a2o4k.searchlistcategory.0.i0.460b6883jV3Y0q")))
except TimeoutException:
    print('Page Refresh!')
    driver.refresh()
    element = driver.find_elements_by_class_name('card-categories-li-content')[0]
    webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
    print('Page Load!')

# Soup and select element
def getData(np):
    soup = bs(driver.page_source, "lxml")
    product_containers = soup.findAll("div", class_='c2prKC')
    for p in product_containers:
        title = (p.find(class_='c16H9d').text)  # title
        selling_price = (p.find(class_='c13VH6').text)  # selling price
        try:
            original_price = (p.find("del", class_='c13VH6').text)  # original price
        except:
            original_price = "-1"
        if p.find("i", class_='ic-dynamic-badge ic-dynamic-badge-freeShipping ic-dynamic-group-2'):
            freeShipping = 1
        else:
            freeShipping = 0
        try:
            discount = (p.find("span", class_='c1hkC1').text)
        except:
            discount = "-1"
        if p.find(("div", {'class': ['c16H9d']})):
            url = "https:" + (p.find("a").get("href"))
        else:
            url = "-1"
        nextpage_elements = driver.find_elements_by_class_name('ant-pagination-next')[0]
        np = webdriver.ActionChains(driver).move_to_element(nextpage_elements).click(nextpage_elements).perform()
        print("- -" * 30)
        toSave = [title, selling_price, original_price, freeShipping, discount, url]
        print(toSave)
        writerows(toSave, filename)

getData(np)
The problem might be that the driver is trying to click the button before the element is even loaded correctly.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(PATH, chrome_options=option)

# use this code after driver initialization
# this makes the driver wait 5 seconds for the page to load.
driver.implicitly_wait(5)

url = "https://www.lazada.com.ph/catalog/?q=phone&_keyori=ss&from=input&spm=a2o4l.home.search.go.239e359dTYxZXo"
driver.get(url)

next_page_path = "//ul[@class='ant-pagination ']//li[@class=' ant-pagination-next']"

# the following code will wait 5 seconds for
# the element to become clickable
# and then try clicking the element.
try:
    next_page = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, next_page_path)))
    next_page.click()
except Exception as e:
    print(e)
EDIT 1
Changed the code to make the driver wait for the element to become clickable. You can add this code inside a while loop for iterating multiple times and break the loop if the button is not found and is not clickable.
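A minimal sketch of that loop, assuming a scrape_current_page() helper of your own (hypothetical name) that collects the items on the page before each click:
while True:
    scrape_current_page()  # hypothetical: parse and store the current page's items
    try:
        next_page = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, next_page_path)))
        next_page.click()
    except Exception:
        break  # "next" button missing or not clickable, so no more pages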

Python - Selenium next page

I am trying to make a scraping application for Hants.gov.uk, and right now I am working on just clicking through the pages instead of scraping. When it gets to the last row on page 1 it just stops, so what I did was make it click the "Next Page" button, but first it has to go back to the original URL. It clicks page 2, but after page 2 is scraped it doesn't go to page 3; it just restarts page 2.
Can somebody help me fix this issue?
Code:
import time
import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"

driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)

driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()

def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result = []
    for link in links:
        if link not in result:
            result.append(link)
        else:
            driver.get(link)
            goUrl = urllib.request.urlopen(link)
            soup = BeautifulSoup(goUrl.read(), "html.parser")
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            for i in range(20):
                pass # Don't worry about all this commented code, it isn't relevant right now
                #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
                #print(table.text)
                # div = soup.select("div.applicationDetails")
                # getDiv = div[i].split(":")[1].get_text()
                # log = open("log.txt", "a")
                # log.write(getDiv + "\n")
                #log.write("\n")

start()
driver.get(url)

for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    url = driver.current_url
    start()
    driver.get(url)

driver.close()
try this:
import time
# import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"

driver = webdriver.Chrome()
driver.get(url)

driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()

result = []

def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result.extend(links)

def start2():
    for link in result:
        # if link not in result:
        #     result.append(link)
        # else:
        driver.get(link)
        goUrl = urllib.request.urlopen(link)
        soup = BeautifulSoup(goUrl.read(), "html.parser")
        #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
        for i in range(20):
            pass # Don't worry about all this commented code, it isn't relevant right now
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            #print(table.text)
            # div = soup.select("div.applicationDetails")
            # getDiv = div[i].split(":")[1].get_text()
            # log = open("log.txt", "a")
            # log.write(getDiv + "\n")
            #log.write("\n")

while True:
    start()
    element = driver.find_element_by_class_name('rdpPageNext')
    try:
        check = element.get_attribute('onclick')
        if check != "return false;":
            element.click()
        else:
            break
    except:
        break

print(result)
start2()
driver.get(url)
As per the url https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True, to click through all the pages you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "mainContentPlaceHolder_btnAccept"))).click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#ctl00_mainContentPlaceHolder_lvResults_topPager div.rdpWrap.rdpNumPart>a"))))
print(numLinks)
for i in range(numLinks):
    print("Perform your scraping here on page {}".format(str(i+1)))
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='ctl00_mainContentPlaceHolder_lvResults_topPager']//div[@class='rdpWrap rdpNumPart']//a[@class='rdpCurrentPage']/span//following::span[1]"))).click()
driver.quit()
Console Output:
8
Perform your scraping here on page 1
Perform your scraping here on page 2
Perform your scraping here on page 3
Perform your scraping here on page 4
Perform your scraping here on page 5
Perform your scraping here on page 6
Perform your scraping here on page 7
Perform your scraping here on page 8
Hi @Feitan Portor, you have written the code absolutely perfectly. The only reason you are redirected back to the first page is that you have given url = driver.current_url in the last for loop, where the url remains static and only the JavaScript instigates the next click event. So just remove url = driver.current_url and driver.get(url) and you are good to go; I have tested it myself.
Also, to get the current page that your scraper is on, just add this part in the for loop so you will know where your scraper is:
ss = driver.find_element_by_class_name('rdpCurrentPage').text
print(ss)
Hope this solves your confusion.
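Applied to the question's last loop, the suggestion looks roughly like this (a sketch; the range(5) is kept from the original code):
for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    ss = driver.find_element_by_class_name('rdpCurrentPage').text
    print(ss)  # shows which results page the scraper is on
    start()

driver.close()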
