How to check if further `scroll down` is not possible using Selenium

How to check if further `scroll down` is not possible using Selenium - python

Am using Selenium + python to scrap a page which has infinite scroll (basically scroll till max first 500 results are shown)
Using below code, am able to scroll to bottom of the page. Now i want to stop when further scrolling doesn't fetches any content. (say, page only has 200 results, i don't want to keep on scrolling assuming max 500 result)
driver = webdriver.Firefox()
driver.get(url)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
I tried accessing window.pageYOffset but it's coming as None always.

I'm using Selenium with Chrome, not Firefox, but the following worked for me:
capture page height before scrolling down;
scroll down using key down;
capture page height after scrolling down;
if page height was same before and after scrolling, stop scrolling
My code looks like this:
import time
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
driver = webdriver.Chrome()
driver.get("www.yourTargetURL.com")
reached_page_end = False
last_height = driver.execute_script("return document.body.scrollHeight")
while not reached_page_end:
driver.find_element_by_xpath('//body').send_keys(Keys.END)
time.sleep(2)
new_height = driver.execute_script("return document.body.scrollHeight")
if last_height == new_height:
reached_page_end = True
else:
last_height = new_height
driver.quit()

You can check document.body.scrollTop by before and after each scroll attempt if there is no data to fetch then this value will stay the same
distanceToTop = driver.execute_script("return document.body.scrollTop);")

Just in case, if someone is using playwright. This code snippet is very similar to the ATJ answer.
import time
from playwright.sync_api import sync_playwright
def run(playwright):
page = playwright.chromium.launch(headless=False).new_page()
page.goto("URL")
reached_end = False
last_height = page.evaluate("() => document.body.scrollHeight") # scrollHeight: 5879
while not reached_end:
page.keyboard.press("End")
time.sleep(2)
new_height = page.evaluate("() => document.body.scrollHeight")
if new_height == last_height:
reached_end = True
else:
last_height = new_height
page.close()
with sync_playwright() as playwright:
run(playwright)

We can use a hard count while scrolling and as soon we reach that max count we get out of the loop.
b=0;
boolean x = true;
while (x){
WebElement button = null;
try {
button = driver.findElement(By.xpath("//*[#id='vjs_video_3']/div[7]/div[1]/button[1]"));
x= false;
} catch (Exception ex){
JavascriptExecutor js = (JavascriptExecutor) driver;
js.executeScript("javascript:window.scrollBy(50, 80)");
try {
Thread.sleep(500);
} catch (InterruptedException e) {
e.printStackTrace();
}
js.executeScript("javascript:window.scrollBy(50, 50)");
b++;
System.out.println("\n"+ b);
if(b>50) {
System.out.println("out!");
break;
}
// js.executeScript("javascript:window.scrollBy(50, 180)");
// Thread.sleep(1000);
// js.executeScript("javascript:window.scrollBy(50, 150)"); // button is missing
}
}
}

Related

How to scrape all the comments of a youtube video using selenium, python

I want to scrape all the comments of a YouTube video using selenium but able to scrape only first 20. Don't getting what's wrong with the following code -
imports required
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
initialisation
driver = webdriver.Chrome()
url = 'https://www.youtube.com/watch?v=etzmAZ7oiz0'
driver.get(url)
time.sleep(3)
final_comment_list = []
author_list = []
comment_list = []
while loop for scrolling down the page
last_height = driver.execute_script("return document.body.scrollHeight")
html = driver.find_element(By.TAG_NAME, 'html')
while True:
print("Scroll down to bottom")
# Scroll down to bottom
html.send_keys(Keys.PAGE_DOWN)
# Wait to load the page
time.sleep(5)
# find author name and author comment
try:
authors_list_el = driver.find_elements(By.CSS_SELECTOR,
'#author-text.yt-simple-endpoint.style-scope.ytd-comment-renderer span.style-scope.ytd-comment-renderer')
author_list = [x.text for x in authors_list_el]
except:
print(f"not able to find author for {url} video")
try:
comments = driver.find_elements(By.CSS_SELECTOR, '#content.style-scope.ytd-expander')
comment_list = [x.text for x in comments]
except:
print(f"not able to find comments for {url} video")
# creating dictionary object and adding to list
obj1 = dict(author_list=author_list, comment_list=comment_list)
final_comment_list.append(obj1)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
else:
last_height = new_height
printing the result
print(final_comment_list)
print(len(author_list))

Scraping Google App All Reviews using Selenium and Python

I want to scrape all reviews from google play store for a particular app. I have prepared following script:
# App Reviews Scraper
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
url = "https://play.google.com/store/apps/details?id=com.android.chrome&hl=en&showAllReviews=true"
# make request
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(url)
SCROLL_PAUSE_TIME = 5
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
time.sleep(SCROLL_PAUSE_TIME)
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
# Get everything inside <html> tag including javscript
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, 'html.parser')
reviewer = []
date = []
# review text
for span in soup.find_all("span", class_="X43Kjb"):
reviewer.append(span.text)
# review date
for span in soup.find_all("span", class_="p2TkOb"):
date.append(span.text)
print(len(reviewer))
print(len(date))
However, it always showing 203 only. There are 35,474,218 number of reviews. So, how can I download all the reviews?

wait=WebDriverWait(driver,1)
try:
wait.until(EC.element_to_be_clickable((By.XPATH,"//span[text()='Show More']"))).click()
except:
continue
Just add this to check for the show more element in your infinite scroll.
Import:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

An easier method to scrape app data from play store
!pip install google_play_scraper
from google_play_scraper import app
#US Market Google play store reviews
from google_play_scraper import Sort, reviews_all
us_reviews = reviews_all(
'add the app id here-using the string mentioned after id value in your code', # use the id from the play
store hyperlink that you have used above
sleep_milliseconds=0, # defaults to 0
lang='en', # defaults to 'en, can change to other lang as well'
country='us', # defaults to 'us'
sort=Sort.NEWEST, # defaults to Sort.MOST_RELEVANT
)
Convert into a data frame
df = pd.DataFrame(np.array(us_reviews ),columns=['review'])
df = df.join(pd.DataFrame(df.pop('review').tolist()))

I think there's no way to extract all reviews due to Google's limit. For example, com.collectorz.javamobile.android.books app has 2470 reviews and 879 is actually being shown after scrolling to the very end of reviews, which is 64.41% decrease change.
Calculation example:
(879 - 2470)/2470 = -64.41% (64.41% decrease)
In the Chrome dev tools after scrolling to the very end of reviews:
$$(".X5PpBb")
[0 … 99]
[100 … 199]
[200 … 299]
[300 … 399]
[400 … 499]
[500 … 599]
[600 … 699]
[700 … 799]
[800 … 878]
length: 879 👈👈👈
In the new UI there's a Show More button that appears and execution could stop/stuck or throw an error thus fewer reviews.
To extract all available data, you need to check if the See all reviews button is present. The button may be absent if the app has few or no reviews. If the button is present, then you need to click on it and wait until the data is loaded:
# if "See all reviews" button present
if driver.find_element(By.CSS_SELECTOR, ".Jwxk6d .u4ICaf button"):
# clicking on the button
button = driver.find_element(By.CSS_SELECTOR, ".Jwxk6d .u4ICaf button")
driver.execute_script("arguments[0].click();", button)
# waiting a few sec to load comments
time.sleep(4)
When the data has loaded, you need to scroll the page. You can make a small change to your page scrolling algorithm. If the variables new_height and old_height are equal, then the program looks for the Show More button selector. If this button exists, then the program is clicking on it and proceed to the next step:
if new_height == old_height:
try:
show_more = driver.find_element(By.XPATH, "//span[text()='Show More']")
driver.execute_script("arguments[0].click();", show_more)
time.sleep(1)
except:
break
Code and full example in online IDE:
import time, lxml, re, json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
URL = "https://play.google.com/store/apps/details?id=com.collectorz.javamobile.android.books&hl=en"
service = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--lang=en")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=service, options=options)
driver.get(URL)
# if "See all reviews" button present
if driver.find_element(By.CSS_SELECTOR, ".Jwxk6d .u4ICaf button"):
# clicking on the button
button = driver.find_element(By.CSS_SELECTOR, ".Jwxk6d .u4ICaf button")
driver.execute_script("arguments[0].click();", button)
# waiting a few sec to load comments
time.sleep(4)
old_height = driver.execute_script("""
function getHeight() {
return document.querySelector('.fysCi').scrollHeight;
}
return getHeight();
""")
# scrolling
while True:
driver.execute_script("document.querySelector('.fysCi').scrollTo(0, document.querySelector('.fysCi').scrollHeight)")
time.sleep(1)
new_height = driver.execute_script("""
function getHeight() {
return document.querySelector('.fysCi').scrollHeight;
}
return getHeight();
""")
if new_height == old_height:
try:
# if "Show More" button present
show_more = driver.find_element(By.XPATH, "//span[text()='Show More']")
driver.execute_script("arguments[0].click();", show_more)
time.sleep(1)
except:
break
old_height = new_height
# done scrolling
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()
user_comments = []
# exctracting comments
for index, comment in enumerate(soup.select(".RHo1pe"), start=1):
comment_likes = comment.select_one(".AJTPZc")
user_comments.append({
"position": index,
"user_name": comment.select_one(".X5PpBb").text,
"user_avatar": comment.select_one(".gSGphe img").get("srcset").replace(" 2x", ""),
"user_comment": comment.select_one(".h3YV2d").text,
"comment_likes": comment_likes.text.split("people")[0].strip() if comment_likes else None,
"app_rating": re.search(r"\d+", comment.select_one(".iXRFPc").get("aria-label")).group(),
"comment_date": comment.select_one(".bp9Aid").text
})
print(json.dumps(user_comments, indent=2, ensure_ascii=False))
If you want to extract reviews much faster, you can use Google Play Product Reviews API from SerpApi. It will bypass blocks from search engines and you don't have to create the parser from scratch and maintain it.
Code example that paginates through all pages and extracts reviews:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import os, json
params = {
# https://docs.python.org/3/library/os.html#os.getenv
'api_key': os.getenv('API_KEY'), # your serpapi api
"engine": "google_play_product", # serpapi parsing engine
"store": "apps", # app results
"gl": "us", # country of the search
"hl": "en", # language of the search
"product_id": "com.collectorz.javamobile.android.books" # app id
}
search = GoogleSearch(params) # where data extraction happens on the backend
reviews = []
while True:
results = search.get_dict() # JSON -> Python dict
for review in results["reviews"]:
reviews.append({
"title": review.get("title"),
"avatar": review.get("avatar"),
"rating": review.get("rating"),
"likes": review.get("likes"),
"date": review.get("date"),
"snippet": review.get("snippet"),
"response": review.get("response")
})
# pagination
if "next" in results.get("serpapi_pagination", {}):
search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination", {}).get("next")).query)))
else:
break
print(json.dumps(reviews, indent=2, ensure_ascii=False))
There's a Scrape All Google Play App Reviews in Python blog, which in detail shows how to extract all reviews.
Disclaimer, I work for SerpApi.

scroll to bottom of page before scraping with selenium

There is a web page that I want to run my scraping script on. However, because the page refreshes with additional content when you scroll down, I need to be able to add a function to my script that scrolls the web page all the way to the bottom before my scraping script is run.
In attempt to achieve this, please find my entire script which seems to stop at row height 5287.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
import pandas as pd
#Initialize a Chrome browser
driver = webdriver.Chrome("C:.............chromedriver_win32/chromedriver.exe")
#Go to the page we want to scrape
driver.get('https://icodrops.com/category/ended-ico/')
#Open csv file to write in
csv_file = open('icodrops_ended_icos.csv', 'w')
writer = csv.writer(csv_file)
writer.writerow(['Project_Name', 'Interest', 'Category', 'Received', 'Goal', 'End_Date', 'Ticker'])
page_url = 'https://icodrops.com/category/ended-ico/'
# Although only one page to scrape - need to scroll to the bottom to pull all data
lastHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('lastHeight', lastHeight)
while True:
driver.execute_script(f"window.scrollTo(0, {lastHeight});")
time.sleep(15)
#height = driver.execute_script("return document.documentElement.scrollHeight")
newHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('newHeight', newHeight)
if newHeight == lastHeight:
break
lastHeight = newHeight
try:
#print the url that we are scraping
print('Scraping this url:' + page_url)
#Exract a list object where each element of the list is a row in the table
rows = driver.find_elements_by_xpath('//div[#class="col-md-12 col-12 a_ico"]')
# Extract detail in columns from each row
for row in rows:
#Initialize a dictionary for each row
row_dict = {}
#Use relative xpaths to locate desired data
project_name = row.find_element_by_xpath('.//div[#class="ico-row"]/div[2]/h3/a').text
interest = row.find_element_by_xpath('.//div[#class="interest"]').text
category = row.find_element_by_xpath('.//div[#class="categ_type"]').text
received = row.find_element_by_xpath('.//div[#id="new_column_categ_invisted"]/span').text
goal = row.find_element_by_xpath('.//div[#id="categ_desctop"]').text
end_date = row.find_element_by_xpath('.//div[#class="date"]').text
ticker = row.find_element_by_xpath('.//div[#id="t_tikcer"]').text
# Add extracted data to the dictionary
row_dict['project_name'] = project_name
row_dict['interest'] = interest
row_dict['category'] = category
row_dict['received'] = received
row_dict['goal'] = goal
row_dict['end_date'] = end_date
row_dict['ticker'] = ticker
writer.writerow(row_dict.values())
except Exception as e:
print(e)
csv_file.close()
driver.close()
break
Without being able to scroll to the bottom of the page my script will only scrape data form the initial page which only constitutes about 10% of all that is available

I always use the below piece of code to scroll till bottom, and I have never seen that it fail.
driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
So, your effective code will be
while True:
driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
height = driver.execute_script("return document.documentElement.scrollHeight")
newHeight = driver.execute_script("window.scrollTo(0, " + str(height) + ");")
time.sleep(15)
if newHeight == lastHeight:
break
lastHeight = newHeight

If you use print() to see values in variables then you see that scrollTo gives None and you can't use it to get newHeight.
Minimal working code.
I tested on page http://quotes.toscrape.com/scroll created for learning scraping.
from selenium import webdriver
import time
url = 'http://quotes.toscrape.com/scroll'
driver = webdriver.Firefox()
driver.get(url)
lastHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('lastHeight', lastHeight)
while True:
driver.execute_script(f"window.scrollTo(0, {lastHeight});")
time.sleep(1)
newHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('newHeight', newHeight)
if newHeight == lastHeight:
break
lastHeight = newHeight
BTW:
I found on Stackoverflow answer from 2015 which use the same method but with document.body instead of document.documentElement
How can I scroll a web page using selenium webdriver in python?
So if this code works for you then this question could be closed as duplicate

Selenium Not Locating ALL elements with specific class name

I'm creating a web crawler for Zillow in order to practice using Selenium. All I'm trying to do is get the price, address, and link to each home, but when I use find_elements_by_class_name() or find_elements_by_css_selector(), it only finds the first 9 elements, when there are many more.
Normally my selenium works fine. Does anyone know why this occurs?
from selenium import webdriver
import time
zillow_url = "https://www.zillow.com/manhattan-new-york-ny/houses/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Manhattan%2C%20New%20York%2C%20NY%22%2C%22mapBounds%22%3A%7B%22west%22%3A-74.21047920019531%2C%22east%22%3A-73.73669379980468%2C%22south%22%3A40.626191262639644%2C%22north%22%3A40.933477919520115%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A12530%2C%22regionType%22%3A17%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22beds%22%3A%7B%22min%22%3A0%2C%22max%22%3A0%7D%2C%22price%22%3A%7B%22max%22%3A400000%7D%2C%22mp%22%3A%7B%22max%22%3A1300%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"
address = "My chrome driver address"
driver = webdriver.Chrome(executable_path=address)
driver.get(zillow_url)
time.sleep(2)
prices = driver.find_elements_by_class_name("list-card-price")
addresses = driver.find_elements_by_class_name("list-card-addr")
links = driver.find_elements_by_class_name("list-card-link")

Try this.
from selenium import webdriver
import time
zillow_url = "https://www.zillow.com/manhattan-new-york-ny/houses/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Manhattan%2C%20New%20York%2C%20NY%22%2C%22mapBounds%22%3A%7B%22west%22%3A-74.21047920019531%2C%22east%22%3A-73.73669379980468%2C%22south%22%3A40.626191262639644%2C%22north%22%3A40.933477919520115%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A12530%2C%22regionType%22%3A17%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22beds%22%3A%7B%22min%22%3A0%2C%22max%22%3A0%7D%2C%22price%22%3A%7B%22max%22%3A400000%7D%2C%22mp%22%3A%7B%22max%22%3A1300%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"
address = "My chrome driver address"
driver = webdriver.Chrome(executable_path=address)
driver.get(zillow_url)
prices = []
addresses = []
links = []
time.sleep(2)
SCROLL_PAUSE_TIME = 0.5
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while (condition):
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
prices = driver.find_elements_by_class_name("list-card-price")
addresses = driver.find_elements_by_class_name("list-card-addr")
links = driver.find_elements_by_class_name("list-card-link")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
Just put the condition as len(prices) <= number of houses you wanna scrape

How can I scroll a web page using selenium webdriver in python?

I am currently using selenium webdriver to parse through facebook user friends page and extract all ids from the AJAX script. But I need to scroll down to get all the friends. How can I scroll down in Selenium. I am using python.

You can use
driver.execute_script("window.scrollTo(0, Y)")
where Y is the height (on a fullhd monitor it's 1080). (Thanks to #lukeis)
You can also use
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
to scroll to the bottom of the page.
If you want to scroll to a page with infinite loading, like social network ones, facebook etc. (thanks to #Cuong Tran)
SCROLL_PAUSE_TIME = 0.5
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
another method (thanks to Juanse) is, select an object and
label.sendKeys(Keys.PAGE_DOWN);

If you want to scroll down to bottom of infinite page (like linkedin.com), you can use this code:
SCROLL_PAUSE_TIME = 0.5
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
Reference: https://stackoverflow.com/a/28928684/1316860

You can use send_keys to simulate an END (or PAGE_DOWN) key press (which normally scroll the page):
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
html = driver.find_element(By.TAG_NAME, 'html')
html.send_keys(Keys.END)

same method as shown here:
in python you can just use
driver.execute_script("window.scrollTo(0, Y)")
(Y is the vertical position you want to scroll to)

element=find_element_by_xpath("xpath of the li you are trying to access")
element.location_once_scrolled_into_view
this helped when I was trying to access a 'li' that was not visible.

For my purpose, I wanted to scroll down more, keeping the windows position in mind. My solution was similar and used window.scrollY
driver.execute_script("window.scrollTo(0, window.scrollY + 200)")
which will go to the current y scroll position + 200

This is how you scroll down the webpage:
driver.execute_script("window.scrollTo(0, 1000);")

None of these answers worked for me, at least not for scrolling down a facebook search result page, but I found after a lot of testing this solution:
while driver.find_element_by_tag_name('div'):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
Divs=driver.find_element_by_tag_name('div').text
if 'End of Results' in Divs:
print 'end'
break
else:
continue

The easiest way i found to solve that problem was to select a label and then send:
label.sendKeys(Keys.PAGE_DOWN);
Hope it works!

When working with youtube the floating elements give the value "0" as the scroll height
so rather than using "return document.body.scrollHeight" try using this one "return document.documentElement.scrollHeight"
adjust the scroll pause time as per your internet speed
else it will run for only one time and then breaks after that.
SCROLL_PAUSE_TIME = 1
# Get scroll height
"""last_height = driver.execute_script("return document.body.scrollHeight")
this dowsnt work due to floating web elements on youtube
"""
last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0,document.documentElement.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.documentElement.scrollHeight")
if new_height == last_height:
print("break")
break
last_height = new_height

scroll loading pages. Example: medium, quora,etc
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight-1000);")
# Wait to load the page.
driver.implicitly_wait(30) # seconds
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
# sleep for 30s
driver.implicitly_wait(30) # seconds
driver.quit()

Here's an example selenium code snippet that you could use for this type of purpose. It goes to the url for youtube search results on 'Enumerate python tutorial' and scrolls down until it finds the video with the title: 'Enumerate python tutorial(2020).'
driver.get('https://www.youtube.com/results?search_query=enumerate+python')
target = driver.find_element_by_link_text('Enumerate python tutorial(2020).')
target.location_once_scrolled_into_view

This code scrolls to the bottom but doesn't require that you wait each time. It'll continually scroll, and then stop at the bottom (or timeout)
from selenium import webdriver
import time
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://example.com')
pre_scroll_height = driver.execute_script('return document.body.scrollHeight;')
run_time, max_run_time = 0, 1
while True:
iteration_start = time.time()
# Scroll webpage, the 100 allows for a more 'aggressive' scroll
driver.execute_script('window.scrollTo(0, 100*document.body.scrollHeight);')
post_scroll_height = driver.execute_script('return document.body.scrollHeight;')
scrolled = post_scroll_height != pre_scroll_height
timed_out = run_time >= max_run_time
if scrolled:
run_time = 0
pre_scroll_height = post_scroll_height
elif not scrolled and not timed_out:
run_time += time.time() - iteration_start
elif not scrolled and timed_out:
break
# closing the driver is optional
driver.close()
This is much faster than waiting 0.5-3 seconds each time for a response, when that response could take 0.1 seconds

I was looking for a way of scrolling through a dynamic webpage, and automatically stopping once the end of the page is reached, and found this thread.
The post by #Cuong Tran, with one main modification, was the answer that I was looking for. I thought that others might find the modification helpful (it has a pronounced effect on how the code works), hence this post.
The modification is to move the statement that captures the last page height inside the loop (so that each check is comparing to the previous page height).
So, the code below:
Continuously scrolls down a dynamic webpage (.scrollTo()), only stopping when, for one iteration, the page height stays the same.
(There is another modification, where the break statement is inside another condition (in case the page 'sticks') which can be removed).
SCROLL_PAUSE_TIME = 0.5
while True:
# Get scroll height
### This is the difference. Moving this *inside* the loop
### means that it checks if scrollTo is still scrolling
last_height = driver.execute_script("return document.body.scrollHeight")
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
# try again (can be removed)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
# check if the page height has remained the same
if new_height == last_height:
# if so, you are done
break
# if not, move on to the next loop
else:
last_height = new_height
continue

You can use send_keys to simulate a PAGE_DOWN key press (which normally scroll the page):
from selenium.webdriver.common.keys import Keys
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.PAGE_DOWN)

if you want to scroll within a particular view/frame (WebElement), what you only need to do is to replace "body" with a particular element that you intend to scroll within. i get that element via "getElementById" in the example below:
self.driver.execute_script('window.scrollTo(0, document.getElementById("page-manager").scrollHeight);')
this is the case on YouTube, for example...

The ScrollTo() function doesn't work anymore. This is what I used and it worked fine.
driver.execute_script("document.getElementById('mydiv').scrollIntoView();")

According to the docs,
the class ActionChains does the job:
from selenium import webdriver
from selenium.webdriver import ActionChains
driver = webdriver.Firefox()
action_chains = ActionChains(driver)
action_chains.scroll(x: int, y: int, delta_x: int, delta_y: int, duration: int = 0, origin: str = 'viewport').perform()

insert this line driver.execute_script("window.scrollBy(0,925)", "")

The loop using the "send keys" method of scrolling the page:
pre_scroll_height = driver.execute_script('return document.body.scrollHeight;')
while True:
driver.find_element_by_tag_name('body').send_keys(Keys.END)
time.sleep(5)
post_scroll_height = driver.execute_script('return document.body.scrollHeight;')
print(pre_scroll_height, post_scroll_height)
if pre_scroll_height == post_scroll_height:
break
pre_scroll_height=post_scroll_height

Here is a method I wrote to slowly scroll down to a targets element
You can pass either Y-th position of element of the CSS Selector to it
It scrolls exactly like we do via mouse-wheel
Once this method called, you call it again with same driver object but with new target element, it will then scroll up/down wherever that element exists
def slow_scroll_to_element(self, driver, element_selector=None, target_yth_location=None):
current_scroll_position = int(driver.execute_script("return window.scrollY"))
if element_selector:
target_yth_location = int(driver.execute_script("return document.querySelector('{}').getBoundingClientRect()['top'] + window.scrollY".format(element_selector)))
scrollSpeed = 100 if target_yth_location-current_scroll_position > 0 else -100
def chunks(a, n):
k, m = divmod(len(a), n)
return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))
for l in list(chunks(list(range(current_scroll_position, target_yth_location, scrollSpeed)) + list([target_yth_location+(-scrollSpeed if scrollSpeed > 0 else scrollSpeed)]), 3)):
for pos in l:
driver.execute_script("window.scrollTo(0, "+str(pos)+");")
time.sleep(0.1)
time.sleep(random.randint(1,3))

driver.execute_script("document.getElementById('your ID Element').scrollIntoView();")
it's working for my case.

Just a small variation of the solutions provided so far: sometimes in scraping you have to meet the following requirements:
Keep scrolling step by step. Otherwise if you always jump to the bottom some elements are loaded only as containers/divs but their content is not loaded because they were never visible (because you jumped straight to the bottom);
Allow enough time for content to be loaded;
It's not an infinite scroll page, there is an end and you have to identify when the end is reached;
Here is a simple implementation:
from time import sleep
def keep_scrolling_to_the_bottom():
while True:
previous_scrollY = my_web_driver.execute_script( 'return window.scrollY' )
my_web_driver.execute_script( 'window.scrollBy( 0, 230 )' )
sleep( 0.4 )
if previous_scrollY == my_web_driver.execute_script( 'return window.scrollY' ):
print( 'job done, reached the bottom!' )
break
Tested and working on Windows 7 x64, Python 3.8.0, selenium 4.1.3, Google Chrome 107.0.5304.107, website for property rent.

Scroll to an element: Find the element and scroll using this code.
scroll_element = driver.find_element(By.XPATH, "your element xpath")
driver.execute_script("arguments[0].scrollIntoView();", scroll_element)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to check if further `scroll down` is not possible using Selenium - python

You can check document.body.scrollTop by before and after each scroll attempt if there is no data to fetch then this value will stay the same distanceToTop = driver.execute_script("return document.body.scrollTop);")

Related

How to scrape all the comments of a youtube video using selenium, python

Scraping Google App All Reviews using Selenium and Python

scroll to bottom of page before scraping with selenium

Selenium Not Locating ALL elements with specific class name

How can I scroll a web page using selenium webdriver in python?

Categories

Resources