There is a web page that I want to run my scraping script on. However, because the page loads additional content when you scroll down, I need to add a function to my script that scrolls the web page all the way to the bottom before my scraping script is run.
In an attempt to achieve this, my entire script is below; it seems to stop at a height of 5287.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
import pandas as pd

# Initialize a Chrome browser
driver = webdriver.Chrome("C:.............chromedriver_win32/chromedriver.exe")

# Go to the page we want to scrape
driver.get('https://icodrops.com/category/ended-ico/')

# Open csv file to write in
csv_file = open('icodrops_ended_icos.csv', 'w')
writer = csv.writer(csv_file)
writer.writerow(['Project_Name', 'Interest', 'Category', 'Received', 'Goal', 'End_Date', 'Ticker'])

page_url = 'https://icodrops.com/category/ended-ico/'

# Although only one page to scrape - need to scroll to the bottom to pull all data
lastHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('lastHeight', lastHeight)

while True:
    driver.execute_script(f"window.scrollTo(0, {lastHeight});")
    time.sleep(15)
    #height = driver.execute_script("return document.documentElement.scrollHeight")
    newHeight = driver.execute_script("return document.documentElement.scrollHeight")
    print('newHeight', newHeight)
    if newHeight == lastHeight:
        break
    lastHeight = newHeight

    try:
        # print the url that we are scraping
        print('Scraping this url:' + page_url)
        # Extract a list object where each element of the list is a row in the table
        rows = driver.find_elements_by_xpath('//div[@class="col-md-12 col-12 a_ico"]')
        # Extract detail in columns from each row
        for row in rows:
            # Initialize a dictionary for each row
            row_dict = {}
            # Use relative xpaths to locate desired data
            project_name = row.find_element_by_xpath('.//div[@class="ico-row"]/div[2]/h3/a').text
            interest = row.find_element_by_xpath('.//div[@class="interest"]').text
            category = row.find_element_by_xpath('.//div[@class="categ_type"]').text
            received = row.find_element_by_xpath('.//div[@id="new_column_categ_invisted"]/span').text
            goal = row.find_element_by_xpath('.//div[@id="categ_desctop"]').text
            end_date = row.find_element_by_xpath('.//div[@class="date"]').text
            ticker = row.find_element_by_xpath('.//div[@id="t_tikcer"]').text
            # Add extracted data to the dictionary
            row_dict['project_name'] = project_name
            row_dict['interest'] = interest
            row_dict['category'] = category
            row_dict['received'] = received
            row_dict['goal'] = goal
            row_dict['end_date'] = end_date
            row_dict['ticker'] = ticker
            writer.writerow(row_dict.values())
    except Exception as e:
        print(e)
        csv_file.close()
        driver.close()
        break
Without being able to scroll to the bottom of the page, my script will only scrape data from the initial view, which is only about 10% of all that is available.
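In other words, what I think I need is a helper along these lines (just my scroll loop above pulled out into a function, so it can run once before the scraping starts; untested sketch):

import time

def scroll_to_bottom(driver, pause=15):
    # Keep scrolling until the page height stops growing
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    while True:
        driver.execute_script(f"window.scrollTo(0, {last_height});")
        time.sleep(pause)
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height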
I always use the piece of code below to scroll to the bottom, and I have never seen it fail.
driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
So, your effective code will be
while True:
    driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
    height = driver.execute_script("return document.documentElement.scrollHeight")
    newHeight = driver.execute_script("window.scrollTo(0, " + str(height) + ");")
    time.sleep(15)
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
If you use print() to see the values in the variables, you will see that scrollTo gives None, so you can't use it to get newHeight.
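For example, a quick check (a minimal sketch, reusing the driver from above) shows the difference:

result = driver.execute_script("window.scrollTo(0, 1000);")
print(result)   # prints: None, because scrollTo() returns nothing

height = driver.execute_script("return document.documentElement.scrollHeight")
print(height)   # prints an integer, e.g. 5287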
Minimal working code.
I tested it on the page http://quotes.toscrape.com/scroll, which was created for learning scraping.
from selenium import webdriver
import time

url = 'http://quotes.toscrape.com/scroll'

driver = webdriver.Firefox()
driver.get(url)

lastHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('lastHeight', lastHeight)

while True:
    driver.execute_script(f"window.scrollTo(0, {lastHeight});")
    time.sleep(1)
    newHeight = driver.execute_script("return document.documentElement.scrollHeight")
    print('newHeight', newHeight)
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
BTW:
I found an answer on Stack Overflow from 2015 which uses the same method but with document.body instead of document.documentElement:
How can I scroll a web page using selenium webdriver in python?
So if this code works for you, then this question could be closed as a duplicate.
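If you are not sure which of the two elements a given page actually scrolls, a hedged variant is to take the larger of the two heights:

height = driver.execute_script(
    "return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);")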
Related
I want to scrape all the comments of a YouTube video using Selenium, but I am only able to scrape the first 20. I don't get what's wrong with the following code:
# imports required
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# initialisation
driver = webdriver.Chrome()
url = 'https://www.youtube.com/watch?v=etzmAZ7oiz0'
driver.get(url)
time.sleep(3)
final_comment_list = []
author_list = []
comment_list = []

# while loop for scrolling down the page
last_height = driver.execute_script("return document.body.scrollHeight")
html = driver.find_element(By.TAG_NAME, 'html')
while True:
    print("Scroll down to bottom")
    # Scroll down to bottom
    html.send_keys(Keys.PAGE_DOWN)
    # Wait to load the page
    time.sleep(5)
    # find author name and author comment
    try:
        authors_list_el = driver.find_elements(By.CSS_SELECTOR,
            '#author-text.yt-simple-endpoint.style-scope.ytd-comment-renderer span.style-scope.ytd-comment-renderer')
        author_list = [x.text for x in authors_list_el]
    except:
        print(f"not able to find author for {url} video")
    try:
        comments = driver.find_elements(By.CSS_SELECTOR, '#content.style-scope.ytd-expander')
        comment_list = [x.text for x in comments]
    except:
        print(f"not able to find comments for {url} video")
    # creating dictionary object and adding to list
    obj1 = dict(author_list=author_list, comment_list=comment_list)
    final_comment_list.append(obj1)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    else:
        last_height = new_height

# printing the result
print(final_comment_list)
print(len(author_list))
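One thing worth checking, in line with the document.body vs document.documentElement note in the answer further up: the loop condition here reads document.body.scrollHeight, and if that value never changes on this page, the while loop breaks after the first pass even though more comments keep loading as you page down. A hedged tweak (untested against the current YouTube layout) is to track the height the same way the earlier answer does:

# before the loop
last_height = driver.execute_script("return document.documentElement.scrollHeight")
# inside the loop, in place of the document.body version
new_height = driver.execute_script("return document.documentElement.scrollHeight")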
I am trying to scrape rows and columns from a website into a csv file. The code I am using works fine, but it gives me only 14 rows out of the total 20. Can someone please help me scrape all 20 rows?
I am providing the code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

browser = webdriver.Chrome()
url = 'https://app.powerbi.com/home'
browser.get(url)

members = (WebDriverWait(browser, 30).until(EC.visibility_of_all_elements_located([By.XPATH, '//*[contains(@class,"row ng-star-inserted")]'])))
height = browser.execute_script("return document.body.scrollHeight")
data = []
while True:
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(1)
    for item in browser.find_elements(By.XPATH, '//*[contains(@class, "row ng-star-inserted")]'):
        if item.text in data:
            continue
        else:
            data.append(item.text)
    lastheight = browser.execute_script("return document.body.scrollHeight")
    if height == lastheight:
        break
    height = lastheight
print(data)
I am also providing a screenshot of the page source. All the rows use the same class.
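A hedged sketch of one thing to try (it reuses browser, By and time from the snippet above, keeps the same XPath from the question, and assumes the rows live in a virtualized grid inside their own scrollable container, which is why window.scrollTo may not reveal the remaining rows): scroll the last located row into view and keep collecting until no new text shows up.

data = []
rows = browser.find_elements(By.XPATH, '//*[contains(@class, "row ng-star-inserted")]')
while True:
    before = len(data)
    for item in rows:
        if item.text not in data:
            data.append(item.text)
    if rows:
        # scroll the last visible row into view so a virtualized grid renders the next batch
        browser.execute_script("arguments[0].scrollIntoView(true);", rows[-1])
    time.sleep(1)
    rows = browser.find_elements(By.XPATH, '//*[contains(@class, "row ng-star-inserted")]')
    if len(data) == before:
        break   # nothing new was collected, so stop
print(data)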
I'm creating a web crawler for Zillow in order to practice using Selenium. All I'm trying to do is get the price, address, and link for each home, but when I use find_elements_by_class_name() or find_elements_by_css_selector(), it only finds the first 9 elements, even though there are many more.
Normally my selenium works fine. Does anyone know why this occurs?
from selenium import webdriver
import time
zillow_url = "https://www.zillow.com/manhattan-new-york-ny/houses/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Manhattan%2C%20New%20York%2C%20NY%22%2C%22mapBounds%22%3A%7B%22west%22%3A-74.21047920019531%2C%22east%22%3A-73.73669379980468%2C%22south%22%3A40.626191262639644%2C%22north%22%3A40.933477919520115%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A12530%2C%22regionType%22%3A17%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22beds%22%3A%7B%22min%22%3A0%2C%22max%22%3A0%7D%2C%22price%22%3A%7B%22max%22%3A400000%7D%2C%22mp%22%3A%7B%22max%22%3A1300%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"
address = "My chrome driver address"
driver = webdriver.Chrome(executable_path=address)
driver.get(zillow_url)
time.sleep(2)
prices = driver.find_elements_by_class_name("list-card-price")
addresses = driver.find_elements_by_class_name("list-card-addr")
links = driver.find_elements_by_class_name("list-card-link")
Try this.
from selenium import webdriver
import time

zillow_url = "https://www.zillow.com/manhattan-new-york-ny/houses/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Manhattan%2C%20New%20York%2C%20NY%22%2C%22mapBounds%22%3A%7B%22west%22%3A-74.21047920019531%2C%22east%22%3A-73.73669379980468%2C%22south%22%3A40.626191262639644%2C%22north%22%3A40.933477919520115%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A12530%2C%22regionType%22%3A17%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22beds%22%3A%7B%22min%22%3A0%2C%22max%22%3A0%7D%2C%22price%22%3A%7B%22max%22%3A400000%7D%2C%22mp%22%3A%7B%22max%22%3A1300%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"
address = "My chrome driver address"
driver = webdriver.Chrome(executable_path=address)
driver.get(zillow_url)

prices = []
addresses = []
links = []
time.sleep(2)
SCROLL_PAUSE_TIME = 0.5

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while (condition):
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    prices = driver.find_elements_by_class_name("list-card-price")
    addresses = driver.find_elements_by_class_name("list-card-addr")
    links = driver.find_elements_by_class_name("list-card-link")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
Just put the condition as len(prices) <= the number of houses you want to scrape.
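A hedged sketch of what that condition might look like, continuing from the snippet above (N_HOUSES is a placeholder for however many listings you want; the class names are copied from the code above and may have changed on Zillow's side):

N_HOUSES = 40  # placeholder target count
last_height = driver.execute_script("return document.body.scrollHeight")
prices = driver.find_elements_by_class_name("list-card-price")
while len(prices) <= N_HOUSES:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)
    prices = driver.find_elements_by_class_name("list-card-price")
    addresses = driver.find_elements_by_class_name("list-card-addr")
    links = driver.find_elements_by_class_name("list-card-link")
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  # nothing new loaded; stop even if we have fewer than N_HOUSES
    last_height = new_height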
I have this code to scrape the IDs of tagged users from media posts on Twitter:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import csv
import re

# Create a new instance of the Firefox driver
driver = webdriver.Firefox()

# go to page
driver.get("http://twitter.com/RussiaUN/media")

# You can adjust it but this works fine
SCROLL_PAUSE_TIME = 2

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Now that the page is fully scrolled, grab the source code.
src = driver.page_source

# Pass it into BS
soup = BeautifulSoup(src, 'html.parser')

#divs = soup.find_all('div', class_='account')
divs = soup.find_all('div', {"data-user-id": re.compile(r".*")})

# PRINT RESULT
#print('printing results')
#for div in divs:
#    print(div['data-user-id'])

# SAVE IN FILE
print('Saving results')
#with open('file2.csv','w') as f:
#    for div in divs:
#        f.write(div['data-user-id']+'\n')

with open('file.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for div in divs:
        writer.writerow([div['data-user-id']])
But I would also like to scrape the usernames and then organise all this data in a csv with a column IDS and a column USERNAMES.
So my guess is that I have to modify this piece of code first:
divs = soup.find_all('div', {"data-user-id" : re.compile(r".*")})
But I can't find a way to achieve that...
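A hedged sketch of one way that lookup could be broadened (it assumes the same divs also carry a data-screen-name attribute, which is what the answer below relies on):

# match any div that has a data-user-id attribute at all
divs = soup.find_all('div', attrs={"data-user-id": True})
# pair each ID with its screen name, falling back to '' if the attribute is missing
rows = [[div['data-user-id'], div.get('data-screen-name', '')] for div in divs]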
Then I also have a problem with duplicates. As you can see in the code, there are two ways to scrape the data:
1 #divs = soup.find_all('div',class_='account')
2 divs = soup.find_all('div', {"data-user-id" : re.compile(r".*")})
The first approach seemed to work but was not efficient enough. Number 2 works fine but seems to give me duplicates at the end, as it goes through all the divs and not only those with class_='account'.
I'm sorry if some feel that I'm a bit spammy here, as I posted 3 questions in 24h... And thanks to those who helped and will be helping.
Python has an inbuilt csv module for writing csv files.
Also, the scroll script that you used did not seem to work, as it was not scrolling all the way down and stopped after a certain amount of time. I only got ~1400 records in the csv file with your script. I have replaced it with the pagedown key. You may want to tweak no_of_pagedowns to control how far you scroll down. Even with 200 pagedowns I got ~2200 records. Note that this number is without removing the duplicates.
I have added some additional modifications to write only the unique data to file.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv

driver = webdriver.Firefox()
driver.get("http://twitter.com/RussiaUN/media")
time.sleep(1)

elem = driver.find_element_by_tag_name("html")
no_of_pagedowns = 200
while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(2)
    no_of_pagedowns -= 1

src = driver.page_source
soup = BeautifulSoup(src, 'html.parser')
divs = soup.find_all('div', class_='account')

all_data = []
# get only unique data
for div in divs:
    single = [div['data-user-id'], div['data-screen-name']]
    if single not in all_data:
        all_data.append(single)

with open('file.csv', 'w') as f:
    writer = csv.writer(f, delimiter=",")
    # headers
    writer.writerow(["ID", "USERNAME"])
    writer.writerows(all_data)
Output
ID,USERNAME
255493944,MID_RF
2230446228,Rus_Emb_Sudan
1024596885661802496,ambrus_drc
2905424987,Russie_au_Congo
2174261359,RusEmbUganda
285532415,tass_agency
34200559,rianru
40807205,kpru
177502586,nezavisimaya_g
23936177,vzglyad
255471924,mfa_russia
453639812,pass_blue
...
If you want the duplicates just remove the if condition
for div in divs:
    single = [div['data-user-id'], div['data-screen-name']]
    all_data.append(single)
I have this script to download images from Instagram. The only issue I am having is that when Selenium starts scrolling down to the bottom of the webpage, BeautifulSoup starts grabbing the same img src links on each pass of the loop.
Although it will continue to scroll down and download pictures, after all that is done I end up with 2 or 3 duplicates. So my question is: is there a way to prevent this duplication from happening?
import requests
from bs4 import BeautifulSoup
import selenium.webdriver as webdriver
import time

url = ('https://www.instagram.com/kitties')
driver = webdriver.Firefox()
driver.get(url)

scroll_delay = 0.5
last_height = driver.execute_script("return document.body.scrollHeight")
counter = 0

print('[+] Downloading:\n')

def screens(get_name):
    with open("/home/cha0zz/Desktop/photos/img_{}.jpg".format(get_name), 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")
    soup = BeautifulSoup(driver.page_source, 'lxml')
    imgs = soup.find_all('img', class_='_2di5p')
    for img in imgs:
        img_url = img["src"]
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1
    if new_height == last_height:
        break
    last_height = new_height
Update:
So I placed this part of the code outside of the while True loop and let Selenium load the whole page first, in order to hopefully have bs4 scrape all the images. It works up to number 30 only and then stops.
soup = BeautifulSoup(driver.page_source, 'lxml')
imgs = soup.find_all('img', class_='_2di5p')
for img in imgs:
    #tn = datetime.now().strftime('%H:%M:%S')
    img_url = img["src"]
    print('=> [+] img_{}'.format(counter))
    screens(counter)
    counter = counter + 1
The reason it only loads 30 in the second version of your script is that the rest of the elements are removed from the page DOM and are no longer part of the source that BeautifulSoup sees. The solution is to keep doing what you were doing the first time, but to remove any duplicate elements before you iterate through the list and call screens(). You can do this using sets as below, though I'm not sure if this is the absolute most efficient way to do it:
import requests
import selenium.webdriver as webdriver
import time

driver = webdriver.Firefox()
url = ('https://www.instagram.com/cats/?hl=en')
driver.get(url)

scroll_delay = 3
last_height = driver.execute_script("return document.body.scrollHeight")
counter = 0

print('[+] Downloading:\n')

def screens(get_name):
    with open("test_images/img_{}.jpg".format(get_name), 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)

old_imgs = set()

while True:
    imgs = driver.find_elements_by_class_name('_2di5p')
    imgs_dedupe = set(imgs) - set(old_imgs)
    for img in imgs_dedupe:
        img_url = img.get_attribute("src")
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")
    old_imgs = imgs
    if new_height == last_height:
        break
    last_height = new_height

driver.quit()
As you can see, I used a different page to test it, one with 420 images of cats. The result was 420 images, the number of posts on that account, with no duplicates among them.
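An alternative sketch (not from the answer above) is to deduplicate on the image src rather than on the WebElement objects; it reuses driver, counter, scroll_delay, last_height and the screens() helper from the snippet above, and assumes the same _2di5p class name:

seen_srcs = set()
while True:
    for img in driver.find_elements_by_class_name('_2di5p'):
        img_url = img.get_attribute("src")
        if img_url in seen_srcs:
            continue            # already downloaded this picture
        seen_srcs.add(img_url)
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height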
I would use the os library to check if the file already exists.
import os

def screens(get_name):
    file_path = "/home/cha0zz/Desktop/photos/img_{}.jpg".format(get_name)
    if os.path.isfile(file_path):  # checks file exists; gives False on a directory
        # or: if os.path.exists(file_path):  # checks file/directory exists
        pass
    else:
        with open(file_path, 'wb') as f:
            r = requests.get(img_url)
            f.write(r.content)

(Note that the existence check has to happen before open(), since opening the file with 'wb' creates or truncates it.)