Selenium scraping some rows but failing to scrape all rows - python

I am trying to scrape rows and columns from a website into a CSV file. The code I used works, but it only gives me 14 of the 20 rows. Can someone please help me scrape all 20 rows?
I am providing the code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

browser = webdriver.Chrome()
url = 'https://app.powerbi.com/home'
browser.get(url)
members = WebDriverWait(browser, 30).until(
    EC.visibility_of_all_elements_located((By.XPATH, '//*[contains(@class, "row ng-star-inserted")]')))
height = browser.execute_script("return document.body.scrollHeight")
data = []
while True:
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(1)
    for item in browser.find_elements(By.XPATH, '//*[contains(@class, "row ng-star-inserted")]'):
        if item.text in data:
            continue
        else:
            data.append(item.text)
    lastheight = browser.execute_script("return document.body.scrollHeight")
    if height == lastheight:
        break
    height = lastheight
print(data)
I am also providing a screenshot of the page source; all the rows use the same class.
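One possible cause (an assumption, not something confirmed by the screenshot): Power BI grids are usually virtualized Angular components with their own scroll container, so document.body.scrollHeight may never change and the loop exits before the remaining rows are ever rendered. A minimal sketch of an alternative, which keeps collecting row text while scrolling the last rendered row into view instead of scrolling the page body:

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

browser = webdriver.Chrome()
browser.get('https://app.powerbi.com/home')
time.sleep(5)

row_xpath = '//*[contains(@class, "row ng-star-inserted")]'
data = []
while True:
    rows = browser.find_elements(By.XPATH, row_xpath)
    before = len(data)
    for item in rows:
        if item.text not in data:
            data.append(item.text)
    if len(data) == before:
        # nothing new was rendered after the last scroll, so stop
        break
    # scroll the last rendered row into view to make the grid render the next batch
    browser.execute_script("arguments[0].scrollIntoView(true);", rows[-1])
    time.sleep(1)
print(data)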

Related

How to scrape all the comments of a youtube video using selenium, python

I want to scrape all the comments of a YouTube video using Selenium, but I am only able to scrape the first 20. I can't work out what's wrong with the following code:
# imports required
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# initialisation
driver = webdriver.Chrome()
url = 'https://www.youtube.com/watch?v=etzmAZ7oiz0'
driver.get(url)
time.sleep(3)
final_comment_list = []
author_list = []
comment_list = []

# while loop for scrolling down the page
last_height = driver.execute_script("return document.body.scrollHeight")
html = driver.find_element(By.TAG_NAME, 'html')
while True:
    print("Scroll down to bottom")
    # Scroll down to bottom
    html.send_keys(Keys.PAGE_DOWN)
    # Wait to load the page
    time.sleep(5)
    # find author name and author comment
    try:
        authors_list_el = driver.find_elements(By.CSS_SELECTOR,
            '#author-text.yt-simple-endpoint.style-scope.ytd-comment-renderer span.style-scope.ytd-comment-renderer')
        author_list = [x.text for x in authors_list_el]
    except:
        print(f"not able to find author for {url} video")
    try:
        comments = driver.find_elements(By.CSS_SELECTOR, '#content.style-scope.ytd-expander')
        comment_list = [x.text for x in comments]
    except:
        print(f"not able to find comments for {url} video")
    # creating dictionary object and adding to list
    obj1 = dict(author_list=author_list, comment_list=comment_list)
    final_comment_list.append(obj1)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    else:
        last_height = new_height

# printing the result
print(final_comment_list)
print(len(author_list))
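For what it's worth, here is a minimal sketch (untested against YouTube's current markup; the selectors are copied from the question) that sidesteps two things in the code above: it builds the result once, after scrolling, instead of appending a full snapshot dictionary on every pass, and it scrolls a fixed, arbitrary number of times rather than relying on document.body.scrollHeight growing:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Chrome()
driver.get('https://www.youtube.com/watch?v=etzmAZ7oiz0')
time.sleep(3)

html = driver.find_element(By.TAG_NAME, 'html')
for _ in range(20):          # arbitrary number of scrolls, tune as needed
    html.send_keys(Keys.PAGE_DOWN)
    time.sleep(2)            # give the next batch of comments time to load

author_els = driver.find_elements(By.CSS_SELECTOR, '#author-text.yt-simple-endpoint.style-scope.ytd-comment-renderer span.style-scope.ytd-comment-renderer')
comment_els = driver.find_elements(By.CSS_SELECTOR, '#content.style-scope.ytd-expander')

# pair authors with comments by index; zip stops at the shorter list
final_comment_list = [{'author': a.text, 'comment': c.text} for a, c in zip(author_els, comment_els)]
print(len(final_comment_list))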

scroll to bottom of page before scraping with selenium

There is a web page that I want to run my scraping script on. However, because the page refreshes with additional content when you scroll down, I need to be able to add a function to my script that scrolls the web page all the way to the bottom before my scraping script is run.
In an attempt to achieve this, please find my entire script below, which seems to stop at a height of 5287.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
import pandas as pd

#Initialize a Chrome browser
driver = webdriver.Chrome("C:.............chromedriver_win32/chromedriver.exe")

#Go to the page we want to scrape
driver.get('https://icodrops.com/category/ended-ico/')

#Open csv file to write in
csv_file = open('icodrops_ended_icos.csv', 'w')
writer = csv.writer(csv_file)
writer.writerow(['Project_Name', 'Interest', 'Category', 'Received', 'Goal', 'End_Date', 'Ticker'])

page_url = 'https://icodrops.com/category/ended-ico/'

# Although only one page to scrape - need to scroll to the bottom to pull all data
lastHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('lastHeight', lastHeight)

while True:
    driver.execute_script(f"window.scrollTo(0, {lastHeight});")
    time.sleep(15)
    #height = driver.execute_script("return document.documentElement.scrollHeight")
    newHeight = driver.execute_script("return document.documentElement.scrollHeight")
    print('newHeight', newHeight)
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
    try:
        #print the url that we are scraping
        print('Scraping this url:' + page_url)
        #Extract a list object where each element of the list is a row in the table
        rows = driver.find_elements_by_xpath('//div[@class="col-md-12 col-12 a_ico"]')
        # Extract detail in columns from each row
        for row in rows:
            #Initialize a dictionary for each row
            row_dict = {}
            #Use relative xpaths to locate desired data
            project_name = row.find_element_by_xpath('.//div[@class="ico-row"]/div[2]/h3/a').text
            interest = row.find_element_by_xpath('.//div[@class="interest"]').text
            category = row.find_element_by_xpath('.//div[@class="categ_type"]').text
            received = row.find_element_by_xpath('.//div[@id="new_column_categ_invisted"]/span').text
            goal = row.find_element_by_xpath('.//div[@id="categ_desctop"]').text
            end_date = row.find_element_by_xpath('.//div[@class="date"]').text
            ticker = row.find_element_by_xpath('.//div[@id="t_tikcer"]').text
            # Add extracted data to the dictionary
            row_dict['project_name'] = project_name
            row_dict['interest'] = interest
            row_dict['category'] = category
            row_dict['received'] = received
            row_dict['goal'] = goal
            row_dict['end_date'] = end_date
            row_dict['ticker'] = ticker
            writer.writerow(row_dict.values())
    except Exception as e:
        print(e)
        csv_file.close()
        driver.close()
        break
Without being able to scroll to the bottom of the page, my script will only scrape data from the initial view, which constitutes only about 10% of all that is available.
I always use the piece of code below to scroll to the bottom, and I have never seen it fail.
driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
So, your effective code will be
while True:
    driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
    height = driver.execute_script("return document.documentElement.scrollHeight")
    newHeight = driver.execute_script("window.scrollTo(0, " + str(height) + ");")
    time.sleep(15)
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
If you use print() to inspect the values in these variables, you will see that scrollTo gives None, so you can't use it to get newHeight.
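For illustration, a small check (assuming an already-open driver, as in the code above) that shows the difference:

result = driver.execute_script("window.scrollTo(0, 1000);")
print(result)   # None - scrollTo() returns nothing, so it can't be compared to lastHeight
height = driver.execute_script("return document.documentElement.scrollHeight")
print(height)   # an integer, because the JavaScript explicitly returns a value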
Minimal working code.
I tested it on http://quotes.toscrape.com/scroll, a page created for learning scraping.
from selenium import webdriver
import time

url = 'http://quotes.toscrape.com/scroll'

driver = webdriver.Firefox()
driver.get(url)

lastHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('lastHeight', lastHeight)

while True:
    driver.execute_script(f"window.scrollTo(0, {lastHeight});")
    time.sleep(1)
    newHeight = driver.execute_script("return document.documentElement.scrollHeight")
    print('newHeight', newHeight)
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
BTW:
I found an answer on Stack Overflow from 2015 which uses the same method but with document.body instead of document.documentElement:
How can I scroll a web page using selenium webdriver in python?
So if this code works for you, then this question could be closed as a duplicate.

Selenium Not Locating ALL elements with specific class name

I'm creating a web crawler for Zillow in order to practice using Selenium. All I'm trying to do is get the price, address, and link to each home, but when I use find_elements_by_class_name() or find_elements_by_css_selector(), it only finds the first 9 elements, when there are many more.
Normally my Selenium code works fine. Does anyone know why this occurs?
from selenium import webdriver
import time
zillow_url = "https://www.zillow.com/manhattan-new-york-ny/houses/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Manhattan%2C%20New%20York%2C%20NY%22%2C%22mapBounds%22%3A%7B%22west%22%3A-74.21047920019531%2C%22east%22%3A-73.73669379980468%2C%22south%22%3A40.626191262639644%2C%22north%22%3A40.933477919520115%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A12530%2C%22regionType%22%3A17%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22beds%22%3A%7B%22min%22%3A0%2C%22max%22%3A0%7D%2C%22price%22%3A%7B%22max%22%3A400000%7D%2C%22mp%22%3A%7B%22max%22%3A1300%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"
address = "My chrome driver address"
driver = webdriver.Chrome(executable_path=address)
driver.get(zillow_url)
time.sleep(2)
prices = driver.find_elements_by_class_name("list-card-price")
addresses = driver.find_elements_by_class_name("list-card-addr")
links = driver.find_elements_by_class_name("list-card-link")
Try this.
from selenium import webdriver
import time
zillow_url = "https://www.zillow.com/manhattan-new-york-ny/houses/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Manhattan%2C%20New%20York%2C%20NY%22%2C%22mapBounds%22%3A%7B%22west%22%3A-74.21047920019531%2C%22east%22%3A-73.73669379980468%2C%22south%22%3A40.626191262639644%2C%22north%22%3A40.933477919520115%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A12530%2C%22regionType%22%3A17%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22beds%22%3A%7B%22min%22%3A0%2C%22max%22%3A0%7D%2C%22price%22%3A%7B%22max%22%3A400000%7D%2C%22mp%22%3A%7B%22max%22%3A1300%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"
address = "My chrome driver address"
driver = webdriver.Chrome(executable_path=address)
driver.get(zillow_url)
prices = []
addresses = []
links = []
time.sleep(2)
SCROLL_PAUSE_TIME = 0.5
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while (condition):
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    prices = driver.find_elements_by_class_name("list-card-price")
    addresses = driver.find_elements_by_class_name("list-card-addr")
    links = driver.find_elements_by_class_name("list-card-link")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
Just set the condition to len(prices) <= the number of houses you want to scrape.
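A hedged sketch of what that loop could look like, reusing the driver and SCROLL_PAUSE_TIME set up in the code above; MAX_HOMES is an arbitrary illustration value, not something taken from the page:

MAX_HOMES = 40   # hypothetical target; set it to the number of houses you want

prices = []
last_height = driver.execute_script("return document.body.scrollHeight")
while len(prices) <= MAX_HOMES:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    prices = driver.find_elements_by_class_name("list-card-price")
    addresses = driver.find_elements_by_class_name("list-card-addr")
    links = driver.find_elements_by_class_name("list-card-link")
    time.sleep(SCROLL_PAUSE_TIME)
    # still stop early if the page has no more results to load
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height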

How to scrape multiple divs (and put them in a csv)?

I have this code to scrape tagged user IDs from media posts on Twitter:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import csv
import re
# Create a new instance of the Firefox driver
driver = webdriver.Firefox()
# go to page
driver.get("http://twitter.com/RussiaUN/media")
#You can adjust it but this works fine
SCROLL_PAUSE_TIME = 2
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
# Now that the page is fully scrolled, grab the source code.
src = driver.page_source
#Past it into BS
soup = BeautifulSoup(src, 'html.parser')
#divs = soup.find_all('div',class_='account')
divs = soup.find_all('div', {"data-user-id" : re.compile(r".*")})
#PRINT RESULT
#print('printing results')
#for div in divs:
#    print(div['data-user-id'])

#SAVE IN FILE
print('Saving results')
#with open('file2.csv','w') as f:
#    for div in divs:
#        f.write(div['data-user-id']+'\n')
with open('file.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for div in divs:
        writer.writerow([div['data-user-id']])
But I would also like to scrape the usernames and then organise all this data in a CSV with an IDS column and a USERNAMES column.
So my guess is that I have to modify this piece of code first:
divs = soup.find_all('div', {"data-user-id" : re.compile(r".*")})
But I can't find a way to achieve that...
Then I also have a problem with duplicates. As you can see in the code, there are two ways to scrape the data:
1 #divs = soup.find_all('div',class_='account')
2 divs = soup.find_all('div', {"data-user-id" : re.compile(r".*")})
The first approach seemed to work but was not efficient enough. The second works fine but seems to give me duplicates at the end, as it goes through all the divs and not only those with class_='account'.
I'm sorry if some feel that I'm being a bit spammy here, as I've posted 3 questions in 24h... And thanks to those who have helped and will be helping.
Python has an inbuilt csv module for writing csv files.
Also, the scroll script that you used did not seem to work, as it was not scrolling all the way down and stopped after a certain amount of time; I got only ~1400 records in the CSV file with your script. I have replaced it with the Page Down key. You may want to tweak no_of_pagedowns to control how far you scroll down. Even with 200 page-downs I got ~2200 records. Note that this number is without removing the duplicates.
I have added some additional modifications to write only the unique data to file.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv

driver = webdriver.Firefox()
driver.get("http://twitter.com/RussiaUN/media")
time.sleep(1)

elem = driver.find_element_by_tag_name("html")
no_of_pagedowns = 200

while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(2)
    no_of_pagedowns -= 1

src = driver.page_source
soup = BeautifulSoup(src, 'html.parser')
divs = soup.find_all('div', class_='account')

all_data = []
# get only unique data
for div in divs:
    single = [div['data-user-id'], div['data-screen-name']]
    if single not in all_data:
        all_data.append(single)

with open('file.csv', 'w') as f:
    writer = csv.writer(f, delimiter=",")
    # headers
    writer.writerow(["ID", "USERNAME"])
    writer.writerows(all_data)
Output
ID,USERNAME
255493944,MID_RF
2230446228,Rus_Emb_Sudan
1024596885661802496,ambrus_drc
2905424987,Russie_au_Congo
2174261359,RusEmbUganda
285532415,tass_agency
34200559,rianru
40807205,kpru
177502586,nezavisimaya_g
23936177,vzglyad
255471924,mfa_russia
453639812,pass_blue
...
If you want the duplicates, just remove the if condition:
for div in divs:
    single = [div['data-user-id'], div['data-screen-name']]
    all_data.append(single)

How to scrape in the form of a table so lists become even

I am using Selenium WebDriver (Python 3.0) to scrape data off this website. All the data is scraped correctly; however, it is scraped as separate lists, meaning there are 127 team/odds entries and 129 hrefs. Unfortunately, this means the href does not sit beside the correct team and odds in the Excel file. Is there a way to get around this?
I have attached screenshots, and my code is below. Is there a way to adjust this to scrape as a table, so that it knows not to scrape an href where no team and odds are visible? I am migrating to Selenium from WinAutomation, which had this feature.
https://ibb.co/kMC0mk - Picture showing why the Href is not beside Team and odds
https://ibb.co/hh4rsQ - What the Excel looks like.
import time
from selenium import webdriver

driver = webdriver.Chrome(executable_path=r'C:\ad\chromedriver.exe')
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get('https://www.bluebet.com.au/sports/Soccer/100')

SCROLL_PAUSE_TIME = 0.5

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

time.sleep(15)

#Odds
langs = driver.find_elements_by_css_selector(".table-grid__row:nth-child(1) .headline-wrap")
for lang in langs:
    print(lang.text)

time.sleep(10)

#link
langs = driver.find_elements_by_css_selector("div.ctr--epsilon.soft > a[href*='/sports/Soccer/']")
for lang in langs:
    print(lang.get_attribute('href'))

time.sleep(10)

#Team
langs = driver.find_elements_by_css_selector(".table-grid__row:nth-child(1) .place-bet__odds")
for lang in langs:
    print(lang.text)
I agree with @ChellWheatly, I couldn't find a way to do this with CSS.
Try this xpath selector to scrape only the "Hrefs" that have content:
//a/ancestor::div[contains(@class, 'table-grid')]/preceding-sibling::div[contains(@class, 'ctr--epsilon')]//a
(I've tested this on the real page with this chrome extension)
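A hedged sketch of how the pieces might be combined, reusing the driver from the question's script; the selectors are taken verbatim from the question and the answer, not re-verified against the current bluebet.com.au markup, and it assumes the filtered link list lines up one-to-one with the team/odds rows:

headlines = driver.find_elements_by_css_selector(".table-grid__row:nth-child(1) .headline-wrap")
bet_odds = driver.find_elements_by_css_selector(".table-grid__row:nth-child(1) .place-bet__odds")
links = driver.find_elements_by_xpath("//a/ancestor::div[contains(@class, 'table-grid')]/preceding-sibling::div[contains(@class, 'ctr--epsilon')]//a")

# zip() pairs the three lists by index and stops at the shortest one,
# so each printed row keeps its headline, odds and href together
for headline, odd, link in zip(headlines, bet_odds, links):
    print(headline.text, odd.text, link.get_attribute('href'))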
