Job does not write to CSV or print row from scraping - python

I am trying to extract data and write it to a CSV in the code below. As far as I can tell, Selenium prints all the elements, but it does not print the row or write to the CSV. I have had this issue in the past, though it was usually fixed by rewriting the job. I am using Windows. Unfortunately I do not know why it is doing this, as the job otherwise works.
My code is below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.support import expected_conditions as EC
import csv
import os
import time

driver = webdriver.Chrome()
driver.set_window_size(1024, 600)
driver.maximize_window()

try:
    os.remove('vtg121.csv')
except OSError:
    pass

driver.get('https://www.topbetta.com.au/sports/football/')

SCROLL_PAUSE_TIME = 0.5
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

time.sleep(1)

url = "https://www.topbetta.com.au/sports/football/"
driver.get(url)

counter = 0
for link in range(len(wait(driver, 15).until(EC.presence_of_all_elements_located((By.XPATH, '//a[@href="/sports" and ./div[@class="name"]]'))))):
    wait(driver, 15).until_not(EC.visibility_of_element_located((By.CLASS_NAME, "mask")))
    link = wait(driver, 15).until(EC.presence_of_all_elements_located((By.XPATH, '//a[@href="/sports" and ./div[@class="name"]]')))[counter]
    link.location_once_scrolled_into_view
    link = wait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '(//a[@href="/sports" and ./div[@class="name"]])[%s]' % str(counter + 1))))
    wait(driver, 15).until_not(EC.visibility_of_element_located((By.CLASS_NAME, "mask")))
    header = driver.find_element_by_tag_name('header')
    header = driver.execute_script('arguments[0].hidden="true";', header)
    header = driver.find_element_by_xpath('//*[@id="js_body-region"]/div/div[1]/div[2]/div[2]/div/div[3]')
    header = driver.execute_script('arguments[0].hidden="true";', header)
    link.click()
    print(driver.current_url)
    wait(driver, 10).until(EC.staleness_of(driver.find_element(By.XPATH, '//div[@class="competition-events-module"]')))
    time.sleep(2)
    time.sleep(2)
    # Team ODDS
    langs = driver.find_elements_by_css_selector(".team-container.home div")
    langs_text = []
    for lang in langs:
        print(lang.text)
        langs_text.append(lang.text)
    time.sleep(0)
    # BACK TEAM
    # langs1 = driver.find_elements_by_xpath("//ul[@class='runners']//li[2]")
    langs1 = driver.find_elements_by_css_selector(".home .price")
    langs1_text = []
    for lang in langs1:
        print(lang.text)
        langs1_text.append(lang.text)
    # Draw Odds
    # langs2 = driver.find_elements_by_xpath("//ul[@class='runners']//li[1]")
    langs2 = driver.find_elements_by_css_selector("td.draw.odds > div > a > span")
    langs2_text = []
    for lang in langs2:
        print(lang.text)
        langs2_text.append(lang.text)
    with open('vtg121.csv', 'a', newline='', encoding="utf-8") as outfile:
        writer = csv.writer(outfile)
        for row in zip(langs_text, langs1_text, langs2_text):
            print(row)
            writer.writerow(row)
    counter += 1
    driver.get(url)

Related

Python selenium no schema supplied while downloading images

The script should download images from Instagram, but it seems to fail to locate the URL src of the image, even with the class name defined. This code used to work last year; all I did was change the class name and update driver.find_elements(), since it was changed in Selenium. I am getting this error:
requests.exceptions.MissingSchema: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
If I add print(img.get_attribute("src")) I get None.
Full code:
import requests
import selenium.webdriver as webdriver
import time
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()
url = ('https://www.instagram.com/cats')
driver.get(url)
scroll_delay = 3
last_height = driver.execute_script("return document.body.scrollHeight")
counter = 0

print('[+] Downloading:\n')

def screens(get_name):
    with open("img_{}.jpg".format(get_name), 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)

old_imgs = set()
while True:
    imgs = driver.find_elements(By.CLASS_NAME, "_aagv")
    imgs_dedupe = set(imgs) - set(old_imgs)
    for img in imgs_dedupe:
        img_url = img.get_attribute("src")
        print(img.get_attribute("src"))
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")
    old_imgs = imgs
    if new_height == last_height:
        break
    last_height = new_height

driver.quit()
Any ideas why this would behave like that?

How to scrape all the comments of a youtube video using selenium, python

I want to scrape all the comments of a YouTube video using Selenium, but I am only able to scrape the first 20. I don't see what's wrong with the following code:
# imports required
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# initialisation
driver = webdriver.Chrome()
url = 'https://www.youtube.com/watch?v=etzmAZ7oiz0'
driver.get(url)
time.sleep(3)
final_comment_list = []
author_list = []
comment_list = []

# while loop for scrolling down the page
last_height = driver.execute_script("return document.body.scrollHeight")
html = driver.find_element(By.TAG_NAME, 'html')
while True:
    print("Scroll down to bottom")
    # Scroll down to bottom
    html.send_keys(Keys.PAGE_DOWN)
    # Wait to load the page
    time.sleep(5)
    # find author name and author comment
    try:
        authors_list_el = driver.find_elements(By.CSS_SELECTOR,
            '#author-text.yt-simple-endpoint.style-scope.ytd-comment-renderer span.style-scope.ytd-comment-renderer')
        author_list = [x.text for x in authors_list_el]
    except:
        print(f"not able to find author for {url} video")
    try:
        comments = driver.find_elements(By.CSS_SELECTOR, '#content.style-scope.ytd-expander')
        comment_list = [x.text for x in comments]
    except:
        print(f"not able to find comments for {url} video")
    # creating dictionary object and adding to list
    obj1 = dict(author_list=author_list, comment_list=comment_list)
    final_comment_list.append(obj1)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    else:
        last_height = new_height

# printing the result
print(final_comment_list)
print(len(author_list))

scroll to bottom of page before scraping with selenium

There is a web page that I want to run my scraping script on. However, because the page refreshes with additional content when you scroll down, I need to be able to add a function to my script that scrolls the web page all the way to the bottom before my scraping script is run.
In an attempt to achieve this, please find my entire script below, which seems to stop at row height 5287.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
import pandas as pd

# Initialize a Chrome browser
driver = webdriver.Chrome("C:.............chromedriver_win32/chromedriver.exe")

# Go to the page we want to scrape
driver.get('https://icodrops.com/category/ended-ico/')

# Open csv file to write in
csv_file = open('icodrops_ended_icos.csv', 'w')
writer = csv.writer(csv_file)
writer.writerow(['Project_Name', 'Interest', 'Category', 'Received', 'Goal', 'End_Date', 'Ticker'])

page_url = 'https://icodrops.com/category/ended-ico/'

# Although only one page to scrape - need to scroll to the bottom to pull all data
lastHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('lastHeight', lastHeight)

while True:
    driver.execute_script(f"window.scrollTo(0, {lastHeight});")
    time.sleep(15)
    # height = driver.execute_script("return document.documentElement.scrollHeight")
    newHeight = driver.execute_script("return document.documentElement.scrollHeight")
    print('newHeight', newHeight)
    if newHeight == lastHeight:
        break
    lastHeight = newHeight

    try:
        # print the url that we are scraping
        print('Scraping this url:' + page_url)
        # Extract a list object where each element of the list is a row in the table
        rows = driver.find_elements_by_xpath('//div[@class="col-md-12 col-12 a_ico"]')
        # Extract detail in columns from each row
        for row in rows:
            # Initialize a dictionary for each row
            row_dict = {}
            # Use relative xpaths to locate desired data
            project_name = row.find_element_by_xpath('.//div[@class="ico-row"]/div[2]/h3/a').text
            interest = row.find_element_by_xpath('.//div[@class="interest"]').text
            category = row.find_element_by_xpath('.//div[@class="categ_type"]').text
            received = row.find_element_by_xpath('.//div[@id="new_column_categ_invisted"]/span').text
            goal = row.find_element_by_xpath('.//div[@id="categ_desctop"]').text
            end_date = row.find_element_by_xpath('.//div[@class="date"]').text
            ticker = row.find_element_by_xpath('.//div[@id="t_tikcer"]').text
            # Add extracted data to the dictionary
            row_dict['project_name'] = project_name
            row_dict['interest'] = interest
            row_dict['category'] = category
            row_dict['received'] = received
            row_dict['goal'] = goal
            row_dict['end_date'] = end_date
            row_dict['ticker'] = ticker
            writer.writerow(row_dict.values())
    except Exception as e:
        print(e)
        csv_file.close()
        driver.close()
        break
Without being able to scroll to the bottom of the page, my script will only scrape data from the initial page, which constitutes only about 10% of all that is available.
I always use the piece of code below to scroll to the bottom, and I have never seen it fail.
driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
So, your effective code will be
while True:
    driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
    height = driver.execute_script("return document.documentElement.scrollHeight")
    newHeight = driver.execute_script("window.scrollTo(0, " + str(height) + ");")
    time.sleep(15)
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
If you use print() to see the values in the variables, then you will see that scrollTo gives None, so you can't use it to get newHeight.
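For example, a quick check along these lines (just an illustrative sketch) shows the difference:
# window.scrollTo() has no return value, so execute_script() gives back None
print(driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);"))  # None
# to read the height you have to return it explicitly
print(driver.execute_script("return document.documentElement.scrollHeight"))  # e.g. 5287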
Minimal working code.
I tested it on the page http://quotes.toscrape.com/scroll, which was created for learning scraping.
from selenium import webdriver
import time

url = 'http://quotes.toscrape.com/scroll'

driver = webdriver.Firefox()
driver.get(url)

lastHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('lastHeight', lastHeight)

while True:
    driver.execute_script(f"window.scrollTo(0, {lastHeight});")
    time.sleep(1)
    newHeight = driver.execute_script("return document.documentElement.scrollHeight")
    print('newHeight', newHeight)
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
BTW:
I found a Stack Overflow answer from 2015 which uses the same method, but with document.body instead of document.documentElement:
How can I scroll a web page using selenium webdriver in python?
So if this code works for you, then this question could be closed as a duplicate.

Name 'conte' not defined in Python

import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver

driver = webdriver.Chrome('D:/chromedriver.exe')
url = "https://www.nike.com/gb/w/womens-lifestyle-shoes-13jrmz5e1x6zy7ok"
driver.get(url)
SCROLL_PAUSE_TIME = 1
time.sleep(4)

# Get scroll height
"""last_height = driver.execute_script("return document.body.scrollHeight")
this doesn't work due to floating web elements on youtube
"""
last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0,document.documentElement.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        print("break")
        break
    last_height = new_height
    time.sleep(5)
    pageSource = driver.page_source
    soup = BeautifulSoup(pageSource, 'html.parser')
    conte = soup.find_all('div', class_='product-card__body')

wshoes = []
for items in conte:
    try:
        title = items.find('div', class_='product-card__title').text
    except:
        title = ''
    try:
        sub_title = items.find('div', class_='product-card__subtitle').text
    except:
        sub_title = ''
    try:
        color = items.find('div', {'class': 'product-card__product-count'}).text
    except:
        color = ''
    try:
        link = items.find('a', {'class': 'product-card__link-overlay'})['href']
    except:
        link = ''
    try:
        price = items.select_one('div[data-test="product-price"]').text.strip()
        # item.find('div',{'class':'product-price is--current-price css-s56yt7'}).text
    except:
        price = '-'
    try:
        reduce_price = items.select_one('div[data-test="product-price-reduced"]').text.strip()
        # item.find('div',class_ ='product-price-reduced').text
    except:
        reduce_price = '-'
    print(title, sub_title, color, price, reduce_price, link)
    shoes = {
        'title': title,
        'Description': sub_title,
        'QuatityColor': color,
        'Price': price,
        'Reducedprice': reduce_price,
        'Url': link
    }
    wshoes.append(shoes)

df = pd.DataFrame(wshoes)
print(df)
df.to_csv('Nike.csv')
print('Saved to csv file')
DevTools listening on ws://127.0.0.1:58524/devtools/browser/e07b59df-6056-4144-9203-2feb91b19647
[21028:20948:0301/203812.684:ERROR:device_event_log_impl.cc(211)] [20:38:12.685] USB: usb_device_handle_win.cc:1049 Failed to read descriptor from node connection: A device attached to the system is not functioning. (0x1F)
[21028:20948:0301/203812.686:ERROR:device_event_log_impl.cc(211)] [20:38:12.687] USB: usb_device_handle_win.cc:1049 Failed to read descriptor from node connection: A device attached to the system is not functioning. (0x1F)
[18068:9236:0301/203825.199:ERROR:ssl_client_socket_impl.cc(962)] handshake failed; returned -1, SSL error code 1, net_error -101
The problem you appear to be having is that in some cases, your test of new_height == last_height is probably succeeding on the first attempt. As such, the variable conte is not assigned.
To get around this, initialise it to None and don't break unless it has been assigned.
For example:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver

driver = webdriver.Chrome('D:/chromedriver.exe')
url = "https://www.nike.com/gb/w/womens-lifestyle-shoes-13jrmz5e1x6zy7ok"
driver.get(url)
SCROLL_PAUSE_TIME = 1
time.sleep(4)

# Get scroll height
"""last_height = driver.execute_script("return document.body.scrollHeight")
this doesn't work due to floating web elements on youtube
"""
last_height = driver.execute_script("return document.documentElement.scrollHeight")
conte = None
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0,document.documentElement.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height and conte:
        print("break")
        break
    last_height = new_height
    time.sleep(5)
    pageSource = driver.page_source
    soup = BeautifulSoup(pageSource, 'html.parser')
    conte = soup.find_all('div', class_='product-card__body')

wshoes = []
for items in conte:
    try:
        title = items.find('div', class_='product-card__title').text
    except:
        title = ''
    try:
        sub_title = items.find('div', class_='product-card__subtitle').text
    except:
        sub_title = ''
    try:
        color = items.find('div', {'class': 'product-card__product-count'}).text
    except:
        color = ''
    try:
        link = items.find('a', {'class': 'product-card__link-overlay'})['href']
    except:
        link = ''
    try:
        price = items.select_one('div[data-test="product-price"]').text.strip()
        # item.find('div',{'class':'product-price is--current-price css-s56yt7'}).text
    except:
        price = '-'
    try:
        reduce_price = items.select_one('div[data-test="product-price-reduced"]').text.strip()
        # item.find('div',class_ ='product-price-reduced').text
    except:
        reduce_price = '-'
    print(title, sub_title, color, price, reduce_price, link)
    shoes = {
        'title': title,
        'Description': sub_title,
        'QuatityColor': color,
        'Price': price,
        'Reducedprice': reduce_price,
        'Url': link
    }
    wshoes.append(shoes)

df = pd.DataFrame(wshoes)
print(df)
df.to_csv('Nike.csv')
print('Saved to csv file')
Another possible solution would be to move the soup = and conte = lines outside the loop.
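Roughly, that variant of the scroll loop would look like this (a sketch based on the same code, parsing the page once after scrolling finishes):
while True:
    driver.execute_script("window.scrollTo(0,document.documentElement.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# parse the fully scrolled page once, so conte is always assigned
soup = BeautifulSoup(driver.page_source, 'html.parser')
conte = soup.find_all('div', class_='product-card__body')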

How to scrape the youtube comments with selenium in python?

I am trying to scrape YouTube comments so that each row contains the title of the video, the author of the comment, and the comment itself. As seen in the code below, I open the driver successfully, get rid of some authentication and cookie messages, and scroll enough to get the first comments loaded. After this happens, I am still not able to get the comment text by XPath, as seen below.
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

csv_file = open('funda_youtube_comments.csv', 'w', encoding="UTF-8", newline="")
writer = csv.writer(csv_file)
writer.writerow(['title', 'comment', 'author'])

PATH = r"C:\Users\veiza\OneDrive\Desktop\AUAS\University\Quarter 2\Online Data Mining\Project1test\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.implicitly_wait(10)
driver.get("https://www.youtube.com/watch?v=VWQaP9txG6M&t=76s")
driver.maximize_window()
time.sleep(2)

driver.execute_script('window.scrollTo(0,700);')
wait = WebDriverWait(driver, 20)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='dismiss-button']"))).click()
time.sleep(2)
WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe[src^='https://consent.google.com']")))
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='introAgreeButton']"))).click()
time.sleep(2)

title = driver.title
print(title)
time.sleep(5)

totalcomments = len(driver.find_elements_by_xpath('//*[@id="content-text"]'))
if totalcomments < 50:
    index = totalcomments
else:
    index = 50

youtube_dict = {}
ccount = 0
while ccount < index:
    try:
        comment = driver.find_elements_by_xpath('//*[@id="content-text"]')[ccount].text
    except:
        comment = ""
    try:
        authors = driver.find_elements_by_xpath('//a[@id="author-text"]/span')[ccount].text
    except:
        authors = ""
    try:
        title = title
    except:
        title = ""
    youtube_dict['comment'] = comment
    youtube_dict['author'] = authors
    youtube_dict['video title'] = title
    writer.writerow(youtube_dict.values())
    ccount = ccount + 1
    print(youtube_dict)
driver.close()
What am I doing wrong?
If you want to make it simple, you can use tube_dl
pip install tube_dl
This module has a Comments class that can help you with processing comments.
Here's a simple usage of it:
from tube_dl.comments import Comments
comments = Comments('yt url').process_comments()
#If you want limited comments, you can specify that. Ex : process_comments(count=45)
Feel free to raise issues at github.com/shekharchander/tube_dl. I'll be happy to resolve them.
I was able to scrape the YouTube comments; below you can see the solution.
options = Options()
options.add_argument("--headless")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

PATH = r"C:\Users\veiza\OneDrive\Desktop\AUAS\University\Quarter 2\Online Data " \
       r"Mining\Project1test\chromedriver.exe "
driver = webdriver.Chrome(executable_path=PATH, options=options)
driver.get(response.url)
time.sleep(5)

try:
    title = driver.find_element_by_xpath('//*[@id="container"]/h1/yt-formatted-string').text
    comment_section = driver.find_element_by_xpath('//*[@id="comments"]')
except exceptions.NoSuchElementException:
    error = "Error: Double check selector OR "
    error += "element may not yet be on the screen at the time of the find operation"
    print(error)

driver.execute_script("arguments[0].scrollIntoView();", comment_section)
time.sleep(7)

last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")

try:
    accounts_elems = driver.find_elements_by_xpath('//*[@id="author-text"]')
    comment_elems = driver.find_elements_by_xpath('//*[@id="content-text"]')
except exceptions.NoSuchElementException:
    error = "Error: Double check selector OR "
    error += "element may not yet be on the screen at the time of the find operation"
    print(error)

accounts = [elem.text for elem in accounts_elems]
comments = [elem.text for elem in comment_elems]

for comment_index in range(len(comment_elems)):
    yield {
        'title': title,
        'url': driver.current_url,
        'account': accounts[comment_index],
        'comment': comments[comment_index]
    }
