YouTube comment number scraping with Selenium - Python

In my project I am trying to scrape the YouTube view count, comment count, and like/dislike counts. I can get everything except the comment count; I have tried different methods but nothing changes. Here is my code, please help me:
import selenium
from selenium import webdriver
import pandas as pd
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# we launch the browser; chromedriver must be on the PATH
driver = webdriver.Chrome()
# we need a data structure to save the variables
data = {'Likes' : [], 'Dislikes' : [], 'Comments' : [], 'Views' : []}
dataframe = pd.DataFrame(data)
# we get the link
driver.get("https://www.youtube.com/watch?v=fHI8X4OXluQ")
# we wait for the page to load
time.sleep(5)
# we find the elements by absolute XPath, i.e. located manually
Likes = driver.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[8]/div[2]/ytd-video-primary-info-renderer/div/div/div[3]/div/ytd-menu-renderer/div[2]/ytd-toggle-button-renderer[1]/a/yt-formatted-string').text
Dislikes = driver.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[8]/div[2]/ytd-video-primary-info-renderer/div/div/div[3]/div/ytd-menu-renderer/div[2]/ytd-toggle-button-renderer[2]/a/yt-formatted-string').text
View = driver.find_elements_by_xpath('//div[@id="count"]')
Comments = driver.find_elements_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/ytd-comments/ytd-item-section-renderer/div[1]/ytd-comments-header-renderer/div[1]/h2/yt-formatted-string/span[1]')
print(Likes)
print(Dislikes)
print(View[1].text)
print(Comments)
driver.quit()

Basically, something like this should work:
import selenium
from selenium import webdriver
import pandas as pd
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# we launch the browser; chromedriver must be on the PATH
driver = webdriver.Chrome()
# we need a data structure to save the variables
data = {'Likes' : [], 'Dislikes' : [], 'Comments' : [], 'Views' : []}
dataframe = pd.DataFrame(data)
# we get the link
driver.get("https://www.youtube.com/watch?v=fHI8X4OXluQ")
# we wait for the page to load
time.sleep(5)
likes_xpath = '(//div[@id="top-level-buttons-computed"]//*[contains(@aria-label," likes")])[last()]'
# we find the element by XPath (located manually)
Likes = driver.find_element_by_xpath(likes_xpath).text
dislikes_xpath = '//div[@id="top-level-buttons-computed"]//*[contains(@aria-label," dislikes")]'
Dislikes = driver.find_element_by_xpath(dislikes_xpath).text
views_xpath = '//*[name()="ytd-video-view-count-renderer"]/span[@class="view-count style-scope ytd-video-view-count-renderer"]'
View = driver.find_elements_by_xpath(views_xpath)
comments_xpath = '//*[name()="ytd-comment-renderer"]//*[name()="yt-formatted-string" and @id="content-text"]'
Comments = driver.find_elements_by_xpath(comments_xpath)
print(Likes)
print(Dislikes)
print(View[1].text)
print(Comments)
driver.quit()
However, there are a lot of comments there, so in order to get all of them you will have to scroll the page.
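For example, a minimal scrolling sketch (reusing the deprecated find_elements_by_xpath API and the comments XPath from above; the scroll step, the delay, and the stop condition are assumptions you may need to tune):
# keep scrolling until no new comments load (assumed stop condition)
comments_xpath = '//*[name()="ytd-comment-renderer"]//*[name()="yt-formatted-string" and @id="content-text"]'
last_count = -1
while True:
    driver.execute_script("window.scrollBy(0, 3000);")  # scroll one step down
    time.sleep(2)  # give YouTube time to lazy-load the next batch of comments
    comments = driver.find_elements_by_xpath(comments_xpath)
    if len(comments) == last_count:
        break  # no new comments appeared, so assume we reached the bottom
    last_count = len(comments)
print(len(comments))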

See if this works for the comments count:
elem = driver.find_element_by_xpath(".//div[@class='style-scope ytd-comments-header-renderer' and @id='title']//following-sibling::yt-formatted-string[contains(@class,'ytd-comments-header-renderer')]/span[1]")
driver.execute_script("arguments[0].scrollIntoView();", elem)
print(elem.text)

Related

How to get Instagram Number of Posts, Number of Followers, and Number of Following using Selenium?

Firstly, I'm sorry for my poor English. I'm kinda new to Python. I would like to know how to scrape the Instagram number of posts, number of followers, and number of following for certain accounts (I try to loop over them) and store the data in a CSV file.
I've been trying to figure out the XPath, and I thought my XPath was already correct, so what did I miss?
Here is my code:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome import service
from selenium.webdriver.common.keys import Keys
import time
import wget
import os
import pandas as pd
import matplotlib.pyplot as plt
from selenium.webdriver.chrome.service import Service
urls = [
'https://www.instagram.com/acc_1/',
'https://www.instagram.com/acc_2/',
'https://www.instagram.com/acc_3/',
'https://www.instagram.com/acc_4/',
'https://www.instagram.com/acc_5/',
'https://www.instagram.com/acc_6/',
'https://www.instagram.com/acc_7/',
'https://www.instagram.com/acc_8/',
'https://www.instagram.com/acc_9/',
'https://www.instagram.com/acc_10/',
'https://www.instagram.com/acc_11/',
'https://www.instagram.com/acc_12/',
'https://www.instagram.com/acc_13/',
'https://www.instagram.com/acc_14/'
]
username_channel = []
number_of_post_chan = []
followers_chan = []
followings_chan = []
description_chan = []
# open directly
#collecting_data
for url in urls:
    PATH = r'C:\webdrivers\chromedriver.exe'
    driver = webdriver.Chrome(PATH)
    driver.get(url)
    #driver.maximize_window()
    driver.implicitly_wait(10)
    #log-in
    login = driver.find_element(By.XPATH, "//input[@name='username']")
    login.clear()
    login.send_keys('xxxxx')
    driver.implicitly_wait(5)
    login_pass = driver.find_element(By.XPATH, "//input[@name='password']")
    login_pass.clear()
    login_pass.send_keys('xxxxx')
    driver.implicitly_wait(5)
    button_login = driver.find_element(By.XPATH, "//form[@id='loginForm']/div/div[3]/button/div")
    button_login.click()
    time.sleep(3)
    #Save Your Login info?
    login_info = driver.find_element(By.XPATH, "//div[@class='cmbtv']/button")
    login_info.click()
    time.sleep(10)
    driver.implicitly_wait(5)
    usernameChan = driver.find_element(By.XPATH, "//h2[@class='_aacl _aacs _aact _aacx _aada']").text
    numb_of_post = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[1]/div/span").text
    followers = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[2]/a/div/span").get_attribute('title')
    followings = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[3]/a/div/span").text
    description = driver.find_element(By.XPATH, "//div[@class='_aa_c']/div").text
    #username_channel.append(usernameChan)
    #number_of_post_chan.append(numb_of_post)
    #followers_chan.append(followers)
    #followings_chan.append(followings)
    #description_chan.append(description)
    print(username_channel, number_of_post_chan, followers_chan, followings_chan, description_chan)
    account_items = {
        "username_ig" : username_channel,
        "jumlah_posting" : number_of_post_chan,
        "followers" : followers_chan,
        "followings" : followings_chan,
        "deskripsi" : description_chan
    }
    driver.quit()
df = pd.DataFrame(account_items, columns=["username_ig", "jumlah_posting", "followers", "followings", "deskripsi"])
print(df)
Is there any better way to express the element? Heeelp.
Thank you in advance.
To get the username, number of posts, followers, following and description, you can select the elements using CSS_SELECTOR.
In your code, after the third driver.implicitly_wait(5) statement, replace the next 5 lines with the following.
usernameChan = driver.find_element(By.CSS_SELECTOR,"h2._aacl._aacs._aact._aacx._aada").text
details = driver.find_elements(By.CSS_SELECTOR, "span._ac2a._ac2b")
numb_of_post = details[0].text
followers = details[1].text
followings = details[2].text
description = driver.find_element(By.CSS_SELECTOR, "div._aacl._aaco._aacu._aacx._aad6._aade").text
EDIT: As you said, you got an IndexError: list index out of range while fetching the details above. This is probably because the elements had not loaded yet. With the imports below, replace the line where we fetch details with the line in the code below.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
details = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span._ac2a._ac2b")))
The problem there is that the selector depends on whether the window is expanded or not; one way to cope with that is sketched below.
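A defensive sketch (untested; the fallback selector is a placeholder, you would need to inspect the page for the real class names of the collapsed layout): maximize the window first so the expanded layout is used, and fall back to a second selector on timeout.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver.maximize_window()  # force the expanded layout before querying
try:
    details = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span._ac2a._ac2b")))
except TimeoutException:
    # placeholder fallback for the collapsed layout -- replace with the real selector
    details = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span._ac2a")))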

How to scrape each product page (comments and customer country)

I am trying to scrape each product page from the AliExpress website in order to get the number of comments, the number of photos published by the customer, and also the customer country, and put them in a dataframe.
I have written code that scrapes the customer country, but I don't know how to get the number of customer comments and the number of images.
This is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
wait = WebDriverWait(driver, 10)
driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))
data=[]
while True:
    for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
        try:
            country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
        except:
            country = None
        data.append({
            'country': country,
        })
    try:
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
    except:
        break
pd.DataFrame(data).to_csv('filename.csv', index=False)
I would appreciate any help from you! Thank you!
If you want the number of comments/reviews, you can just check the value in this section of the page:
driver.find_element(By.XPATH, 'XPATH_OF_ELEMENT_TO_SCRAP')
To do so in your example, let's do this outside your loop:
number_feedbacks = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]/div[1]')
number_images = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]//label[1]/em')
If you don't understand or know this function, please feel free to ask and I will explain where I found these XPaths. We can also use the find-by-id function, as in the sketch below.
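For instance, a minimal find-by-id sketch (assuming the container id transction-feedback seen in the XPaths above):
# grab the whole feedback section directly by its id and read its text
feedback_section = driver.find_element(By.ID, 'transction-feedback')
print(feedback_section.text)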
In your code it would be:
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
wait = WebDriverWait(driver, 10)
driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))
data=[]
number_feedbacks = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]/div[1]')
number_images = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]//label[1]/em')
print(f'number_feedbacks = {number_feedbacks.text}\nnumber_images = {number_images.text}')
while True:
    for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
        try:
            country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
        except:
            country = None
        data.append({
            'country': country,
        })
    try:
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
    except:
        break
pd.DataFrame(data).to_csv('filename.csv', index=False)

Iterate over a list in Selenium and save the results into a dataframe

I'm trying to iterate over a list, search on a webpage via Selenium, and store the results in a df. How can I store the loop results from each list item in a df?
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import pandas as pd
import numpy as np
url = 'https://au.finance.yahoo.com/australia/'
driver_path = 'chromedriver.exe'
browser = Chrome(executable_path=driver_path)
browser.get(url)
loop_search = browser.find_element_by_id('yfin-usr-qry')
search_companies = ['Commonwealth Bank','Rio Tinto','Wesfarmers']
for i in search_companies:
    loop_search.send_keys(i)
    browser.find_element_by_id('search-button').click()
    comp = browser.find_element_by_id('quote-header-info').text
    df3 = [comp]
Still fairly new to Python! Thank you!
If you just run your code and do print(comp)
you'd see the below error:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=99.0.4844.74)
So before saving it to the DF, we need to resolve this.
That can be fixed by redefining the web element inside the loop, like this:
loop_search = wait.until(EC.visibility_of_element_located((By.ID, "yfin-usr-qry")))
Full code to save it to DF:
driver_path = 'chromedriver.exe'
browser = Chrome(executable_path=driver_path)
wait = WebDriverWait(browser, 20)
url = 'https://au.finance.yahoo.com/australia/'
browser.get(url)
search_companies = ['Commonwealth Bank','Rio Tinto','Wesfarmers']
company_details_lst = []
for i in search_companies:
    time.sleep(2)
    loop_search = wait.until(EC.visibility_of_element_located((By.ID, "yfin-usr-qry")))
    loop_search.send_keys(i)
    time.sleep(2)
    wait.until(EC.element_to_be_clickable((By.ID, "search-button"))).click()
    time.sleep(2)
    comp = wait.until(EC.element_to_be_clickable((By.ID, "quote-header-info"))).text
    company_details_lst.append(comp)
    #print(comp)
data = {
    'Details': company_details_lst
}
df = pd.DataFrame.from_dict(data)
df.to_csv('output.csv', index=0)
Imports:
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
After running the code you should see a CSV file named output.csv in your project folder, containing one row of details per company.

For loop keeps iterating over first piece of data in list

This code scrapes the HTML table from https://www.asx.com.au/asx/statistics/prevBusDayAnns.do and downloads PDF files for specific ASX codes and headlines. When the for loop iterates over the ASX codes found in 'data', it iterates over the first ASX code five times, which creates five duplicates of the same PDF. For example, in the code below there would be five copies of TWD. The number of times the for loop iterates over the first ASX code is equal to the number of ASX codes in 'data'. For example, if there were ten codes, I would end up with ten copies of the PDF file for TWD. This only happens to the first ASX code; everything else is fine. Any reason why this is happening?
Relevant code:
driver.get("https://www.asx.com.au/asx/statistics/prevBusDayAnns.do")
data = ['TWD', 'GEM', 'AT1','TKF','GDF']
asxcodes = []
for d in data:
    try:
        asxcode = driver.find_element_by_xpath("//table//tr//td[text()='{}']/following-sibling::td[3]/a[contains(.,'{}')]".format(d, "Becoming a substantial holder")).get_attribute("href")
        asxcodes.append(asxcode)
    except:
        pass
Entire code:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import time
chromeOptions = webdriver.ChromeOptions()
prefs = {"plugins.always_open_pdf_externally": True,"download.default_directory" : r"C:\Users\Harrison Pollock\Desktop\The Smarts\Becoming a Substantial Holder"}
chromeOptions.add_experimental_option("prefs",prefs)
chromedriver = r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe",chrome_options=chromeOptions)
driver.get("https://www.asx.com.au/asx/statistics/prevBusDayAnns.do")
data = ['TWD', 'GEM', 'AT1','TKF','GDF']
asxcodes = []
for d in data:
    try:
        asxcode = driver.find_element_by_xpath("//table//tr//td[text()='{}']/following-sibling::td[3]/a[contains(.,'{}')]".format(d, "Becoming a substantial holder")).get_attribute("href")
        asxcodes.append(asxcode)
    except:
        pass
for asxcode in asxcodes:
    driver.get(asxcode)
    WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, "//input[@value='Agree and proceed']"))).click()
    time.sleep(10)
Instead of getting all the href values and then iterating, could you try something like clicking on each link and then clicking to download?
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import time
chromeOptions = webdriver.ChromeOptions()
prefs = {"plugins.always_open_pdf_externally": True,"download.default_directory" : r"C:\Users\Harrison Pollock\Desktop\The Smarts\Becoming a Substantial Holder"}
chromeOptions.add_experimental_option("prefs",prefs)
chromedriver = r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe",chrome_options=chromeOptions)
driver.get("https://www.asx.com.au/asx/statistics/prevBusDayAnns.do")
data = ['TWD', 'GEM', 'AT1','TKF','GDF']
asxcodes = []
for d in data:
    try:
        driver.find_element_by_xpath("//table//tr//td[text()='{}']/following-sibling::td[3]/a[contains(.,'{}')]".format(d, "Becoming a substantial holder")).click()
        WebDriverWait(driver, 5).until(EC.number_of_windows_to_be(2))
        driver.switch_to.window(driver.window_handles[-1])
        WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//input[@value='Agree and proceed']"))).click()
        time.sleep(10)
        driver.close()
        driver.switch_to.window(driver.window_handles[-1])
    except:
        driver.switch_to.window(driver.window_handles[-1])
        continue
Hope this logic helps.

How to print the open pdf link from using Selenium in Python?

I am not able to print the link of the final PDF which opens after running the given code.
from selenium import webdriver
from selenium.webdriver.support import ui
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
def page_is_loaded(driver):
    return driver.find_element_by_tag_name("body") != None
def check_exists_by_text(text):
    try:
        driver.find_element_by_link_text(text)
    except NoSuchElementException:
        return False
    return True
driver = webdriver.Chrome("C:/Users/Roshan/Desktop/sbi/chromedriver")
driver.maximize_window()
driver.get("http://www.careratings.com/brief-rationale.aspx")
wait = ui.WebDriverWait(driver,10)
wait.until(page_is_loaded)
location_field = driver.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = driver.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
driver.find_element_by_xpath("//input[#name='btn_submit']").click()
if check_exists_by_text('Reliance Capital Limited'):
elm =driver.find_element_by_link_text('Reliance Capital Limited')
driver.implicitly_wait(5)
elm.click()
driver.implicitly_wait(50)
#time.sleep(5)
#driver.quit()
else :
print("Company is not rated in the given Date range")
I am expecting the actual output to be the link to this PDF:
"http://www.careratings.com/upload/CompanyFiles/PR/Reliance%20Capital%20Ltd.-05-18-2019.pdf"
but I do not know how to print this link.
You need to find all the elements in the table, then extract the data from them.
from selenium import webdriver
import os
# setup path to chrome driver
chrome_driver = os.getcwd() + '/chromedriver'
# initialise chrome driver
browser = webdriver.Chrome(chrome_driver)
# load url
browser.get('http://www.careratings.com/brief-rationale.aspx')
# setup date range
location_field = browser.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = browser.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
browser.find_element_by_xpath("//input[@name='btn_submit']").click()
# get all data rows
content = browser.find_elements_by_xpath('//*[@id="divManagementSpeak"]/table/tbody/tr/td/a')
# get text and href link from each element
collected_data = []
for item in content:
    url = item.get_attribute("href")
    description = item.get_attribute("innerText")
    collected_data.append((url, description))
Output:
('http://www.careratings.com/upload/CompanyFiles/PR/Ashwini%20Frozen%20Foods-05-21-2019.pdf', 'Ashwini Frozen Foods')
('http://www.careratings.com/upload/CompanyFiles/PR/Vanita%20Cold%20Storage-05-21-2019.pdf', 'Vanita Cold Storage')
and so on
I would say you just need to put this line:
pdf_link = elm.get_attribute("href")
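For instance, in the if branch of your code (a sketch; this assumes the anchor's href already points at the PDF, so read it before clicking):
if check_exists_by_text('Reliance Capital Limited'):
    elm = driver.find_element_by_link_text('Reliance Capital Limited')
    pdf_link = elm.get_attribute("href")  # read the link before the click navigates away
    print(pdf_link)
    elm.click()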
You have missed one important part to click on. When you enter some text in the company search input box, a dropdown is projected downward displaying the matching search results to choose from. Once you click on one of those, the rest works as is.
Try the following script:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = "http://www.careratings.com/brief-rationale.aspx"
with webdriver.Chrome() as driver:
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    location_field = wait.until(EC.presence_of_element_located((By.NAME, "txtfromdate")))
    location_field.send_keys("2019-05-06")
    last_date = wait.until(EC.presence_of_element_located((By.NAME, "txttodate")))
    last_date.send_keys("2019-05-21")
    input_search = wait.until(EC.presence_of_element_located((By.NAME, "txtSearchCompany_brief")))
    input_search.send_keys('Reliance Capital Limited')
    time.sleep(3)  #could not get rid of this hardcoded delay to make the script work
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "[onclick*='Reliance Capital Limited']"))).click()
    # time.sleep(2)  #activate this line in case the script behaves otherwise
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "input[name='btn_submit']"))).click()
    for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table tr td > a[href$='.pdf']"))):
        print(item.get_attribute("href"))
