Apologies for my English; I'm fairly new to Python. How can I skip a for-loop iteration when a web element does not exist, or fill it with another value instead? I've been trying to scrape YouTube channels to get each video's title, views, and date posted. My code looks like this:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome import service
from selenium.webdriver.common.keys import Keys
import time
import wget
import os
import pandas as pd
import matplotlib.pyplot as plt

urls = [
    'https://www.youtube.com/c/LofiGirl/videos',
    'https://www.youtube.com/c/Miawaug/videos'
]

for url in urls:
    PATH = r'C:\webdrivers\chromedriver.exe'
    driver = webdriver.Chrome(PATH)
    driver.get(url)
    #driver.maximize_window()
    driver.implicitly_wait(10)

    # scroll to the bottom repeatedly so more videos get loaded
    for i in range(10):
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        driver.implicitly_wait(20)
        time.sleep(5)

    judul_video = []       # video titles
    viewers = []           # view counts
    tanggal_posting = []   # dates posted

    titles = driver.find_elements(By.XPATH, "//a[@id='video-title']")
    views = driver.find_elements(By.XPATH, "//div[@id='metadata-line']/span[1]")
    DatePosted = driver.find_elements(By.XPATH, "//div[@id='metadata-line']/span[2]")

    for title in titles:
        judul_video.append(title.text)
        driver.implicitly_wait(5)

    for view in views:
        viewers.append(view.text)
        driver.implicitly_wait(5)

    for posted in DatePosted:
        tanggal_posting.append(posted.text)
        driver.implicitly_wait(5)

    vid_item = {
        "video_title": judul_video,
        "views": viewers,
        "date_posted": tanggal_posting
    }

    df = pd.DataFrame(vid_item, columns=["video_title", "views", "date_posted"])
    #df_new = df.transpose()
    print(df)

    filename = url.split('/')[-2]
    df.to_csv(rf"C:\Users\.......\YouTube_{filename}.csv", sep=",")
    driver.quit()
That code works fine, but in this part:
for posted in DatePosted:
    tanggal_posting.append(posted.text)
    driver.implicitly_wait(5)
when a channel is live streaming, such as Lofi Girl, I get an error saying "All arrays must be of the same length". Apparently I failed to create an if/else condition that fills a streaming channel with another value, such as tanggal_posting.append("Live Stream"), or that skips the data extraction entirely, starting from the title. The code below tries to skip or fill in another value, but fails:
for posted in DatePosted:
    if len(posted.text) > 0:
        tanggal_posting.append(posted.text)
        driver.implicitly_wait(5)
    else:
        tanggal_posting.append("Live")
        driver.implicitly_wait(5)
How can I skip the iteration just for a single video that is shown as a live stream? Or how can I fill in another value, such as "Live Stream", using an if/else condition as mentioned above? Thank you so much in advance.
Personally, I'd check first if posted is viable for a .text attribute call.
for posted in DatePosted:
    _posted = posted.text.strip() if posted else None
    tanggal_posting.append(_posted if _posted else "Live")
    driver.implicitly_wait(5)
Alternatively:
for posted in DatePosted:
    _posted = posted.text.strip() if posted else None
    if not _posted:
        continue
    tanggal_posting.append(_posted)
    driver.implicitly_wait(5)
The overall code will differ depending on your objective, though I suppose _posted will be helpful either way.
Instead of collecting 3 separate lists, one per data item, I'd suggest getting the list of videos and then extracting and handling each item:
videos = driver.find_elements(By.XPATH, "//div[@id='items']/ytd-grid-video-renderer")
for video in videos:
    if not video.find_elements(By.XPATH, ".//yt-icon"):  # check there is no streaming icon
        title = video.find_element(By.XPATH, ".//a[@id='video-title']")
        view = video.find_element(By.XPATH, ".//div[@id='metadata-line']/span[1]")
        DatePosted = video.find_element(By.XPATH, ".//div[@id='metadata-line']/span[2]")
Note that you need to call driver.implicitly_wait(<SECONDS>) only ONCE, at the beginning of the script!
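To tie it together, here is a minimal sketch of how that per-video loop could feed the DataFrame directly, filling live streams with a placeholder instead of dropping them. The selectors are the ones above, the record layout is just one option, and the spans indexing assumes a normal video has exactly two metadata spans:

records = []
videos = driver.find_elements(By.XPATH, "//div[@id='items']/ytd-grid-video-renderer")
for video in videos:
    title = video.find_element(By.XPATH, ".//a[@id='video-title']").text
    spans = video.find_elements(By.XPATH, ".//div[@id='metadata-line']/span")
    if video.find_elements(By.XPATH, ".//yt-icon"):  # streaming icon present -> live video
        records.append({"video_title": title,
                        "views": spans[0].text if spans else "",
                        "date_posted": "Live Stream"})
    else:
        records.append({"video_title": title,
                        "views": spans[0].text,
                        "date_posted": spans[1].text})

df = pd.DataFrame(records)  # every row has all three keys, so no length mismatch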
There is a web site where I can get the data I need with Python / Selenium (I am new to Selenium and Python). On the web page there are tabs; I can get the data on the first tab, since that one is active by default, but I cannot get the data on the second tab. I attached an image: it shows the data in the Overview tab, and I want to get the data in the Fundamental tab as well. The web page is investing.com. As for the code (I have not used everything yet; some imports were added for future use):
from time import sleep, strftime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import smtplib
from email.mime.multipart import MIMEMultipart
from bs4 import BeautifulSoup
url = 'https://www.investing.com/stock-screener/?sp=country::6|sector::a|industry::a|equityType::a|exchange::a|last::1,1220|avg_volume::250000,15950000%3Ceq_market_cap;1'
chrome_path = 'E:\\BackUp\\IT\\__Programming\\Python\\_Scripts\\_Ati\\CSV\\chromedriver'
driver = webdriver.Chrome(chrome_path)
#driver = webdriver.Chrome()
driver.implicitly_wait(10)
driver.get(url)
my_name = driver.find_elements_by_xpath("//td[@data-column-name='name_trans']")
my_symbol = driver.find_elements_by_xpath("//td[@data-column-name='viewData.symbol']")
my_last = driver.find_elements_by_xpath("//td[@data-column-name='last']")
my_change = driver.find_elements_by_xpath("//td[@data-column-name='pair_change_percent']")
my_marketcap = driver.find_elements_by_xpath("//td[@data-column-name='eq_market_cap']")
my_volume = driver.find_elements_by_xpath("//td[@data-column-name='turnover_volume']")
All of the code above works. The XPath for the second tab does not. The PE ratio is in the second tab (in Fundamental).
I tried these three:
my_peratio = driver.find_elements_by_xpath("//*[@id='resultsTable']/tbody/tr[1]/td[4]")
my_peratio = driver.find_elements_by_xpath("//*[@id='resultsTable']")
my_peratio = driver.find_elements_by_xpath("//td[@data-column-name='eq_pe_ratio']")
There are no error messages, but the list my_peratio has nothing in it; it is empty. I would really appreciate it if you could point me in the right direction.
Thanks a lot,
Ati
Probably the data shown on the second tab is loaded dynamically. In that case, you have to click on the second tab first so that the data appears.
driver.find_element_by_xpath("selector_for_second_tab").click()
After that it should be possible to get the data.
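For example, something along these lines (the XPath for the Fundamental tab is a guess, so you'd need to inspect the page for the real locator; the P/E column selector is the one from the question):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Click the "Fundamental" tab (hypothetical locator -- inspect the page for the real one)
driver.find_element_by_xpath("//a[contains(text(), 'Fundamental')]").click()

# Wait until the P/E cells actually exist before reading them
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, "//td[@data-column-name='eq_pe_ratio']"))
)
my_peratio = driver.find_elements_by_xpath("//td[@data-column-name='eq_pe_ratio']")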
I am relatively new to Python and tried my hand at Selenium to download 5 years of QoQ historical financial records for 800 companies. I was quite happy with the results, but needed quite a few manual workarounds to get what I wanted (which I eventually got).
One strange thing, though: sometimes it filled in the company search field with a string that was not even in the Python code (e.g. the list contained "IP", but the field searched for "ARIP"). I suspect it is using the first item in the dropdown list instead of the item typed in (see the sketch after the code below).
Please don't laugh too much at the attached code; I am not a pro, but it works and that's all that matters. Any advice / teaching / code changes I could learn from would be appreciated.
PS - I had tried Selenium several times before and it never worked. Then I stumbled on a thread on Stack Overflow about XPath, and it worked like a charm!
import selenium
import pandas as pd
from selenium import webdriver
from pandas import DataFrame
import time
from selenium.webdriver.support.ui import Select
import os
import shutil
import glob
driver = webdriver.Chrome()
driver.get('MyURL')
username_input = '//*[@id="username"]'
password_input = '//*[@id="password"]'
login_submit = '//*[@id="login-btn"]'

driver.find_element_by_xpath(username_input).send_keys('MyLogin')
driver.find_element_by_xpath(password_input).send_keys('MyPwD')
driver.find_element_by_xpath(login_submit).click()
time.sleep(10)

dropdown = driver.find_element_by_id("menu-Company")
dropdown.click()
driver.find_element_by_xpath('//*[@id="Financial Statements"]').click()
companyname=['ABPIF','AJA','ALLA','ALT','ALUCON','AMA','AMANAH','AP','AQUA','ARIN','B','BC','BM','CHAYO','CHEWA','CHG','CI','CIG','CIMBT','CITY','CK','CKP','CM','CMC','CMO','CNS','CPW','CRANE','CRD','CSC','CSP','CSS','CTARAF','CTW','CWT','D','DDD','DELTA','DEMCO','DIF','DIMET','DOD','DOHOME','DREIT','DRT','DTAC','DTC','EA','EASON','EASTW','ECF','ECL','EE','EGATIF','EGCO','EIC','EKH','EMC','EPCO','EPG','ERW','ESSO','ETE','EVER','F&D','FANCY','FE','FLOYD','FMT','FN','FNS','FORTH','FSS','FTE','FVC','GAHREIT','GBX','GC','GCAP','GEL','GENCO','GFPT','GIFT','GJS','GL','GLAND','GLOBAL','GLOCON','GOLD','GOLDPF','GPI','GPSC','GRAND','GREEN','GSC','GSTEEL','GTB','GULF','GUNKUL','GVREIT','GYT','HANA','HARN','HFT','HREIT','ICC','ICHI','ICN','IFEC','ILM','IMPACT','INET','IP','IT','J','K','KC','KKC','M','MC','ML','MM','NC','NDR','NETBAY','NEW','NEWS','NEX','NFC','NINE','NMG','PE','PF','PG','PICO','PK','PL','PRO','PT','PTG','RAM','ROCK','RPC','RS','S','SC','SE','SMART','SR','STA','STAR','STC','T','TC','TH','THAI','TM','TR','TRT','TRUE','TSE','TSI','TTA','TU','U','UP','UT','VI','VL','WG','WORK','WP','WR']
Counter = 0
for name in companyname:
    Counter = Counter + 1
    print('Name:', name)

    companyfield = '//*[@id="input-search"]'
    driver.find_element_by_xpath(companyfield).send_keys(name)
    driver.find_element_by_xpath(companyfield).send_keys(u'\ue007')  # \ue007 is the Enter key

    if Counter == 1:
        # Give some time to set up the web page, i.e. QoQ, from and to date,
        # and Profit and Loss or Balance Sheet
        time.sleep(30)
    elif Counter == 30:
        time.sleep(20)  # give the web site a breather

    driver.find_element_by_xpath('//*[#id="export-link"]'.replace("#", "@")).click() if False else driver.find_element_by_xpath('//*[@id="export-link"]').click()
    time.sleep(3)
    driver.find_element_by_xpath(companyfield).clear()

    # Strangely, if over 100 xlsx are downloaded it plays up,
    # so move them out to another folder and restart the counter
    time.sleep(0.3)
    if Counter == 70:
        files = glob.iglob(os.path.join("/Users/cg/Downloads", "*.xlsx"))
        for file in files:
            if os.path.isfile(file):
                ts = time.time()
                shutil.copy2(file, "/Users/cg/Python/junk/SETScrape/Output1/" + str(ts) + ".xlsx")
        files = glob.glob("/Users/cg/Downloads/*.xlsx")
        for f in files:
            os.remove(f)
        Counter = 2
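Regarding the autocomplete problem described above: rather than pressing Enter (which may accept whatever suggestion the dropdown highlights first, e.g. "ARIP" when typing "IP"), one could wait for the suggestion whose text matches the name exactly and click it. This is only a sketch; the //li locator for the suggestion list is hypothetical, since the site's markup isn't shown:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

field = driver.find_element_by_xpath('//*[@id="input-search"]')
field.clear()
field.send_keys(name)

# Click the exact-match suggestion instead of sending Enter
exact = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//li[normalize-space(text())='%s']" % name))
)
exact.click()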
I want to extract all the fantasy teams that have been entered in past contests. To loop through the dates, I just change a small part of the URL, as shown in my code below:
# Packages:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import pandas as pd

# Driver
chromedriver = "C:/Users/Michel/Desktop/python/package/chromedriver_win32/chromedriver.exe"
driver = webdriver.Chrome(chromedriver)

# Dataframes that will be used later
results = pd.DataFrame()
best_lineups = pd.DataFrame()
opti_lineups = pd.DataFrame()

# For loop over all dates:
calendar = []
calendar.append("2019-01-10")
calendar.append("2019-01-11")

for d in calendar:
    driver.get("https://rotogrinders.com/resultsdb/date/" + d + "/sport/4/")
Then, to access the different contests of that day, you need to click on the contest tab. I use the following code to locate and click on it.
# Find the "Contest" tab
contest = driver.find_element_by_xpath("//*[@id='root']/div/main/main/div[2]/div[3]/div/div/div[1]/div/div/div/div/div[3]")
contest.click()
I simply inspected and copied the XPath of the tab. It works most of the time, but sometimes I get the error "Unable to locate element...". Moreover, it seems to work only for the first date in my calendar loop and always fails on the next iteration; I do not know why. I tried to locate it differently, but I feel I am missing something, such as:
contests = driver.find_element_by_xpath("//*[@role='tab']")
Once the contest tab is successfully clicked, all contests of that day are shown, and you can click a link to access all the entries of a contest. I stored the contests in order to iterate through all of them, as follows:
list_links = driver.find_elements_by_tag_name('a')

hlink = []
for ii in list_links:
    hlink.append(ii.get_attribute("href"))

sub = "https://rotogrinders.com/resultsdb"
con = "contest"

contest_list = []
for text in hlink:
    if sub in text:
        if con in text:
            contest_list.append(text)

# Iterate through all the entries (users) of a contest and extract
# the information of the team entered by each user
for c in contest_list:
    driver.get(c)
Then I want to extract all the participants' teams entered in the contest and store them in a dataframe. I am able to do this successfully for the first page of the contest.
# Waits until tables are loaded and have text; times out after 60 seconds
while WebDriverWait(driver, 60).until(ec.presence_of_element_located((By.XPATH, './/tbody//tr//td//span//a[text() != ""]'))):
    # while ????:

    # Get tables to get the user names
    tables = pd.read_html(driver.page_source)
    users_df = tables[0][['Rank', 'User']]
    users_df['User'] = users_df['User'].str.replace(' Member', '')

    # Initialize results dataframe and iterate through users
    for i, row in users_df.iterrows():
        rank = row['Rank']
        user = row['User']

        # Find the user name and click on it
        user_link = driver.find_elements(By.XPATH, "//a[text()='%s']" % (user))[0]
        user_link.click()

        # Get the lineup table after clicking on the user name
        tables = pd.read_html(driver.page_source)
        lineup = tables[1]
        #print(user)
        #print(lineup)

        # Restructure to put into the results dataframe
        lineup.loc[9, 'Name'] = lineup.iloc[9]['Salary']
        lineup.loc[10, 'Name'] = lineup.iloc[9]['Pts']
        temp_df = pd.DataFrame(lineup['Name'].values.reshape(-1, 11),
                               columns=lineup['Pos'].iloc[:9].tolist() + ['Total_$', 'Total_Pts'])
        temp_df.insert(loc=0, column='User', value=user)
        temp_df.insert(loc=0, column='Rank', value=rank)
        temp_df["Date"] = d
        results = results.append(temp_df)

    #next_button = driver.find_elements_by_xpath("//button[@type='button']")
    #next_button[2].click()

results = results.reset_index(drop=True)
driver.close()
However, there are other pages, and to access them you need to click on the small arrow (next button) at the bottom. Moreover, you can click indefinitely on that button, even if there are no more entries. Therefore, I would like to be able to loop through all pages with entries, stop when there are no more, and move on to the next contest. I tried to implement a while loop to do so, but my code did not work...
You must really make sure that the page loads completely before you do anything on it.
Moreover, it seems to work only for the first date in my calendar loop
and always fails in the next iteration
Usually when Selenium loads a browser page, it tries to look for the element even if the page is not loaded all the way. I suggest you recheck the XPath of the element you are trying to click.
Also, try to see when the page loads completely and use time.sleep(<number of seconds>) to make sure you hit the element, or check for a particular element, or a property of an element, that tells you the page has finished loading.
One more suggestion: you can use driver.current_url to see which page you are targeting. I had this issue while I was working with multiple tabs, and I had to tell Python/Selenium to manually switch to the right tab.
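For instance, an explicit wait before clicking the contest tab, using the role-based locator from the question (note it may match more than one tab, so it might need narrowing):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By

# Wait up to 30s for the tab to be clickable, instead of relying on timing
contest = WebDriverWait(driver, 30).until(
    ec.element_to_be_clickable((By.XPATH, "//*[@role='tab']"))
)
contest.click()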
I had some experience with coding before, but not specifically for web applications. I have been tasked with getting data from this website: http://www.b3.com.br/pt_br/market-data-e-indices/servicos-de-dados/market-data/consultas/mercado-de-derivativos/precos-referenciais/taxas-referenciais-bm-fbovespa/
The data are available on a day-to-day basis. I have used Selenium in Python, and so far the results are good: I can get the entire table, store it in a pandas dataframe, and then push it to a MySQL database and so on. The problem is: the result from the website is always the same!
Here is my code:
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time
import pandas as pd

def GetDataFromWeb(day, month, year):
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    # had to use these two below because of webdriver crashing issues
    options.add_argument('no-sandbox')
    options.add_argument('disable-dev-shm-usage')

    driver = webdriver.Chrome(chrome_options=options)
    driver.get("http://www.b3.com.br/pt_br/market-data-e-indices/servicos-de-dados/market-data/consultas/mercado-de-derivativos/precos-referenciais/taxas-referenciais-bm-fbovespa/")

    # the table is inside an iframe
    iframe = driver.find_element_by_id("bvmf_iframe")
    driver.switch_to.default_content()
    driver.switch_to.frame(iframe)

    # getting to the place where I should input the date
    date = driver.find_element_by_id("Data")
    date.send_keys("/".join((str(day), str(month), str(year))))
    driver.find_element_by_tag_name("button").click()

    # wait to be sure it doesn't try to get info from an unloaded page
    time.sleep(5)

    page = bs(driver.page_source, "html.parser")
    table = page.find(id='tb_principal1')
    headers = ['Dias Corridos', '252', '360']
    matrix = []
    for rows in table.select('tr')[2:]:
        values = []
        for columns in rows.select('td'):
            values.append(columns.text.replace(',', '.'))
        matrix.append(values)

    df = pd.DataFrame(data=matrix, columns=headers)
    driver.close()
    # only the first 2 columns are interesting for my purposes
    return df.iloc[:, 0:2]
The table resulting from this function is always the same, no matter what inputs I send. It always seems to correspond to the date 06/09/2018 (month=09, day=06). I think the main problem is that I don't know how the queries to their database are done, so this always runs like a "default date". I have read some people talking about Ajax and JavaScript requests, but I don't know if that's the case here. How can I tell?
This code will work (I updated a few lines in your code):
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time
import pandas as pd

def GetDataFromWeb(day, month, year):
    # to avoid a data error in the date handler
    if month < 10:
        month = "0" + str(month)
    if day < 10:
        day = "0" + str(day)

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    # had to use these two below because of webdriver crashing issues
    options.add_argument('no-sandbox')
    options.add_argument('disable-dev-shm-usage')

    driver = webdriver.Chrome(chrome_options=options)
    driver.get("http://www.b3.com.br/pt_br/market-data-e-indices/servicos-de-dados/market-data/consultas/mercado-de-derivativos/precos-referenciais/taxas-referenciais-bm-fbovespa/")

    # the table is inside an iframe
    iframe = driver.find_element_by_id("bvmf_iframe")
    driver.switch_to.default_content()
    driver.switch_to.frame(iframe)

    # getting to the place where I should input the date
    date = driver.find_element_by_id("Data")
    date.clear()  # to clear the auto-populated date
    date.send_keys(((str(day), str(month), str(year))))  # removed the join part
    driver.find_element_by_tag_name("button").click()

    # wait to be sure it doesn't try to get info from an unloaded page
    time.sleep(50)

    page = bs(driver.page_source, "html.parser")
    table = page.find(id='tb_principal1')
    headers = ['Dias Corridos', '252', '360']
    matrix = []
    for rows in table.select('tr')[2:]:
        values = []
        for columns in rows.select('td'):
            values.append(columns.text.replace(',', '.'))
        matrix.append(values)

    df = pd.DataFrame(data=matrix, columns=headers)
    driver.close()
    # only the first 2 columns are interesting for my purposes
    return df.iloc[:, 0:2]

print(GetDataFromWeb(3, 9, 2018))
It will print the matching data for the required date. I added:

# to avoid a data error in the date handler
if month < 10:
    month = "0" + str(month)
if day < 10:
    day = "0" + str(day)

and:

date.clear()  # to clear the auto-populated date
date.send_keys(((str(day), str(month), str(year))))  # removed the join part

Note: the problem in your code was that the day and month fields take two-digit numbers, so the date.send_keys("/".join((str(day), str(month), str(year)))) line was generating an error, because of which the system date was picked and you always saw the same data for any input date. Also, when you click on the date field it picks up a default date, so we first have to clear that and then send the custom date. Hope this helps.
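As a worked example: with GetDataFromWeb(3, 9, 2018), the handler turns day=3 into "03" and month=9 into "09", so the field receives "03", "09", "2018" in sequence; presumably the masked date input supplies the slashes itself, giving 03/09/2018.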
Update for the additional query: add these imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
and add this line in place of the wait:
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR,'#divContainerIframeBmf > form > div > div > div:nth-child(1) > div:nth-child(3) > div > div > p')))
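In context, that wait goes right after the button click, replacing time.sleep(50):

driver.find_element_by_tag_name("button").click()

# Block until the results area inside the iframe is present
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR,
    '#divContainerIframeBmf > form > div > div > div:nth-child(1) > div:nth-child(3) > div > div > p')))

page = bs(driver.page_source, "html.parser")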