Downloading multiple files from a URL using Selenium and renaming them - Python

The code below attempts to download data from a link that is built by inserting each entry of the "tickers" list into a URL.
I tried putting the rename loop inside the URL for loop, but it produces an error because it runs through the entire tickers list on every iteration.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os
import time
import pandas as pd
import datetime
from datetime import datetime
start = '10/26/2020'
end = '1/22/2023'
tickers = ["ap","dmc","creit","chib","fli","fb","dmc","fph","gma7","ltg",
"mbt",",mreit","nikl","pse","rcr","rlc","rrhi","scc","secb"]
urls = [(f'https://www.wsj.com/market-data/quotes/PH/{ticker}/historical-prices') for ticker in tickers]
path = "/Users/sef/Documents/Py-MSC/chromedriver_mac_arm64/chromedriver"
options = Options()
options.add_experimental_option('detach', True)
s = Service(path)
chromeOptions = webdriver.ChromeOptions()
folder = "/Users/sef/Documents/PSE_Data Repository"
prefs = {"download.default_directory" : folder}
chromeOptions.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(service=s, options=chromeOptions)
wait = WebDriverWait(driver, 10)
for url in urls:
    driver.get(url)
    time.sleep(4)
    beg_date = driver.find_element("xpath", '//*[@id="selectDateFrom"]')
    beg_date.clear()
    beg_date.send_keys(start)
    end_date = driver.find_element("xpath", '//*[@id="selectDateTo"]')
    end_date.clear()
    end_date.send_keys(end)
    driver.find_element("xpath", '//*[@id="datPickerButton"]').click()
    driver.find_element("xpath", '//*[@id="dl_spreadsheet"]').click()
    time.sleep(5)
    for ticker in tickers:
        label = ticker.upper()
        old = r'/Users/sef/Documents/PSE_Data Repository/HistoricalPrices.csv'
        new = f'/Users/sef/Documents/PSE_Data Repository/{label}.csv'
        os.rename(old, new)
How can I modify this so that it only uses the first item in the tickers list and then moves on to the next item for the next iteration of the URL loop?
for ticker in tickers:
    label = ticker.upper()
    old = r'/Users/sef/Documents/PSE_Data Repository/HistoricalPrices.csv'
    new = f'/Users/sef/Documents/PSE_Data Repository/{label}.csv'
    os.rename(old, new)

I'm not familiar with Selenium, but it seems that if you restructure your for loop to loop over the tickers instead of the urls you can achieve your results.
I think your problem is that each loop that fetches data creates a file called 'HistoricalPrices.csv' and it gets overwritten each iteration. You need to perform the rename after each file download before the next iteration of the loop. To do this, loop over the tickers instead of the urls.
for ticker in tickers:
    # get data & download file
    url = f'https://www.wsj.com/market-data/quotes/PH/{ticker}/historical-prices'
    driver.get(url)
    time.sleep(4)
    beg_date = driver.find_element("xpath", '//*[@id="selectDateFrom"]')
    beg_date.clear()
    beg_date.send_keys(start)
    end_date = driver.find_element("xpath", '//*[@id="selectDateTo"]')
    end_date.clear()
    end_date.send_keys(end)
    driver.find_element("xpath", '//*[@id="datPickerButton"]').click()
    driver.find_element("xpath", '//*[@id="dl_spreadsheet"]').click()
    time.sleep(5)
    # rename file
    label = ticker.upper()
    old = r'/Users/sef/Documents/PSE_Data Repository/HistoricalPrices.csv'
    new = f'/Users/sef/Documents/PSE_Data Repository/{label}.csv'
    os.rename(old, new)
Hopefully this solves your problem.
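One caveat: the fixed time.sleep(5) assumes the CSV has finished downloading before the rename runs. A more robust option is to poll the download folder until the file appears and Chrome's temporary .crdownload file is gone. A minimal sketch, with the file name and folder taken from the question and the timeout value being my own choice:
import os
import time

def wait_for_download(final_path, folder, timeout=60):
    # Poll until the finished file exists and no partial .crdownload file remains.
    deadline = time.time() + timeout
    while time.time() < deadline:
        still_downloading = any(name.endswith(".crdownload") for name in os.listdir(folder))
        if os.path.exists(final_path) and not still_downloading:
            return True
        time.sleep(0.5)
    return False

# Inside the ticker loop, after clicking the download button:
folder = "/Users/sef/Documents/PSE_Data Repository"
old = f'{folder}/HistoricalPrices.csv'
if wait_for_download(old, folder):
    os.rename(old, f'{folder}/{ticker.upper()}.csv')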

Related

Append data wrong in csv file

from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
URL = 'https://mergr.com/firms/search/employees?page=1&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'
driver.get(URL)
email = driver.find_element(By.CSS_SELECTOR, "input#username")
email.send_keys("timgr8@outlook.com")
password = driver.find_element(By.CSS_SELECTOR, "input#password")
password.send_keys("Cosmos1990$$$$$$$")
login = driver.find_element(By.CSS_SELECTOR, "button.btn").click()
urls = []
product = []
soup = BeautifulSoup(driver.page_source, "lxml")
details = soup.select("tbody tr")
for detail in details:
    try:
        t1 = detail.select_one("h5.profile-title a").text
    except:
        pass
    wev = {
        'Name': t1,
    }
    product.append(wev)
page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
    href = link.get_attribute("href")
    urls.append(href)
for url in urls:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    try:
        website = soup.select_one("p.adress-info a[target='_blank']").text
    except:
        website = ''
    data = {
        'website': website
    }
    product.append(data)
df = pd.DataFrame(product)
df.to_csv('firm.csv')
The data from the website ends up in the CSV file shifted down, as shown in the picture. I think I am appending the data the wrong way - why is the data moving down, and where am I going wrong? Kindly recommend a fix.
I want the output in the format shown below. Kindly suggest a solution.
You can't append wev and data separately - you need website and name in the same dictionary for pandas to know that they belong to the same row.
You could add the websites to a separate list like
sites = []
# for url in urls:
#     driver.get...
#     soup = ....
#     try:....except:....
        data = {
            'website': website
        }
        sites.append(data)
and then zip and combine:
for pi, dictPair in enumerate(zip(product, sites)):
    product[pi].update(dictPair[1])
df = pd.DataFrame(product)
df.to_csv('firm.csv')
However, I don't think it's the best way to make sure the right Names and Websites are matched up.
You should just add to the same dictionary for each row from the start instead of zipping and merging.
added_urls = []
product = []
soup = BeautifulSoup(driver.page_source, "lxml")
details = soup.select("tbody tr")
for detail in details:
    try:
        t1 = detail.select_one("h5.profile-title a").text
    except:
        # pass # then you'd just be using the previous row's t1
        # [also, if this happens on the first loop, it will raise an error]
        t1 = 'MISSING' # '' #
    wev = {
        'Name': t1,
    }
    href = detail.select_one("h5.profile-title + p a[href]")
    if href and href.get("href", '').startswith('http'):
        wev['page_link'] = href.get("href")
        added_urls.append(href.get("href"))
    product.append(wev)
### IF YOU WANT ROWS THAT CAN'T BE CONNECTED TO NAMES ###
page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
    href = link.get_attribute("href")
    if href in added_urls: continue # skip links that are already added
    # urls.append(href)
    added_urls.append(href)
    product.append({"page_link": href})
##########################################################
for pi, prod in enumerate(product):
    if "page_link" not in prod or not prod["page_link"]: continue ## missing link
    url = prod["page_link"]
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    try:
        website = soup.select_one("p.adress-info a[target='_blank']").text
    except:
        website = ''
    del product[pi]["page_link"] ## REMOVE this line IF you want a page_link column in csv
    # data = {'website': website}
    # product.append(data)
    product[pi]['website'] = website
df = pd.DataFrame(product)
df.to_csv('firm.csv')
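A small aside, not part of either answer above: pandas also writes its automatic row index as an unnamed first column by default, so if you only want the Name/website columns you can pass index=False when saving:
df.to_csv('firm.csv', index=False)  # drops the unnamed index column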

How to get Instagram Number of Post, Number of Followers, and Number of Following using Selenium?

Firstly, I'm sorry for my poor English. I'm kinda new to Python. I would like to know how to scrape the Instagram number of posts, number of followers, and number of following for certain accounts (I try to loop over them) and store the data in CSV files.
I've been trying to figure out the XPath, but I thought my XPath was already correct, so what did I miss?
Here is my code:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome import service
from selenium.webdriver.common.keys import Keys
import time
import wget
import os
import pandas as pd
import matplotlib.pyplot as plt
from selenium.webdriver.chrome.service import Service
urls = [
    'https://www.instagram.com/acc_1/',
    'https://www.instagram.com/acc_2/',
    'https://www.instagram.com/acc_3/',
    'https://www.instagram.com/acc_4/',
    'https://www.instagram.com/acc_5/',
    'https://www.instagram.com/acc_6/',
    'https://www.instagram.com/acc_7/',
    'https://www.instagram.com/acc_8/',
    'https://www.instagram.com/acc_9/',
    'https://www.instagram.com/acc_10/',
    'https://www.instagram.com/acc_11/',
    'https://www.instagram.com/acc_12/',
    'https://www.instagram.com/acc_13/',
    'https://www.instagram.com/acc_14/'
]
username_channel = []
number_of_post_chan = []
followers_chan = []
followings_chan = []
description_chan = []
# open directly
# collecting_data
for url in urls:
    PATH = r'C:\webdrivers\chromedriver.exe'
    driver = webdriver.Chrome(PATH)
    driver.get(url)
    #driver.maximize_window()
    driver.implicitly_wait(10)
    #log-in
    login = driver.find_element(By.XPATH, "//input[@name='username']")
    login.clear()
    login.send_keys('xxxxx')
    driver.implicitly_wait(5)
    login_pass = driver.find_element(By.XPATH, "//input[@name='password']")
    login_pass.clear()
    login_pass.send_keys('xxxxx')
    driver.implicitly_wait(5)
    button_login = driver.find_element(By.XPATH, "//form[@id='loginForm']/div/div[3]/button/div")
    button_login.click()
    time.sleep(3)
    #Save Your Login info?
    login_info = driver.find_element(By.XPATH, "//div[@class='cmbtv']/button")
    login_info.click()
    time.sleep(10)
    driver.implicitly_wait(5)
    usernameChan = driver.find_element(By.XPATH, "//h2[@class='_aacl _aacs _aact _aacx _aada']").text
    numb_of_post = driver.find_element(By.CSS_SELECTOR, "//ul[@class=' _aa_8']/li[1]/div/span").text
    followers = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[2]/a/div/span").get_attribute('title')
    followings = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[3]/a/div/span").text
    description = driver.find_element(By.XPATH, "//div[@class='_aa_c']/div").text
    #username_channel.append(usernameChan)
    #number_of_post_chan.append(numb_of_post)
    #followers_chan.append(followers)
    #followings_chan.append(followings)
    #description_chan.append(description)
    print(username_channel, number_of_post_chan, followers_chan, followings_chan, description_chan)
    account_items = {
        "username_ig" : username_channel,
        "jumlah_posting" : number_of_post_chan,
        "followers" : followers_chan,
        "followings" : followings_chan,
        "deskripsi" : description_chan
    }
    driver.quit()
df = pd.DataFrame(account_items, columns=["username_ig", "jumlah_posting", "followers", "followings", "deskripsi"])
print(df)
Is there a better way to express the element? Help.
Thank you in advance.
To get the username, number of posts, followers, following and description you can select the elements using CSS_SELECTOR.
In your code, after the third driver.implicitly_wait(5) statement, instead of the next 5 lines you can add the following.
usernameChan = driver.find_element(By.CSS_SELECTOR,"h2._aacl._aacs._aact._aacx._aada").text
details = driver.find_elements(By.CSS_SELECTOR, "span._ac2a._ac2b")
numb_of_post = details[0].text
followers = details[1].text
followings = details[2].text
description = driver.find_element(By.CSS_SELECTOR, "div._aacl._aaco._aacu._aacx._aad6._aade").text
EDIT: As you said, you got an IndexError: list index out of range while fetching details above. This is probably because the elements had not loaded yet. With the imports below, replace the line where we fetch details with the one in the code below.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
details = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span._ac2a._ac2b")))
The problem there is that the selector depends on whether the window is expanded or not.
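If you need to handle both layouts, one generic pattern is to try candidate selectors in order until one matches; note that the second selector below is only a hypothetical placeholder, since the collapsed-layout selector isn't given here:
# Try each candidate selector until one returns elements; the second entry is a
# hypothetical placeholder for the collapsed-window layout.
candidate_selectors = ["span._ac2a._ac2b", "span.some-collapsed-layout-class"]
details = []
for sel in candidate_selectors:
    details = driver.find_elements(By.CSS_SELECTOR, sel)
    if details:
        break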

iterate over a list in Selenium and saving the results into a dataframe

I'm trying to iterate over a list, search on a webpage via Selenium, and store the results in a df. How can I store the loop results from each list item in a df?
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import pandas as pd
import numpy as np
url = 'https://au.finance.yahoo.com/australia/'
driver_path = 'chromedriver.exe'
browser = Chrome(executable_path=driver_path)
browser.get(url)
loop_search = browser.find_element_by_id('yfin-usr-qry')
search_companies = ['Commonwealth Bank','Rio Tinto','Wesfarmers']
for i in search_companies:
    loop_search.send_keys(i)
    browser.find_element_by_id('search-button').click()
    comp = browser.find_element_by_id('quote-header-info').text
    df3 = [comp]
Still fairly new to Python! Thank you!
If you just run your code and do print(comp)
you'd see the below error:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=99.0.4844.74)
So before saving it to the DF, we need to resolve this.
That can be fixed by redefining the web element inside the loop, like this:
loop_search = wait.until(EC.visibility_of_element_located((By.ID, "yfin-usr-qry")))
Full code to save it to DF:
driver_path = 'chromedriver.exe'
browser = Chrome(executable_path= driver_path)
wait = WebDriverWait(driver, 20)
url = 'https://au.finance.yahoo.com/australia/'
driver.get(url)
search_companies = ['Commonwealth Bank','Rio Tinto','Wesfarmers']
company_details_lst = []
for i in search_companies:
time.sleep(2)
loop_search = wait.until(EC.visibility_of_element_located((By.ID, "yfin-usr-qry")))
loop_search.send_keys(i)
time.sleep(2)
wait.until(EC.element_to_be_clickable((By.ID, "search-button"))).click()
time.sleep(2)
comp = wait.until(EC.element_to_be_clickable((By.ID, "quote-header-info"))).text
company_details_lst.append(comp)
#print(comp)
data = {
'Details': company_details_lst
}
df = pd.DataFrame.from_dict(data)
df.to_csv('output.csv', index = 0
Imports:
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
After running the code you should see a CSV file named output.csv in your project folder, with one row of quote-header details per company.
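If you would rather see which search term produced which row, a small variation (assuming one result per search term, so the two lists line up by index; the column names are my own choice):
data = {
    'Company': search_companies,
    'Details': company_details_lst
}
df = pd.DataFrame(data)
df.to_csv('output.csv', index=False)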

Selenium .get_attribute() is returning None

My target website is https://bscscan.com/address/0xb66e947f49b6811a8bf438040d1582232d3232d7#tokentxns
This text appears when you hover the mouse over the time info.
I am trying to get the exact date of the last transaction (the Age column). It only appears when you hover the mouse, so I need the 'title'/'data-original-title' attribute because it holds the date, but when I try to get the date with this code it prints None:
data = driver.find_element_by_xpath('//*[@id="body"]/div[3]/table/tbody/tr[1]/td[3]/span').get_attribute('title data-original-title')
print(data)
driver_path = "browser/chromedriver.exe"
partial_website_link = "b66e947f49b6811a8bf438040d1582232d3232d7"
final_website_link = f"https://bscscan.com/address/0x{partial_website_link}#tokentxns"
driver = webdriver.Chrome(driver_path)
driver.get(final_website_link)
time.sleep(3)
frame = driver.find_element_by_xpath('//*[@id="tokenpageiframe"]')
driver.switch_to.frame(frame)
data = driver.find_element_by_xpath('//*[@id="body"]/div[3]/table/tbody/tr[1]/td[3]/span').get_attribute('title data-original-title')
print(data)
Try this:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.common.action_chains import ActionChains
import time
driver_path = "browser/chromedriver.exe"
partial_website_link = "b66e947f49b6811a8bf438040d1582232d3232d7"
final_website_link = f"https://bscscan.com/address/0x{partial_website_link}#tokentxns"
driver = webdriver.Chrome(driver_path)
driver.get(final_website_link)
wait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it(driver.find_element_by_xpath('//*[@id="tokenpageiframe"]')))
time.sleep(1)
ages = driver.find_elements_by_xpath('//div[@class="table-responsive" and not(@style)]//td[@class="showAge "]//span')
hover = ActionChains(driver).move_to_element(ages[0])
hover.perform()
tool_tip = driver.find_element_by_xpath('//div[@class="tooltip-inner"]')
age = tool_tip.text
print(age)
After a few tries and expanding your question, I came up with the following working example:
from selenium import webdriver
driver_path = "browser/chromedriver.exe"
driver = webdriver.Chrome(driver_path)
partial_website_link = "b66e947f49b6811a8bf438040d1582232d3232d7"
final_website_link = f"https://bscscan.com/address/0x{partial_website_link}#tokentxns"
driver.get(final_website_link)
frame = driver.find_element_by_xpath('//*[@id="tokenpageiframe"]')
driver.switch_to.frame(frame)
# For some reason the Info you want to print is in the attribute title
data = driver.find_element_by_xpath("""/html/body/div[3]/table/tbody/tr[1]/td[3]/span""").get_attribute('title')
print(data)
If you want to get all the Dates from the first page do this:
from selenium import webdriver
driver_path = "browser/chromedriver.exe"
driver = webdriver.Chrome(driver_path)
partial_website_link = "b66e947f49b6811a8bf438040d1582232d3232d7"
final_website_link = f"https://bscscan.com/address/0x{partial_website_link}#tokentxns"
driver.get(final_website_link)
frame = driver.find_element_by_xpath('//*[@id="tokenpageiframe"]')
driver.switch_to.frame(frame)
# The star in the Xpath is a wildcard. Note the find_elements_by_xpath instead of find_element_by_xpath
dates = driver.find_elements_by_xpath("""//*[@id="body"]/div[3]/table/tbody/tr[*]/td[3]/span""")
# Iterate over every found element
for date in dates:
    print(date.get_attribute('title'))
See if this works:-
tableRows = driver.find_elements_by_xpath(".//table[@class='table table-hover']/tbody/tr")
for tr in tableRows:
    print(tr.find_element_by_xpath(".//td[@class='showAge ']/span[@rel='tooltip']").get_attribute("data-original-title"))
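If you want the dates as data rather than printed output, a small extension of the same loop (the variable names are my own; the first row is assumed to be the most recent transaction, which matches the page's default sort):
ages = []
for tr in tableRows:
    ages.append(tr.find_element_by_xpath(".//td[@class='showAge ']/span[@rel='tooltip']").get_attribute("data-original-title"))
# ages[0] would then be the date of the latest transaction, if any rows were found
print(ages[0] if ages else "no transactions found")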

How to print the open pdf link from using Selenium in Python?

I am not able to print the link of the final PDF which opens after running the given code:
from selenium import webdriver
from selenium.webdriver.support import ui
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
def page_is_loaded(driver):
    return driver.find_element_by_tag_name("body") != None
def check_exists_by_text(text):
    try:
        driver.find_element_by_link_text(text)
    except NoSuchElementException:
        return False
    return True
driver = webdriver.Chrome("C:/Users/Roshan/Desktop/sbi/chromedriver")
driver.maximize_window()
driver.get("http://www.careratings.com/brief-rationale.aspx")
wait = ui.WebDriverWait(driver, 10)
wait.until(page_is_loaded)
location_field = driver.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = driver.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
driver.find_element_by_xpath("//input[@name='btn_submit']").click()
if check_exists_by_text('Reliance Capital Limited'):
    elm = driver.find_element_by_link_text('Reliance Capital Limited')
    driver.implicitly_wait(5)
    elm.click()
    driver.implicitly_wait(50)
    #time.sleep(5)
    #driver.quit()
else:
    print("Company is not rated in the given Date range")
I am expecting the actual output to be the link of this PDF:
"http://www.careratings.com/upload/CompanyFiles/PR/Reliance%20Capital%20Ltd.-05-18-2019.pdf"
but I do not know how to print this link.
You need to find all the elements in the table, then extract the data from them.
from selenium import webdriver
import os
# setup path to chrome driver
chrome_driver = os.getcwd() + '/chromedriver'
# initialise chrome driver
browser = webdriver.Chrome(chrome_driver)
# load url
browser.get('http://www.careratings.com/brief-rationale.aspx')
# setup date range
location_field = browser.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = browser.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
browser.find_element_by_xpath("//input[#name='btn_submit']").click()
# get all data rows
content = browser.find_elements_by_xpath('//*[@id="divManagementSpeak"]/table/tbody/tr/td/a')
# get text and href link from each element
collected_data = []
for item in content:
    url = item.get_attribute("href")
    description = item.get_attribute("innerText")
    collected_data.append((url, description))
Output:
('http://www.careratings.com/upload/CompanyFiles/PR/Ashwini%20Frozen%20Foods-05-21-2019.pdf', 'Ashwini Frozen Foods')
('http://www.careratings.com/upload/CompanyFiles/PR/Vanita%20Cold%20Storage-05-21-2019.pdf', 'Vanita Cold Storage')
and so on
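If you also want these pairs written to a CSV (the answer above stops at the list), a minimal sketch with pandas; the column names and output file name are my own choice:
import pandas as pd
# collected_data is the list of (url, description) tuples built above
df = pd.DataFrame(collected_data, columns=["url", "company"])
df.to_csv("care_ratings_links.csv", index=False)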
I would say you just need to put this line:
pdf_link = elm.get_attribute("href")
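For context, in the question's code this would go right after elm is located; a sketch, assuming the matched link's href points directly at the PDF (which matches the expected output URL):
if check_exists_by_text('Reliance Capital Limited'):
    elm = driver.find_element_by_link_text('Reliance Capital Limited')
    pdf_link = elm.get_attribute("href")
    print(pdf_link)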
You have missed one important part to click on. When you enter some text in that input box, a dropdown is projected downward displaying the matching results available to choose from. Once you click on that, the rest works as it is.
Try the following script:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = "http://www.careratings.com/brief-rationale.aspx"
with webdriver.Chrome() as driver:
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    location_field = wait.until(EC.presence_of_element_located((By.NAME, "txtfromdate")))
    location_field.send_keys("2019-05-06")
    last_date = wait.until(EC.presence_of_element_located((By.NAME, "txttodate")))
    last_date.send_keys("2019-05-21")
    input_search = wait.until(EC.presence_of_element_located((By.NAME, "txtSearchCompany_brief")))
    input_search.send_keys('Reliance Capital Limited')
    time.sleep(3) # could not get rid of this hardcoded delay to make the script work
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "[onclick*='Reliance Capital Limited']"))).click()
    # time.sleep(2) # activate this line in case the script behaves otherwise
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "input[name='btn_submit']"))).click()
    for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table tr td > a[href$='.pdf']"))):
        print(item.get_attribute("href"))
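If the goal is to save the PDFs rather than just print their links, a follow-up sketch using the requests library (an extra dependency not in the original code); it would sit inside the with block in place of the final print loop:
import os
import requests
# Collect the hrefs first, then download each PDF outside the browser.
pdf_links = [item.get_attribute("href") for item in
             wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table tr td > a[href$='.pdf']")))]
for link in pdf_links:
    filename = os.path.basename(link.split("?")[0])  # derive a file name from the URL
    response = requests.get(link, timeout=30)
    with open(filename, "wb") as f:
        f.write(response.content)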
