I am using Selenium in Python to download the same file, but with different inputs, each time. So for example, I download data with country selection, "China." In the next iteration, I download the same data, but for country "Brazil."
I am struggling to find easy-to-understand syntax I can use to rename the downloaded files. The files are currently downloading as "Data.csv" and "Data(1).csv." What I want is to have "China-Data.csv" and "Brazil-Data.csv."
The only relevant code I have constructed for this is:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Create a Chrome options object (it is not passed to the driver below).
ChromeOptions=webdriver.ChromeOptions()
# Start Chrome, passing the chromedriver location as the first positional argument.
driver =webdriver.Chrome('Users/yu/Downloads/chromedriver')
# NOTE(review): `inputcountry` is not defined in this snippet; presumably it is
# the country input element located elsewhere -- confirm.
inputcountry.send_keys('China')
# Send the Return key to the same element after typing the country.
inputcountry.send_keys(Keys.RETURN)
I read through this post, but I don't know how to create a for loop that can adapt this to the issue of files having the same name but with numbers at the end. EX: Data(1).csv, Data(2).csv, Data(3).csv
Thanks
Since you know the name of the download file, you can rename as you go. It can be tricky to know when a download completes, so I used a polling method.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import time
import shutil
# Path the browser saves every download to, and the template for the
# per-country copy each download is moved to.
download_file = os.path.expanduser("~/Downloads/Data.csv")
save_to_template = os.path.expanduser("~/Documents/Data-{}.csv")

# remove stale files so a leftover Data.csv is not mistaken for a new download
if os.path.isfile(download_file):
    os.remove(download_file)

ChromeOptions = webdriver.ChromeOptions()
driver = webdriver.Chrome('Users/yu/Downloads/chromedriver')

countries = ['China', 'Malaysia', 'Brazil']
for country in countries:
    # NOTE(review): `inputcountry` is not defined in this snippet; presumably it
    # is the country input element located elsewhere -- confirm.
    inputcountry.send_keys(country)
    inputcountry.send_keys(Keys.RETURN)

    # one option is to poll for file showing up.... assuming file
    # is renamed when done
    for _ in range(60):  # give it a minute
        if os.path.exists(download_file):
            # Moving the file frees the Data.csv name for the next country.
            shutil.move(download_file, save_to_template.format(country))
            break
        # Pause between checks; without it the 60 checks finish almost
        # instantly and the timeout below fires before the download can land.
        time.sleep(1)
    else:
        # The loop ran out without ever seeing the file.
        raise TimeoutError("could not download {}".format(country))
If you know the order of your files (i.e. you know that Data(1) should be named China-Data, Data(2) should be named Brazil-Data, etc.), then you just need to use a list and rename all the files according to it.
import os
directory = 'Users/yu/Downloads/chromedriver/'
correct_names = ['China-Data.csv', 'Brazil-Data.csv']


def rename_files(directory: str, correct_names: list) -> None:
    """Rename the entries of *directory* to the names in *correct_names*.

    Entries are taken in sorted (lexicographic) name order and paired
    positionally with *correct_names*. Note that lexicographic order is not
    download order: for example ``Data(1).csv`` sorts before ``Data.csv``.

    Args:
        directory: Folder whose entries are renamed; a trailing path
            separator is optional.
        correct_names: New file names, one per entry, in sorted-entry order.
            It may be longer than the number of entries; extras are ignored.

    Raises:
        IndexError: If the directory holds more entries than there are names.
            This is checked before anything is renamed, so the directory is
            never left half-renamed.
    """
    filenames = sorted(os.listdir(directory))
    if len(filenames) > len(correct_names):
        raise IndexError(
            "{} entries in {!r} but only {} names supplied".format(
                len(filenames), directory, len(correct_names)
            )
        )
    # change the name of each file in the directory
    for filename, new_name in zip(filenames, correct_names):
        # os.path.join works whether or not `directory` ends with a separator.
        src = os.path.join(directory, filename)
        dst = os.path.join(directory, new_name)
        os.rename(src, dst)
Every time you do inputcountry.send_keys('China'), you can add to the correct_names list whatever input you are giving, like correct_names.append('China-Data.csv').
You may call rename_files at the end with the correct_names list.
I'm using Selenium to grab a screenshot from a list of urls. test.txt includes reddit.com, stackoverflow.com and spotify.com. When iterating through this list I want it to save in the folder Screenshots with the file name being the url + '.png'. It does not work though. I'm either getting errors or it just keeps running without doing anything.
This one works but it just overwrites the old one
screenshot = driver.save_screenshot('Screenshots/foo.png')
I want it to look like this but it does not work:
screenshot = driver.save_screenshot('Screenshots/', line, '.png')
I am new to Python, but it doesn't work using + instead of the commas either.
The problem is that it takes too many arguments.
class Screenshot():
    """Take a screenshot of every URL listed in test.txt.

    All work happens in the class body, so it runs once when the class is
    defined. Every iteration saves to the same path, 'Screenshots/foo.png',
    so each screenshot overwrites the previous one.
    """
    # `with` closes test.txt even if a page load fails part-way through.
    with open("test.txt", "r") as filehandle:
        for line in filehandle:
            # Lines read from a file keep their trailing newline; drop it
            # before handing the text to the browser, and skip blank lines.
            line = line.strip()
            if not line:
                continue
            DRIVER = 'chromedriver'
            driver = webdriver.Chrome(DRIVER)
            try:
                driver.get(line)
                screenshot = driver.save_screenshot('Screenshots/foo.png')
            finally:
                # Always shut the browser down, otherwise a failing URL
                # leaves a Chrome process behind.
                driver.quit()
Creating a screenshot class is unnecessary for a simple task like this.
#!/usr/bin/env python
from __future__ import print_function
import os
from selenium import webdriver
def main():
    """Save a screenshot of every URL listed in test.txt under Screenshots/.

    One browser instance is reused for all URLs. Each screenshot is named
    after its URL, with '/' replaced by '-' and '.png' appended.

    Raises:
        Exception: If ``save_screenshot`` reports failure for a URL.
    """
    driver = webdriver.Chrome()
    try:
        # With automatically closes files when they go out of scope
        with open('test.txt', 'r') as f:
            for line in f:
                # Strip the trailing newline before navigating; skip blanks.
                url = line.strip()
                if not url:
                    continue

                driver.get(url)

                # os.path.join should make it platform agnostic
                # Also remove any '/' from the url and replace to avoid any file system save issues
                sn_name = os.path.join('Screenshots', url.replace('/', '-') + '.png')
                print('Attempting to save:', sn_name)

                # '.save_screenshot' returns false if it fails so throw exception
                if not driver.save_screenshot(sn_name):
                    raise Exception('Could not save screen shot: ' + sn_name)
    finally:
        # Close browser, including when one of the URLs raised
        driver.quit()


if __name__ == '__main__':
    main()
I am trying to create a script that scrapes a webpage and downloads any image files found.
My first function is a wget function that reads the webpage and assigns it to a variable.
My second function is a RegEx that searches for 'src=' attributes in a webpage's HTML; below is the function:
def find_image(text):
    '''Return every double-quoted ``src="..."`` value found in *text*.

    The pattern matches any ``src`` attribute preceded by whitespace, not
    only .gif, .jpg and .bmp images, so script or iframe sources are
    returned as well. The number of matches is printed as a side effect.

    Args:
        text: HTML source to scan.

    Returns:
        The matched values joined with newlines ('' when nothing matches).
    '''
    documents = re.findall(r'\ssrc="([^"]+)"', text)
    count = len(documents)
    # Parenthesised single argument prints identically on Python 2 and 3.
    print("[+] Total number of file's found: %s" % count)
    return '\n'.join(documents)
The output from this is something like this:
example.jpg
image.gif
http://www.webpage.com/example/file01.bmp
I am trying to write a third function that downloads these files using urllib.urlretrieve(url, filename) but I am not sure how to go about this, mainly because some of the output is absolute paths whereas others are relative. I am also unsure how to download these all at the same time and download without me having to specify a name and location every time.
Path-Agnostic fetching of resources (Can handle absolute/relative paths) -
from bs4 import BeautifulSoup as bs
import urlparse
from urllib2 import urlopen
from urllib import urlretrieve
import os
def fetch_url(url, out_folder="test/"):
    """Download every image referenced by the page at *url* into *out_folder*.

    Each ``<img>`` tag's ``src`` is resolved against *url* with
    ``urlparse.urljoin``, so absolute, root-relative, page-relative and
    protocol-relative sources all produce the correct download URL.

    Args:
        url: Address of the HTML page to scan.
        out_folder: Directory the images are saved into; created if missing.
    """
    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)

    soup = bs(urlopen(url))
    for image in soup.findAll("img"):
        src = image.get("src")
        if not src:
            # An <img> without a src has nothing to download.
            continue
        print("Image: %s" % src)

        image_url = urlparse.urljoin(url, src)
        # Name the file after the last path segment, ignoring any query string.
        filename = urlparse.urlparse(image_url).path.split("/")[-1]
        if not filename:
            continue
        urlretrieve(image_url, os.path.join(out_folder, filename))


fetch_url('http://www.w3schools.com/html/')
I can't write you the complete code and I'm sure that's not what you would want as well, but here are some hints:
1) Do not parse random HTML pages with regex, there are quite a few parsers made for that. I suggest BeautifulSoup. You will filter all img elements and get their src values.
2) With the src values at hand, you download your files the way you are already doing. About the relative/absolute problem, use the urlparse module, as per this SO answer. The idea is to join the src of the image with the URL from which you downloaded the HTML. If the src is already absolute, it will remain that way.
3) As for downloading them all, simply iterate over a list of the webpages you want to download images from and do steps 1 and 2 for each image in each page. When you say "at the same time", you probably mean to download them asynchronously. In that case, I suggest downloading each webpage in one thread.
I am working with a selenium script where I am trying to download a Excel file and give it a specific name. This is my code:
Is there any way that I can give the file being downloaded a specific name?
Code:
#!/usr/bin/python
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
profile = FirefoxProfile()
# MIME types Firefox should save straight to disk without showing a dialog.
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain, application/vnd.ms-excel, text/csv, text/comma-separated-values, application/octet-stream")
# folderList=2 tells Firefox to use the custom directory set in
# browser.download.dir; without it that preference is ignored.
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.download.dir", "C:\\Downloads")
browser = webdriver.Firefox(firefox_profile=profile)
browser.get('https://test.com/')
browser.find_element_by_partial_link_text("Excel").click()  # Download file
Here is another simple solution, where you can wait until the download completed and then get the downloaded file name from chrome downloads.
Chrome:
# method to get the downloaded file name
def getDownLoadedFileName(waitTime):
    """Wait for the first item on chrome://downloads to finish; return its name.

    Opens chrome://downloads in a new tab, polls once per second until the
    first download item's progress value is 100, then closes that tab and
    switches back to the window that was active on entry.

    Args:
        waitTime: Maximum number of seconds to keep polling.

    Returns:
        The text of the first download item's file link, or None if the
        progress never reached 100 within *waitTime* seconds.
    """
    original_window = driver.current_window_handle
    driver.execute_script("window.open()")
    # switch to new tab
    driver.switch_to.window(driver.window_handles[-1])
    try:
        # navigate to chrome downloads
        driver.get('chrome://downloads')
        # define the endTime
        endTime = time.time() + waitTime
        while True:
            try:
                # get downloaded percentage
                downloadPercentage = driver.execute_script(
                    "return document.querySelector('downloads-manager').shadowRoot.querySelector('#downloadsList downloads-item').shadowRoot.querySelector('#progress').value")
                # check if downloadPercentage is 100 (otherwise the script will keep waiting)
                if downloadPercentage == 100:
                    # return the file name once the download is completed
                    return driver.execute_script("return document.querySelector('downloads-manager').shadowRoot.querySelector('#downloadsList downloads-item').shadowRoot.querySelector('div#content #file-link').text")
            except Exception:
                # The queried elements may not exist yet; retry on the next
                # tick. `Exception` (not a bare except) keeps Ctrl-C working.
                pass
            time.sleep(1)
            if time.time() > endTime:
                return None
    finally:
        # Leave the browser as we found it: close the downloads tab and
        # return focus to the caller's window.
        driver.close()
        driver.switch_to.window(original_window)
Firefox:
def getDownLoadedFileName(waitTime):
    """Return the name of the first entry listed on Firefox's about:downloads.

    Opens about:downloads in a new window, polls once per second until the
    first download entry exposes a non-empty name, then closes that window
    and switches back to the one that was active on entry.

    NOTE(review): the name is returned as soon as an entry is listed; nothing
    here checks that the download has actually completed -- confirm this is
    sufficient for the caller.

    Args:
        waitTime: Maximum number of seconds to keep polling.

    Returns:
        The file name, or None if none appeared within *waitTime* seconds.
    """
    original_window = driver.current_window_handle
    # new_window_is_opened must be instantiated with the handles that existed
    # *before* the new window was requested; passing the bare class makes the
    # wait return immediately without waiting for anything.
    handles_before = driver.window_handles
    driver.execute_script("window.open()")
    WebDriverWait(driver, 10).until(EC.new_window_is_opened(handles_before))
    driver.switch_to.window(driver.window_handles[-1])
    try:
        driver.get("about:downloads")
        endTime = time.time() + waitTime
        while True:
            try:
                fileName = driver.execute_script("return document.querySelector('#contentAreaDownloadsView .downloadMainArea .downloadContainer description:nth-of-type(1)').value")
                if fileName:
                    return fileName
            except Exception:
                # The entry may not be rendered yet; retry on the next tick.
                pass
            time.sleep(1)
            if time.time() > endTime:
                return None
    finally:
        # Close the downloads window and give focus back to the caller's one.
        driver.close()
        driver.switch_to.window(original_window)
Once you click on the download link/button, just call the above method.
# click on download link
# NOTE(review): this snippet drives `browser`, while getDownLoadedFileName
# reads a global named `driver` -- confirm both refer to the same session.
browser.find_element_by_partial_link_text("Excel").click()
# get the downloaded file name (the argument is the maximum wait in seconds)
latestDownloadedFileName = getDownLoadedFileName(180) # wait up to 3 minutes for the download
print(latestDownloadedFileName)
JAVA + Chrome:
Here is the method in java.
/**
 * Blocks until the first item on chrome://downloads reports 100% progress,
 * prints its details and returns its file name.
 *
 * A new tab is opened for the downloads page; it is closed again and focus
 * is handed back to the window that was active on entry. There is no
 * timeout: the loop polls once per second until the progress value is 100.
 *
 * @param driver the WebDriver session that triggered the download
 * @return the text of the first download item's file link
 * @throws InterruptedException if the thread is interrupted while sleeping
 */
public String waitUntilDonwloadCompleted(WebDriver driver) throws InterruptedException {
    // Remember the caller's window so it can be restored at the end.
    final String originalHandle = driver.getWindowHandle();

    final JavascriptExecutor executor = (JavascriptExecutor) driver;
    executor.executeScript("window.open()");

    // Step through every handle in turn; the driver is left on the last one
    // returned (presumably the tab that was just opened).
    for (String handle : driver.getWindowHandles()) {
        driver.switchTo().window(handle);
    }

    driver.get("chrome://downloads");

    // Shared prefix of every query: the shadow root of the first download item.
    final String firstItemRoot = "document.querySelector('downloads-manager').shadowRoot"
            + ".querySelector('#downloadsList downloads-item').shadowRoot";

    // Poll the progress value once per second until it reads 100.
    Long percentage = (long) 0;
    while (percentage != 100) {
        try {
            percentage = (Long) executor.executeScript(
                    "return " + firstItemRoot + ".querySelector('#progress').value");
        } catch (Exception e) {
            // Element not available yet: keep the previous value and retry.
        }
        Thread.sleep(1000);
    }

    // Read the name, the source URL and the icon URL of the finished item.
    final String fileName = (String) executor.executeScript(
            "return " + firstItemRoot + ".querySelector('div#content #file-link').text");
    final String sourceURL = (String) executor.executeScript(
            "return " + firstItemRoot + ".querySelector('div#content #file-link').href");
    final String donwloadedAt = (String) executor.executeScript(
            "return " + firstItemRoot
                    + ".querySelector('div.is-active.focus-row-active #file-icon-wrapper img').src");

    System.out.println("Download deatils");
    System.out.println("File Name :-" + fileName);
    System.out.println("Donwloaded path :- " + donwloadedAt);
    System.out.println("Downloaded from url :- " + sourceURL);
    System.out.println(fileName);
    System.out.println(sourceURL);

    // Close the downloads tab and return to the caller's window.
    driver.close();
    driver.switchTo().window(originalHandle);
    return fileName;
}
This is how to call this method in your Java code.
// download triggering step
downloadExe.click();
// now wait until the download finishes, then print the returned file name
System.out.println(waitUntilDonwloadCompleted(driver));
Output:
Download deatils
File Name :-RubyMine-2019.1.2 (7).exe
Donwloaded path :- chrome://fileicon/C%3A%5CUsers%5Csupputuri%5CDownloads%5CRubyMine-2019.1.2%20(7).exe?scale=1.25x
Downloaded from url :- https://download-cf.jetbrains.com/ruby/RubyMine-2019.1.2.exe
RubyMine-2019.1.2 (7).exe
You cannot specify name of download file through selenium. However, you can download the file, find the latest file in the downloaded folder, and rename as you want.
Note: borrowed methods from google searches may have errors. but you get the idea.
import os
import shutil
# NOTE(review): `Initial_path` is not defined in this snippet; presumably it is
# the browser's download folder -- confirm.
# Pick the entry with the greatest os.path.getctime value; os.path.join avoids
# hard-coding the Windows-only "\\" separator.
filename = max(
    (os.path.join(Initial_path, f) for f in os.listdir(Initial_path)),
    key=os.path.getctime,
)
# Move it to its new name inside the same folder.
shutil.move(filename, os.path.join(Initial_path, r"newfilename.ext"))
Hope this snippet is not that confusing. It took me a while to create this and is really useful, because there has not been a clear answer to this problem, with just this library.
import os
import time
def tiny_file_rename(newname, folder_of_download):
    """Rename the most recently created entry of a folder to *newname*.

    If the newest entry is still a Firefox partial download (``*.part``),
    wait one second and look again, so the finished file is renamed rather
    than the temporary name, which may no longer exist by then.

    Args:
        newname: New file name (not a path) inside *folder_of_download*.
        folder_of_download: Folder the browser downloads into.
    """
    def newest_entry():
        # Entry name with the greatest creation/change time in the folder.
        return max(
            os.listdir(folder_of_download),
            key=lambda name: os.path.getctime(os.path.join(folder_of_download, name)),
        )

    filename = newest_entry()
    if filename.endswith('.part'):
        time.sleep(1)
        # Ask again: the .part file is replaced once the download completes.
        filename = newest_entry()
    os.rename(
        os.path.join(folder_of_download, filename),
        os.path.join(folder_of_download, newname),
    )
Hope this saves someone's day, cheers.
EDIT: Thanks to #Om Prakash editing my code, it made me remember that I didn't explain the code thoughly.
Using the max([]) function could lead to a race condition, leaving you with an empty or corrupted file (I know it from experience). You want to check that the file is completely downloaded in the first place. This is because Selenium doesn't wait for the file download to complete, so when you check for the last created file, an incomplete file will show up in your generated list and the code will try to move that file. And even then, you are better off waiting a little bit for the file to be released by Firefox.
EDIT 2: More Code
I was asked if 1 second was enough time and mostly it is, but in case you need to wait more than that you could change the above code to this:
import os
import time
def tiny_file_rename(newname, folder_of_download, time_to_wait=60):
    """Rename the newest finished download in a folder to *newname*.

    Polls once per second while the folder is empty or its newest entry is
    still a browser temp file (``*.part`` or ``*.crdownload``).

    Args:
        newname: New file name (not a path) inside *folder_of_download*.
        folder_of_download: Folder the browser downloads into.
        time_to_wait: Maximum number of one-second polls before giving up.

    Raises:
        Exception: If no finished file shows up within *time_to_wait* polls.
    """
    def newest_entry():
        # Newest entry name, or None while the folder is still empty.
        names = os.listdir(folder_of_download)
        if not names:
            return None
        return max(
            names,
            key=lambda name: os.path.getctime(os.path.join(folder_of_download, name)),
        )

    time_counter = 0
    filename = newest_entry()
    # Match the suffix only: a substring test would also hit finished files
    # such as "my.partners.csv".
    while filename is None or filename.endswith(('.part', '.crdownload')):
        time.sleep(1)
        time_counter += 1
        if time_counter > time_to_wait:
            raise Exception('Waited too long for file to download')
        filename = newest_entry()
    os.rename(
        os.path.join(folder_of_download, filename),
        os.path.join(folder_of_download, newname),
    )
There is something i would correct for #parishodak answer:
the filename here will only return the relative path (here the name of the file) not the absolute path.
That is why #FreshRamen got the following error after:
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/genericpath.py",
line 72, in getctime return os.stat(filename).st_ctime OSError:
[Errno 2] No such file or directory: '.localized'
There is the correct code:
import os
import shutil
# Raw string so the backslash is kept literally.
filepath = r'c:\downloads'
# Build full paths so os.path.getctime can find each file; os.path.join also
# replaces the original "\" literal, which was an unterminated string.
filename = max(
    (os.path.join(filepath, f) for f in os.listdir(filepath)),
    key=os.path.getctime,
)
# `filename` is already a full path, so it is moved as-is.
# NOTE(review): `newfilename` is not defined in this snippet; it must be set
# to the destination path before this line runs.
shutil.move(filename, newfilename)
I've come up with a different solution. Since you only care about the last downloaded file, then why not download it into a dummy_dir? So that, that file is going to be the only file in that directory. Once it's downloaded, you can move it to your destination_dir as well as changing it's name.
Here is an example that works with Firefox:
def rename_last_downloaded_file(dummy_dir, destination_dir, new_file_name, timeout=None):
    """Move the newest finished download out of *dummy_dir* under a new name.

    Polls *dummy_dir* once per second until it contains an entry whose name
    does not end in a browser temp suffix (``.part`` / ``.crdownload``), then
    moves that entry to ``destination_dir/new_file_name``.

    Args:
        dummy_dir: Scratch folder the browser downloads into.
        destination_dir: Folder the finished file is moved to.
        new_file_name: File name to give the moved file.
        timeout: Maximum number of seconds to wait; None waits indefinitely.

    Raises:
        TimeoutError: If *timeout* is set and elapses before a finished file
            appears.
    """
    deadline = None if timeout is None else time.time() + timeout

    def newest_path():
        # Full path of the newest entry, or None while the folder is empty.
        names = os.listdir(dummy_dir)
        if not names:
            return None
        return max((os.path.join(dummy_dir, f) for f in names), key=os.path.getctime)

    while True:
        newest = newest_path()
        # Test the file name only: checking the whole path would never pass
        # when the folder name itself contains ".part".
        if newest is not None and not os.path.basename(newest).endswith(('.part', '.crdownload')):
            break
        if deadline is not None and time.time() > deadline:
            raise TimeoutError("no finished download appeared in {}".format(dummy_dir))
        time.sleep(1)

    # Move the exact path that was checked rather than looking it up again.
    shutil.move(newest, os.path.join(destination_dir, new_file_name))
You can fiddle with the sleep time and add a TimeoutException as well, as you see fit.
Here is the code sample I used to download pdf with a specific file name. First you need to configure chrome webdriver with required options. Then after clicking the button (to open pdf popup window), call a function to wait for download to finish and rename the downloaded file.
import os
import time
import shutil
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# function to wait for download to finish and then rename the latest downloaded file
def wait_for_download_and_rename(newFilename):
    """Wait for a completed Chrome download and move it to *newFilename*.

    Opens chrome://downloads in a new tab, waits up to 120 seconds until at
    least one item is in state 'COMPLETE', closes that tab, and moves the
    first listed completed file to ``download_dir/newFilename``.

    Relies on the module-level ``driver`` and ``download_dir``.

    Args:
        newFilename: File name to give the download inside ``download_dir``.
    """
    original_window = driver.current_window_handle

    # function to wait for all chrome downloads to finish
    def chrome_downloads(drv):
        if "chrome://downloads" not in drv.current_url:  # if 'chrome downloads' is not current tab
            drv.execute_script("window.open('');")  # open a new tab
            # Use the driver passed in, and the last handle rather than
            # index 1, so this also works when several tabs are already open.
            drv.switch_to.window(drv.window_handles[-1])  # switch to the new tab
            drv.get("chrome://downloads/")  # navigate to chrome downloads
        return drv.execute_script("""
            return document.querySelector('downloads-manager')
            .shadowRoot.querySelector('#downloadsList')
            .items.filter(e => e.state === 'COMPLETE')
            .map(e => e.filePath || e.file_path || e.fileUrl || e.file_url);
            """)

    try:
        # An empty list is falsy, so this waits until at least one download
        # is complete; returns list of downloaded file paths
        dld_file_paths = WebDriverWait(driver, 120, 1).until(chrome_downloads)
    finally:
        # Close the current tab (chrome downloads), also when the wait times out
        if "chrome://downloads" in driver.current_url:
            driver.close()
        # Switch back to the tab that was active on entry
        driver.switch_to.window(original_window)

    # get latest downloaded file name and path
    dlFilename = dld_file_paths[0]  # latest downloaded file from the list

    # wait till downloaded file appears in download directory
    time_to_wait = 20  # adjust timeout as per your needs
    time_counter = 0
    while not os.path.isfile(dlFilename):
        time.sleep(1)
        time_counter += 1
        if time_counter > time_to_wait:
            # Give up waiting; shutil.move below fails if the file is absent.
            break

    # rename the downloaded file
    shutil.move(dlFilename, os.path.join(download_dir, newFilename))
    return
# specify custom download directory
download_dir = r'c:\Downloads\pdf_reports'
# for configuring chrome pdf viewer for downloading pdf popup reports
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', {
"download.default_directory": download_dir, # Set own Download path
"download.prompt_for_download": False, # Do not ask for download at runtime
"download.directory_upgrade": True, # Also needed to suppress download prompt
"plugins.plugins_disabled": ["Chrome PDF Viewer"], # Disable this plugin
"plugins.always_open_pdf_externally": True, # presumably downloads PDFs instead of opening them in the viewer -- confirm
})
# get webdriver with options for configuring chrome pdf viewer
driver = webdriver.Chrome(options = chrome_options)
# open desired webpage
driver.get('https://mywebsite.com/mywebpage')
# click the button to open pdf popup
driver.find_element_by_id('someid').click()
# call the function to wait for download to finish and rename the downloaded file
wait_for_download_and_rename('My file.pdf')
# close the browser windows
driver.quit()
Set timeout (120) to the wait time as per your needs.
I am using the following function.
It checks for a file in the download location that you specify for Chrome/Selenium, and renames it only if it was created at most 10 seconds ago (max_old_time). Otherwise, it waits a maximum of 60 seconds (max_waiting_time).
Not sure if is the best way, but it worked for me..
import os, shutil, time
from datetime import datetime
def rename_last_file(download_folder, destination_folder, newfilename,
                     max_waiting_time=60, max_old_time=10, poll_interval=5):
    """Move the newest recent file from the download folder under a new name.

    On each poll the newest regular file in *download_folder* is located.
    If it was created no earlier than *max_old_time* seconds before this
    function started, it is moved to ``destination_folder/newfilename``.

    Args:
        download_folder: Folder the browser downloads into.
        destination_folder: Folder the file is moved to.
        newfilename: File name to give the moved file.
        max_waiting_time: Seconds to keep polling before giving up.
        max_old_time: Maximum age, in seconds relative to the start of this
            call, for a file to count as the new download.
        poll_interval: Seconds to sleep between polls.

    Returns:
        0 if a file was moved, 1 if the wait timed out.
    """
    start_time = time.time()
    while True:
        # Find the newest regular file, tracking its timestamp alongside its
        # path so comparisons always use the current best candidate.
        newest_path = None
        newest_time = 0
        for name in os.listdir(download_folder):
            path = os.path.join(download_folder, name)
            if not os.path.isfile(path):
                continue
            try:
                created = os.path.getctime(path)
            except OSError:
                # The file disappeared between listing and stat; skip it.
                continue
            if newest_path is None or created > newest_time:
                newest_path, newest_time = path, created

        if newest_path is not None and start_time - newest_time < max_old_time:
            shutil.move(newest_path, os.path.join(destination_folder, newfilename))
            print(newest_path)
            return 0
        elif (time.time() - start_time) > max_waiting_time:
            print("exit")
            return 1
        else:
            print("waiting file...")
            time.sleep(poll_interval)
Using #dmb's trick, I've made just one correction: after the .part check, below time.sleep(1), we must request the filename again. Otherwise, the line below will try to rename a .part file that no longer exists.
Here is a browser-agnostic solution that waits for the download to finish then returns the file name.
from datetime import datetime, timedelta
def wait_for_download_and_get_file_name(timeout=None):
    """Wait until a fresh file appears in DOWNLOAD_DIR and return its path.

    Polls every half second for the newest entry in ``DOWNLOAD_DIR`` whose
    name does not end in a browser temp suffix, and returns it once its
    creation time is within the last five seconds. A progress dot is printed
    for every unsuccessful poll.

    Args:
        timeout: Maximum number of seconds to wait; None waits indefinitely.

    Returns:
        Full path of the newly downloaded file.

    Raises:
        TimeoutError: If *timeout* is set and elapses first.
    """
    # Local imports: the snippet's own import line only brings in datetime.
    import os
    import time

    in_progress_suffixes = ('.part', '.crdownload', '.tmp')
    deadline = None if timeout is None else time.time() + timeout

    print('Waiting for download to finish', end='')
    while True:
        # Skip temp files so a download still in flight is not reported as
        # finished; the list is empty while nothing has been saved yet.
        candidates = [
            os.path.join(DOWNLOAD_DIR, f)
            for f in os.listdir(DOWNLOAD_DIR)
            if not f.endswith(in_progress_suffixes)
        ]
        if candidates:
            # Get the name of the file with the latest creation time
            newest_file_name = max(candidates, key=os.path.getctime)
            file_creation_time = datetime.fromtimestamp(os.path.getctime(newest_file_name))
            five_seconds_ago = datetime.now() - timedelta(seconds=5)
            if file_creation_time >= five_seconds_ago:
                print(f'\nFinished downloading "{newest_file_name}"')
                return newest_file_name
        # Otherwise the newest file is too old to be the one we are waiting for.
        if deadline is not None and time.time() > deadline:
            raise TimeoutError('no new download appeared in time')
        print('.', end='')
        time.sleep(0.5)
Caveat: this will not work if you have more than one thread or process downloading files to the same directory at the same time.
In my case I am downloading and renaming .csv files, and as a reference I use files that have '_' in the title, but you can change '_' for your specific usage.
Add this block after download on your selenium script.
# Import before first use (the original imported os only inside the `if`).
import os

string = 'SOMETHING_OR_VARIABLE'
path = r'PATH_WHERE_FILE_ARE_BEING_DOWNLOAD'

# Regular files in `path` whose name contains an underscore.
files = [
    i for i in os.listdir(path)
    if os.path.isfile(os.path.join(path, i)) and '_' in i
]
if files:
    # os.listdir gives no ordering guarantee, so with several matches the one
    # renamed here is arbitrary.
    print(files[0])
    os.rename(os.path.join(path, files[0]), os.path.join(path, f'{string}.csv'))
else:
    print('error')
You can download the file and name it at the same time using urlretrieve:
import urllib

# urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# Read the link target from the page, then fetch it directly under the
# desired file name. The request is made outside the browser, so it does not
# carry the browser session's cookies or login state.
url = browser.find_element_by_partial_link_text("Excel").get_attribute('href')
urlretrieve(url, "/choose/your/file_name.xlsx")
I am trying to launch multiple URLs from a text file in Firefox via Python. I am using Windows 7 and Python 3. I need some direction on how to pass the arguments to Firefox.
import os
import subprocess
# `with` closes the file even if reading fails.
with open('C:\\Users\\test\\Desktop\\urls.txt', 'r') as f:
    data = f.read()
print(data)
# i need some help here to pass this argument to Firefox.
urls.txt
http://www.abc.com
http://www.xyz.com/test
http://www.abc.net/test.html
http://www.test.com
http://www.msn.com
Use the webbrowser module.
import webbrowser
firefox = webbrowser.get('firefox')
# splitlines() copes with both \n and \r\n; blank lines (such as the one after
# a trailing newline) are skipped so no empty tab is opened.
for url in data.splitlines():
    url = url.strip()
    if url:
        firefox.open_new_tab(url)
If you don't want to enforce a particular browser and just start the default one, use webbrowser.open_new_tab.
The webbrowser module isn't very reliable, especially on Windows, so you might have to start the process manually using the subprocess module:
import subprocess
firefox_path = 'C:/Program Files/Firefox/firefox' # change this line accordingly
# One Firefox invocation per non-blank line; skipping blanks avoids launching
# Firefox with an empty argument for the file's trailing newline.
for url in data.splitlines():
    url = url.strip()
    if url:
        subprocess.Popen([firefox_path, url])
Also, Firefox supports multiple URLs in the command line, so the following solution is better for it:
import subprocess
# Read one URL per line, closing the file afterwards and dropping blank lines
# so Firefox is not handed empty arguments.
with open('C:/Users/test/Desktop/urls.txt') as url_file:
    urls = [line.strip() for line in url_file if line.strip()]
# A single Firefox invocation receives every URL on its command line.
subprocess.Popen(['C:/Program Files/Firefox/firefox'] + urls)