Python Selenium download images (jpeg, png) or PDF using ChromeDriver - python

I have a Selenium script in Python (using ChromeDriver on Windows) that fetches the download links of various attachments(of different file types) from a page and then opens these links to download the attachments. This works fine for the file types which ChromeDriver can't preview as they get downloaded by default. But images(JPEG, PNG) and PDFs are previewed by default and hence aren't automatically downloaded.
The ChromeDriver options I am currently using (work for non preview-able files) :
# Build the Chrome options object that carries the download preferences.
chrome_options = webdriver.ChromeOptions()
# 'download.default_directory' is the Chrome preference naming the folder
# downloads are written to; 'custom_download_dir' is a placeholder value.
prefs = {'download.default_directory' : 'custom_download_dir'}
chrome_options.add_experimental_option('prefs', prefs)
# Start Chrome through the chromedriver binary located next to the script.
driver = webdriver.Chrome("./chromedriver.exe", chrome_options=chrome_options)
This downloads the files to 'custom_download_dir', no issues. But the preview-able files are just previewed in the ChromeDriver instance and not downloaded.
Are there any ChromeDriver Settings that can disable this preview behavior and directly download all files irrespective of the extensions?
If not, can this be done using Firefox for instance?

Instead of relying on specific browser / driver options, I would implement a more generic solution that uses the image URL to perform the download.
You can get the image URL using similar code:
driver.find_element_by_id("your-image-id").get_attribute("src")
And then I would download the image using, for example, urllib.
Here's some pseudo-code for Python2:
import urllib
url = driver.find_element_by_id("your-image-id").get_attribute("src")
urllib.urlretrieve(url, "local-filename.jpg")
Here's the same for Python3:
import urllib.request
url = driver.find_element_by_id("your-image-id").get_attribute("src")
urllib.request.urlretrieve(url, "local-filename.jpg")
Edit after the comment, just another example about how to download a file once you know its URL:
import requests
from PIL import Image
from io import BytesIO

image_name = 'image.jpg'
url = 'http://example.com/image.jpg'
r = requests.get(url)
# Fail loudly on an HTTP error instead of handing an error page to Pillow.
r.raise_for_status()
# r.content is bytes, so it has to be wrapped in BytesIO; StringIO only
# accepts text and raises TypeError when given bytes.
i = Image.open(BytesIO(r.content))
i.save(image_name)

With selenium-wire library, it is possible to download images via ChromeDriver.
I have defined the following function to parse each request and save the request body to a file when necessary.
import os
from mimetypes import guess_extension
from seleniumwire import webdriver
def download_assets(requests, asset_dir="temp", default_fname="untitled", exts=(".png", ".jpeg", ".jpg", ".svg", ".gif", ".pdf", ".ico")):
    """Save the response bodies of captured requests that have a whitelisted type.

    Args:
        requests: Iterable of captured request objects. Each one must expose
            ``url`` and ``response``; a response exposes ``headers`` (a
            mapping with ``get``) and ``body`` (the raw bytes).
        asset_dir: Directory the files are written to. It is created if it
            does not exist yet.
        default_fname: Base name used when no usable file name can be
            derived from the request URL.
        exts: File extensions (with leading dot) that should be saved.

    Returns:
        dict mapping each written file path to the URL it was fetched from.
        Two URLs that resolve to the same file name share one path; the
        later one overwrites the earlier one.
    """
    os.makedirs(asset_dir, exist_ok=True)
    asset_list = {}
    for req_idx, request in enumerate(requests):
        response = request.response
        if response is None:
            # The request never received a response, so there is nothing to save.
            continue
        content_type = response.headers.get("Content-Type")
        if not content_type:
            # Without a declared type the extension cannot be determined.
            continue
        # Drop parameters such as "; charset=utf-8" before the lookup.
        ext = guess_extension(content_type.split(";")[0].strip())
        if ext is None or ext not in exts:
            # Don't know the file extension, or it is not in the whitelist.
            continue
        # Construct a filename from the URL path, ignoring the query string
        # and any character that is unsafe in a file name.
        fname = os.path.basename(request.url.split("?")[0])
        fname = "".join(x for x in fname if (x.isalnum() or x in "._- "))
        if fname == "":
            fname = f"{default_fname}_{req_idx}"
        if not fname.endswith(ext):
            fname = f"{fname}{ext}"
        fpath = os.path.join(asset_dir, fname)
        # Save the file
        print(f"{request.url} -> {fpath}")
        asset_list[fpath] = request.url
        with open(fpath, "wb") as file:
            file.write(response.body)
    return asset_list
Let's download some images from Google homepage to temp folder.
# Create a new instance of the Chrome/Firefox driver.
# `webdriver` is the one imported from seleniumwire above; the script reads
# the captured traffic back through `driver.requests` below.
driver = webdriver.Chrome()
# Go to the Google home page
driver.get('https://www.google.com')
# Download content to temp folder (created first if it does not exist)
asset_dir = "temp"
os.makedirs(asset_dir, exist_ok=True)
download_assets(driver.requests, asset_dir=asset_dir)
# NOTE(review): close() is presumably meant to end the session; confirm
# whether quit() is needed to also stop the driver process.
driver.close()
Note that the function can be improved such that the directory structure can be kept as well.

Here is another simple way, but @Pitto's answer above is slightly more succinct.
import requests
webelement_img = ff.find_element(By.XPATH, '//img')
url = webelement_img.get_attribute('src') or 'https://someimages.com/path-to-image.jpg'
data = requests.get(url).content
local_filename = 'filename_on_your_computer.jpg'
with open (local_filename, 'wb') as f:
f.write(data)

Related

Downloading image with Python using selenium, requests and Pillow won't work

I'm following a tutorial to download images with python of google chrome. I'm in the first step of downloading one image but it's not downloading anything however it does print Success. Both when I give a path and also when I leave the path like " " this to download it within my IDE I get nothing.
this is the code:
import requests
from PIL import Image
from selenium import webdriver
import io
PATH = "/Users/flaviamadau/Desktop/SchoolPython/chromedriver"
wd = webdriver
driver = webdriver.Chrome(executable_path=PATH)
image_url = "https://target.scene7.com/is/image/Target/GUEST_34ac5146-0911-4c3f-ae9f-473d268ad847"
download_paths= "/Users/flaviamadau/Desktop/SchoolPython/fotos"
def download_image(download_path, url, file_name):
    """Fetch the image at *url* and store it as a JPEG inside *download_path*.

    Args:
        download_path: Directory the image is written to.
        url: Address of the image to fetch.
        file_name: Name of the file to create inside *download_path*.

    Prints "Success" once the file is written, or "Failed - <error>" when
    any step raises; nothing is returned.
    """
    import os  # local import: the surrounding script does not import os

    try:
        image_content = requests.get(url).content
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file)
        # Join with a path separator. Plain string concatenation silently
        # wrote "<download_path><file_name>" next to the target folder
        # instead of inside it.
        file_path = os.path.join(download_path, file_name)
        with open(file_path, "wb") as f:
            image.save(f, "JPEG")
        print("Success")
    except Exception as e:
        print('Failed -', e)
download_image(download_paths, image_url, "test.jpg")
any help or feedback is greatly appreciated!
To download image, simply save the .content from requests.get() to a binary file. For example:
import requests
url = "https://target.scene7.com/is/image/Target/GUEST_34ac5146-0911-4c3f-ae9f-473d268ad847"
with open("test.jpg", "wb") as f_out:
f_out.write(requests.get(url).content)
Saves test.jpg:
andrej@andrej:~/$ ls -l test.jpg
-rw-r--r-- 1 root root 8008 june 6 14:20 test.jpg

How to capture the request url of mp3 file on the audiobook website?

website:
https://www.ting22.com/ting/659-2.html
I'd like to get some audiobooks from the website above. In other words, I want to download the MP3 files of the audiobook from 659-2.html to 659-1724.html.
By using F12 tools, In [Network]->[Media], I can see the Request URL of MP3 file, But I don't know how to get the URL using a script.
Here are some specs of what I'm using:
System: Windows 7 x64
Python: 3.7.0
Update:
For example, by using F12 tool, I can see the file's url is "http://audio.xmcdn.com/group58/M03/8D/07/wKgLc1zNaabhA__WAEJyyPUT5k4509.mp3"
But I don't know how to get the URL of MP3 file in code ? Rather than how to download the file.
which library should I use?
Thank you.
UPDATE
Well that would be a bit more complicated because requests packages won't return the .mp3 source, so you need to use Selenium. Here is a tested solution:
from selenium import webdriver # pip install selenium
import urllib3
import shutil
import os
if not os.path.exists(os.getcwd()+'/mp3_folder'):
os.mkdir(os.getcwd()+'/mp3_folder')
def downloadFile(url=None):
    """Stream the file at *url* into the ``mp3_folder`` directory.

    The local file name is the last path segment of the URL.

    Args:
        url: Address of the file to download.

    Raises:
        ValueError: If *url* is empty or None.
    """
    if not url:
        # Covers the None default and an element without a src attribute.
        raise ValueError("downloadFile() requires a non-empty URL")
    # Strip any query string or fragment so it does not end up in the file
    # name ('?' is not a legal file-name character on Windows).
    filename = url.split('?')[0].split('#')[0].rstrip('/').split('/')[-1]
    c = urllib3.PoolManager()
    # preload_content=False streams the body instead of loading it into memory.
    with c.request('GET', url, preload_content=False) as resp, open('mp3_folder/'+filename, 'wb') as out_file:
        shutil.copyfileobj(resp, out_file)
        resp.release_conn()
driver = webdriver.Chrome('chromedriver.exe') # download chromedriver from here and place it near the script: https://chromedriver.storage.googleapis.com/72.0.3626.7/chromedriver_win32.zip
for i in range(2, 1725):
try:
driver.get('https://www.ting22.com/ting/659-%s.html' % i)
src = driver.find_element_by_id('mySource').get_attribute('src')
downloadFile(src)
print(src)
except Exception as exc:
print(exc)

Looping links in csv file using selenium in python

I am trying to open a .csv file, and open link in .csv file with selenium, and loop through links in .csv file. I am new to Selenium . I can easily do it in beautiful soup.Can you please guide me through right direction.
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import csv
import requests
contents =[]
filename = 'link_business_filter.csv'
def copy_json():
    """Append the contents of the module-level ``script3`` to the output file.

    The file 'vendors_info_bangkok.json' is opened in append mode, every item
    obtained by iterating ``script3`` is written to it in order, and the file
    is closed when the ``with`` block exits.
    """
    with open('vendors_info_bangkok.json', "a") as out_file:
        out_file.writelines(script3)
with open(filename,'rt') as f:
data = csv.reader(f)
for row in data:
links = row[0]
contents.append(links)
for link in contents:
url_html = requests.get(link)
browser = webdriver.Chrome('chromedriver')
for link_loop in url_html:
open = browser.get(link_loop)
source = browser.page_source
data = bs(source,"html.parser")
body = data.find('body')
script = body
# XPath attribute tests are written with "@"; "#id" is not valid XPath.
x_path = '//*[@id="react-root"]/section/main/div'
script2 = browser.find_element_by_xpath(x_path)
script3 = script2.text
print(script3)
copy_json()
First install selenium:
pip install selenium
Then install the chromedriver build that matches your OS. To test it, open a terminal in the folder where you keep the driver and type chromedriver; if there's no error, it works.
Then, in your code, you need to provide the executable_path for chromedriver.
In your code:
....code...
for link in contents:
url_html = requests.get(link)
# Location of the chromedriver binary; you can keep this file anywhere you wish.
path_to_chromedriver = 'C:/Users/chromedriver.exe'
# Pass the variable itself, not a quoted copy of its name (you can also give
# the path directly here).
browser = webdriver.Chrome(executable_path=path_to_chromedriver)
for link_loop in url_html:
...code...

How to download file from a page using python

I am having troubles downloading txt file from this page: https://www.ceps.cz/en/all-data#RegulationEnergy (when you scroll down and see Download: txt, xls and xml).
My goal is to create scraper that will go to the linked page, clicks on the txt link for example and saves a downloaded file.
Main problems that I am not sure how to solve:
The file doesn't have a real link that I can call and download it, but the link is created with JS based on filters and file type.
When I use requests library for python and call the link with all headers it just redirects me to https://www.ceps.cz/en/all-data .
Approaches tried:
Using scraper such as ParseHub to download link didn't work as intended. But this scraper was the closest to what I've wanted to get.
Used the requests library to connect to the link with the headers that the XHR request uses for downloading the file, but it just redirects me to https://www.ceps.cz/en/all-data .
If you could propose some solution for this task, thank you in advance. :-)
You can download this data to a directory of your choice with Selenium; you just need to specify the directory to which the data will be saved. In what follows below, I'll save the txt data to my desktop:
from selenium import webdriver
# Folder the browser should save the downloaded file into.
download_dir = '/Users/doug/Desktop/'
chrome_options = webdriver.ChromeOptions()
# 'download.default_directory' is the Chrome preference for the download folder.
prefs = {'download.default_directory' : download_dir}
chrome_options.add_experimental_option('prefs', prefs)
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get('https://www.ceps.cz/en/all-data')
# Find the element with class 'download-graph-data', take an <li> inside it
# and click it (presumably the "txt" link; verify against the live page).
container = driver.find_element_by_class_name('download-graph-data')
button = container.find_element_by_tag_name('li')
button.click()
You should do like so:
import requests
txt_format = 'txt'
xls_format = 'xls'  # open in binary mode
xml_format = 'xml'  # open in binary mode


def download(file_type):
    """Download the data export in *file_type* format and save it as ``file.<file_type>``.

    Args:
        file_type: One of ``txt_format``, ``xls_format`` or ``xml_format``.
            The text format is written in text mode, every other format in
            binary mode.
    """
    # Request the format that was asked for, not always the txt export.
    url = f'https://www.ceps.cz/download-data/?format={file_type}'
    response = requests.get(url)
    # Compare by value: "is" tests object identity and is unreliable for strings.
    if file_type == txt_format:
        with open(f'file.{file_type}', 'w') as file:
            file.write(response.text)
    else:
        with open(f'file.{file_type}', 'wb') as file:
            file.write(response.content)
download(txt_format)

How to open an HTML file in the browser from Python?

I am trying to open an HTML file from Python but my script just displays the contents of the HTML file in Python instead of opening it in the browser. How can I fix this problem? How can I open the HTML file in my Chrome browser?
testdata.html
<div>
<img src="https://plot.ly/~user001/2.png" alt="Success vs Failure" style="max-width: 100%;width: 600px;" width="600" onerror="this.onerror=null;this.src='https://plot.ly/404.png';" />
<script data-plotly="user001:2" src="https://plot.ly/embed.js" async></script>
</div>
Python 2.7 script:
import urllib
page = urllib.urlopen('testdata.html').read()
print page
Try specifying the "file://" at the start of the URL.
# Also, use the absolute path of the file:
webbrowser.open('file://' + os.path.realpath(filename))
Or
import webbrowser
new = 2  # open in a new tab, if possible
# open a public URL, in this case, the webbrowser docs
url = "http://docs.python.org/library/webbrowser.html"
webbrowser.open(url, new=new)
# open an HTML file on my own (Windows) computer; a local file URL needs an
# empty host part (three slashes) followed by the drive letter and a colon
url = "file:///D:/testdata.html"
webbrowser.open(url, new=new)
import os
os.system("start [your's_url]")
Enjoy!
You can use webbrowser library:
import webbrowser
url = 'file:///path/to/your/file/testdata.html'
webbrowser.open(url, new=2) # open in new tab
Here's a way that doesn't require external libraries and that can work on local files as well.
import subprocess
import os
url = "https://stackoverflow.com"
# or a file on your computer
# url = "/Users/yourusername/Desktop/index.html"
try:  # should work on Windows
    os.startfile(url)
except AttributeError:
    # os.startfile is missing on this platform, so fall back to the
    # "open" command.
    try:  # should work on MacOS and most linux versions
        subprocess.call(['open', url])
    except OSError:
        # Raised when the "open" command is missing or cannot be executed.
        # A bare except would also swallow KeyboardInterrupt and SystemExit.
        print('Could not open URL')
You can use Selenium.
download the latest chromedriver, paste the chromedriver.exe in "C:\Python27\Scripts".
then
from selenium import webdriver
driver = webdriver.Chrome()
try:
    driver.get("your page path")
    # The parenthesised form prints a single argument on Python 2 and 3 alike.
    print(driver.page_source.encode('utf-8'))
finally:
    # Always shut the browser down, even when loading the page fails.
    # The trailing display.stop() call was removed: no "display" object is
    # created anywhere in this snippet, so it raised NameError.
    driver.quit()
I feel this is the easiest solution:
import os
os.getcwd() #To check the current working directory or path
os.chdir("D:\\Folder Name\\") # D:\Folder Name\ is the new path where you want to save the converted dataframe(df) to .html file
import webbrowser
df.to_html("filename.html") #Converting dataframe df to html and saving with a name 'filename' and
webbrowser.get("C:/Program Files (x86)/Google/Chrome/Application/chrome.exe %s").open("file://" + os.path.realpath("filename.html"))
You can download the latest version of "geckodriver" from here. Then add the geckodriver executable to your project, run pip install selenium, and use the code below (for Windows):
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import os
#optional
options = Options()
options.set_preference('permissions.default.image', 2)
options.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
#for windows
# One consistent name for the driver: it used to be created as "Driver" but
# used as "driver", which raised NameError.
driver = webdriver.Firefox(options=options, executable_path='geckodriver.exe')
driver.implicitly_wait(15)
#path of your project -> reference : "https://stackoverflow.com/questions/25389095/python-get-path-of-root-project-structure/40227116"
Root = os.path.dirname(os.path.abspath(__file__))
# os.path.join supplies the separator between the project root and the
# relative path; plain concatenation glued the two together.
driver.get('file://' + os.path.join(Root, 'path/to/htmlfile'))
Hope I Helped You:)
import os
os.system('open "/Applications/Safari.app" '+ '"' + os.path.realpath(fname)+ '"')

Categories