I've been writing code to download GRIB (weather) files off the internet for future use. Right now I'm only at the stage of downloading and writing to the right folder, but for some reason when I use tqdm for a progress bar, the file size almost doubles. Without the progress bar the file size is fine.
With the following code I get a 2.3MB file.
import datetime
import requests

fsearch = datetime.date.today().strftime('%Y%m%d00')
fname = datetime.date.today().strftime('%d-%m-%Y')

def sfc_pres():
    id = fsearch
    url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(id)
    r = requests.get(url, allow_redirects=True)
    stat = r.status_code
    while stat:
        if stat == 200:
            print('File found, downloading')
        elif stat == 404:
            print('File not found')
        break

    id = fname
    with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(id), 'wb') as f:
        f.write(r.content)
If I use tqdm for a progress bar like so, I get a 4.5MB file.
import datetime
import requests
from tqdm import tqdm

fsearch = datetime.date.today().strftime('%Y%m%d00')
fname = datetime.date.today().strftime('%d-%m-%Y')

def sfc_pres():
    id = fsearch
    url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(id)
    r = requests.get(url, allow_redirects=True)
    stat = r.status_code
    while stat:
        if stat == 200:
            print('File found, downloading')
        elif stat == 404:
            print('File not found')
        break

    total_size_in_bytes = int(r.headers.get('content-length', 0))
    block_size = 1024
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)

    id = fname
    with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(id), 'wb') as f:
        f.write(r.content)
        for data in r.iter_content(block_size):
            progress_bar.update(len(data))
            f.write(data)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("Download failed")
My troubleshooting narrowed it down to the tqdm-related code, but I can't see why...
If you're using r.iter_content, you shouldn't also call f.write(r.content): that writes the data twice (and loses the streaming behavior you're trying to get).
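For reference, a minimal sketch of the corrected write block, keeping the names from the question (out_path stands in for the GRIB file path and is a placeholder; note that iter_content only truly streams when the request is made with stream=True):

import requests
from tqdm import tqdm

r = requests.get(url, allow_redirects=True, stream=True)  # stream=True defers the body download
total_size_in_bytes = int(r.headers.get('content-length', 0))
block_size = 1024
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
with open(out_path, 'wb') as f:  # out_path: your GRIB file path (placeholder name)
    for data in r.iter_content(block_size):  # each chunk is written exactly once
        progress_bar.update(len(data))
        f.write(data)
progress_bar.close()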
I have this code that can pull images off Pexels, but I don't know how to change it to video. I haven't seen anyone do this before, and any help is greatly appreciated. I tried switching all the photo tags to videos, but that didn't seem to work. I've also tried adding more libraries, but that doesn't seem to work either.
import argparse
import json
import os
import time

import requests
import tqdm
from pexels_api import API

PEXELS_API_KEY = os.environ['PEXELS_KEY']
MAX_IMAGES_PER_QUERY = 100
RESULTS_PER_PAGE = 10
PAGE_LIMIT = MAX_IMAGES_PER_QUERY / RESULTS_PER_PAGE

def get_sleep(t):
    def sleep():
        time.sleep(t)
    return sleep

def main(args):
    sleep = get_sleep(args.sleep)
    api = API(PEXELS_API_KEY)
    query = args.query
    page = 1
    counter = 0

    photos_dict = {}
    # Step 1: Getting urls and meta information
    while page <= PAGE_LIMIT:
        api.search(query, page=page, results_per_page=RESULTS_PER_PAGE)
        photos = api.get_entries()
        for photo in tqdm.tqdm(photos):
            photos_dict[photo.id] = vars(photo)['_Photo__photo']
            counter += 1
        if not api.has_next_page:
            break
        page += 1
        sleep()

    print(f"Finishing at page: {page}")
    print(f"Images were processed: {counter}")

    # Step 2: Downloading
    if photos_dict:
        os.makedirs(args.path, exist_ok=True)
        # Saving dict
        with open(os.path.join(args.path, f'{query}.json'), 'w') as fout:
            json.dump(photos_dict, fout)
        for val in tqdm.tqdm(photos_dict.values()):
            url = val['src'][args.resolution]
            fname = os.path.basename(val['src']['original'])
            image_path = os.path.join(args.path, fname)
            if not os.path.isfile(image_path):  # ignore if already downloaded
                response = requests.get(url, stream=True)
                with open(image_path, 'wb') as outfile:
                    outfile.write(response.content)
            else:
                print(f"File exists: {image_path}")

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--query', type=str, required=True)
    parser.add_argument('--path', type=str, default='./results_pexels')
    parser.add_argument('--resolution', choices=['original', 'large2x', 'large',
                                                 'medium', 'small', 'portrait',
                                                 'landscape', 'tiny'], default='original')
    parser.add_argument('--sleep', type=float, default=0.1)
    args = parser.parse_args()
    main(args)
Sorry for bumping the question. I just faced a similar situation when downloading videos from Pexels using the Python API pexelsPy. This may be helpful:
I retrieved the IDs of the videos and then built the download URL, which has the following structure: "https://www.pexels.com/video/" + ID + "/download".
See the following example:
import random

import requests
from pexelsPy import API

def download_video(type_of_videos):
    video_tag = random.choice(type_of_videos)
    PEXELS_API = '-'  # please add your API key here
    api = API(PEXELS_API)
    retrieved_videos = read_already_download_files('downloaded_files.txt')
    video_found_flag = True
    num_page = 1
    while video_found_flag:
        api.search_videos(video_tag, page=num_page, results_per_page=10)
        videos = api.get_videos()
        for data in videos:
            if data.width > data.height:  # look for horizontal-orientation videos
                if data.url not in retrieved_videos:
                    # write_file('downloaded_files.txt', data.url)
                    url_video = 'https://www.pexels.com/video/' + str(data.id) + '/download'  # create the url with the video id
                    r = requests.get(url_video)
                    with open(data.url.split('/')[-2] + '.mp4', 'wb') as outfile:
                        outfile.write(r.content)
                    return data.url.split('/')[-2] + '.mp4'  # download the video
        num_page += 1
The download_video function takes a list of strings with several tags, e.g. ['happy', 'sad', 'relax'], and randomly chooses one of them.
PEXELS_API should contain your API key.
read_already_download_files('downloaded_files.txt') retrieves the already-downloaded files, to check whether the current file has been downloaded before.
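Neither helper is shown in the answer; here is a minimal sketch of what read_already_download_files and write_file might look like (the one-URL-per-line text format is an assumption):

def read_already_download_files(path):
    # Hypothetical helper: return the set of URLs already recorded, one per line.
    try:
        with open(path) as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()

def write_file(path, url):
    # Hypothetical helper: append a newly downloaded URL to the record.
    with open(path, 'a') as f:
        f.write(url + '\n')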
from pypexels import PyPexels
import requests

api_key = 'api id'

# instantiate PyPexels object
py_pexel = PyPexels(api_key=api_key)

search_videos_page = py_pexel.videos_search(query="love", per_page=40)
# while True:
for video in search_videos_page.entries:
    print(video.id, video.user.get('name'), video.url)
    data_url = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    r = requests.get(data_url)
    print(r.headers.get('content-type'))
    with open('sample.mp4', 'wb') as outfile:
        outfile.write(r.content)
    # if not search_videos_page.has_next:
    break
    # search_videos_page = search_videos_page.get_next_page()
I just tried to do the same. When I was looking for it, I wanted a simple example; all the other fancy stuff I was sure I could add myself. So I built upon inou's answer. The example shown is very basic: it requests one page with only 5 results, using the 'Tiger' tag in the search query. I download the first video using the id provided by the response and simply write it to the source folder. The API is provided by pexelsPy, and the request is executed with the standard requests package. To get access to the API, you need to create a key on the Pexels website. Once you have your own API key, you should be able to simply substitute it for the example key shown and run the code as a test.
import pexelsPy
import requests
PEXELS_API = '16gv62567257256iu78krtuzwqsddudrtjberzabzwzjsrtgswnr'
api = pexelsPy.API(PEXELS_API)
api.search_videos('Tiger', page=1, results_per_page=5)
videos = api.get_videos()
url_video = 'https://www.pexels.com/video/' + str(videos[0].id) + '/download'
r = requests.get(url_video)
with open('test.mp4', 'wb') as outfile:
    outfile.write(r.content)
You can download multiple videos with this code:
import pexelsPy
import requests

PEXELS_API = '-'
api = pexelsPy.API(PEXELS_API)
api.search_videos('nature', page=2, results_per_page=100, orientation='landscape')
videos = api.get_videos()

for i, video in enumerate(videos):
    url_video = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    r = requests.get(url_video)
    with open(f'test_{i}.mp4', 'wb') as outfile:
        outfile.write(r.content)
This will download 100 videos, with each video being written to a separate file named test_0.mp4, test_1.mp4, ..., test_99.mp4.
An MP3 is accessible via two different URLs. I'm trying to use Python to figure out which URL is fastest to download from.
For example, I want to time how long https://cpx.podbean.com/mf/download/a6bxxa/LAF_15min_044_mindfulness.mp3 takes to download and compare that to how long http://cpx.podbean.com/mf/play/a6bxxa/LAF_15min_044_mindfulness.mp3 takes to download.
To download the mp3 I'm currently using:
urllib.request.urlretrieve(mp3_url, mp3_filename)
You could essentially do something like:
import urllib.request
from datetime import datetime

starttime = datetime.now()
urllib.request.urlretrieve(mp3_url, mp3_filename)  # whatever code you're using...
finishtime = datetime.now()
runtime = finishtime - starttime
print(str(runtime))
This will print a timestamp like 0:03:19.356798 in the format [hours]:[minutes]:[seconds.microseconds].
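As an aside, if you only need the duration, time.perf_counter is a monotonic clock made for exactly this; a sketch, reusing mp3_url and mp3_filename from above:

import time
import urllib.request

start = time.perf_counter()
urllib.request.urlretrieve(mp3_url, mp3_filename)
elapsed = time.perf_counter() - start  # duration in seconds, as a float
print(f"{elapsed:.2f}s")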
My bad... I didn't realize you were trying to figure out which link was the fastest. I have no clue how you're storing your mp3_url and mp3_filename elements, but try something like this (adjust accordingly):
import urllib.request
from datetime import datetime

mp3_list = {
    'file1.mp3': 'http://www.url1.com',
    'file2.mp3': 'http://www.url2.com',
    'file3.mp3': 'http://www.url3.com',
}

runtimes = []
for mp3_filename, mp3_url in mp3_list.items():  # adjust to however you store mp3_url and mp3_filename...
    starttime = datetime.now()
    urllib.request.urlretrieve(mp3_url, mp3_filename)  # whatever code you're using...
    finishtime = datetime.now()
    runtime = finishtime - starttime
    runtimes.append({'runtime': runtime, 'url': mp3_url, 'filename': mp3_filename})

fastest_mp3_url = sorted(runtimes, key=lambda k: k['runtime'])[0]['url']
fastest_mp3_filename = sorted(runtimes, key=lambda k: k['runtime'])[0]['filename']
print(fastest_mp3_url)
print(fastest_mp3_filename)
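A small refinement to the block above: min() finds the fastest entry in a single pass, instead of sorting the list twice:

fastest = min(runtimes, key=lambda k: k['runtime'])  # same runtimes list as above
print(fastest['url'])
print(fastest['filename'])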
It's simple; there are plenty of methods to do so (Python 3.x).
Using win64pyinstaller with progress:
from win64pyinstaller import install
install("your_url", "destination_folder_with_file_name")
Using urllib with progress:
Modifying PabloG's solution (which is in Python 2.x) from How to download a file over HTTP?
from sys import stdout
from urllib.request import urlopen

def _restart_line():
    stdout.write('\r')
    stdout.flush()

url = "your_url"
file_name = url.split('/')[-1]
u = urlopen(url)
f = open(file_name, 'wb')
meta = u.info()
file_size = int(meta.get("Content-Length"))
print(f"Downloading: {file_name} Bytes: {file_size}")

file_size_dl = 0
block_sz = 8192
while True:
    buffer = u.read(block_sz)
    if not buffer:
        break
    file_size_dl += len(buffer)
    f.write(buffer)
    status = f"done - {(file_size_dl/1000000):.2f} MB, {(file_size_dl * 100 / file_size):.2f} %"
    status = status + chr(8) * (len(status) + 1)  # backspaces to return the cursor
    stdout.write(status)
    stdout.flush()
    _restart_line()
f.close()
There are more ways to do it; hope you got your answer. Thank you!
I have a Python script that launches a URL that is a downloadable file. Is there some way to have Python display the download progress as opposed to launching the browser?
I've just written a super simple (slightly hacky) approach to this for scraping PDFs off a certain site. Note that it only works correctly on Unix-based systems (Linux, macOS), as PowerShell does not handle "\r":
import sys
import requests

link = "http://indy/abcde1245"
file_name = "download.data"

with open(file_name, "wb") as f:
    print("Downloading %s" % file_name)
    response = requests.get(link, stream=True)
    total_length = response.headers.get('content-length')

    if total_length is None:  # no content length header
        f.write(response.content)
    else:
        dl = 0
        total_length = int(total_length)
        for data in response.iter_content(chunk_size=4096):
            dl += len(data)
            f.write(data)
            done = int(50 * dl / total_length)
            sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done)))
            sys.stdout.flush()
It uses the requests library so you'll need to install that. This outputs something like the following into your console:
>Downloading download.data
>[============= ]
The progress bar is 52 characters wide in the script (2 characters are simply the [] so 50 characters of progress). Each = represents 2% of the download.
You can use the 'clint' package (written by the same author as 'requests') to add a simple progress bar to your downloads like this:
import requests
from clint.textui import progress

r = requests.get(url, stream=True)  # url assumed defined
path = '/some/path/for/file.txt'
with open(path, 'wb') as f:
    total_length = int(r.headers.get('content-length'))
    for chunk in progress.bar(r.iter_content(chunk_size=1024), expected_size=(total_length / 1024) + 1):
        if chunk:
            f.write(chunk)
            f.flush()
which will give you a dynamic output which will look like this:
[################################] 5210/5210 - 00:00:01
It should work on multiple platforms as well! You can also change the bar to dots or a spinner with .dots and .mill instead of .bar.
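For instance, a sketch of the dots variant (mill works the same way; both wrap the chunk iterator just like progress.bar):

from clint.textui import progress

# Same loop as above, but with a dots indicator instead of a bar
for chunk in progress.dots(r.iter_content(chunk_size=1024)):
    if chunk:
        f.write(chunk)
        f.flush()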
Enjoy!
Python 3 with tqdm
This is the suggested technique from the tqdm docs.
import urllib.request
from tqdm import tqdm

class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)

def download_url(url, output_path):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)
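Usage is then a single call; the URL and output path here are placeholders:

download_url('https://example.com/archive.zip', 'archive.zip')  # hypothetical example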
Here is a version with requests and tqdm.
import requests
from tqdm import tqdm

def download(url: str, fname: str):
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get('content-length', 0))
    # Can also replace 'file' with a io.BytesIO object
    with open(fname, 'wb') as file, tqdm(
        desc=fname,
        total=total,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in resp.iter_content(chunk_size=1024):
            size = file.write(data)
            bar.update(size)
Gist: https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51
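Note that bar.update(size) feeds the bar the return value of file.write, i.e. bytes actually written rather than bytes received. A hypothetical call (placeholder URL):

download('https://example.com/data.bin', 'data.bin')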
Another good option is wget:
import wget
wget.download('http://download.geonames.org/export/zip/US.zip')
The output will look like this:
11% [........ ] 73728 / 633847
Source: https://medium.com/@petehouston/download-files-with-progress-in-python-96f14f6417a2
You can also use click. It has a good progress bar library:
import click

with click.progressbar(length=total_size, label='Downloading files') as bar:
    for file in files:
        download(file)
        bar.update(file.size)
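The snippet above leaves total_size, files, and download undefined; here is a self-contained sketch for a single HTTP download with requests (placeholder URL and filename):

import click
import requests

url = 'https://example.com/file.bin'  # placeholder URL
r = requests.get(url, stream=True)
total_size = int(r.headers.get('content-length', 0))

with click.progressbar(length=total_size, label='Downloading') as bar:
    with open('file.bin', 'wb') as f:
        for chunk in r.iter_content(chunk_size=4096):
            f.write(chunk)
            bar.update(len(chunk))  # advance the bar by bytes written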
Sorry for being late with an answer; just updated the tqdm docs:
https://github.com/tqdm/tqdm/#hooks-and-callbacks
Using urllib.request.urlretrieve and OOP:
import urllib.request
from tqdm.auto import tqdm

class TqdmUpTo(tqdm):
    """Provides `update_to(n)` which uses `tqdm.update(delta_n)`."""
    def update_to(self, b=1, bsize=1, tsize=None):
        """
        b : Blocks transferred so far
        bsize : Size of each block
        tsize : Total size
        """
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)  # will also set self.n = b * bsize

eg_link = "https://github.com/tqdm/tqdm/releases/download/v4.46.0/tqdm-4.46.0-py2.py3-none-any.whl"
eg_file = eg_link.split('/')[-1]
with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
              desc=eg_file) as t:  # all optional kwargs
    urllib.request.urlretrieve(
        eg_link, filename=eg_file, reporthook=t.update_to, data=None)
    t.total = t.n
or using requests.get and file wrappers:
import requests
from tqdm.auto import tqdm

eg_link = "https://github.com/tqdm/tqdm/releases/download/v4.46.0/tqdm-4.46.0-py2.py3-none-any.whl"
eg_file = eg_link.split('/')[-1]
response = requests.get(eg_link, stream=True)
with tqdm.wrapattr(open(eg_file, "wb"), "write", miniters=1,
                   total=int(response.headers.get('content-length', 0)),
                   desc=eg_file) as fout:
    for chunk in response.iter_content(chunk_size=4096):
        fout.write(chunk)
You could of course mix & match techniques.
The tqdm package now includes a function designed to handle exactly this type of situation: wrapattr. You just wrap an object's read (or write) attribute, and tqdm handles the rest. Here's a simple download function that puts it all together with requests:
def download(url, filename):
    import functools
    import pathlib
    import shutil
    import requests
    import tqdm

    r = requests.get(url, stream=True, allow_redirects=True)
    if r.status_code != 200:
        r.raise_for_status()  # Will only raise for 4xx and 5xx codes, so...
        raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
    file_size = int(r.headers.get('Content-Length', 0))

    path = pathlib.Path(filename).expanduser().resolve()
    path.parent.mkdir(parents=True, exist_ok=True)

    desc = "(Unknown total file size)" if file_size == 0 else ""
    r.raw.read = functools.partial(r.raw.read, decode_content=True)  # Decompress if needed
    with tqdm.tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
        with path.open("wb") as f:
            shutil.copyfileobj(r_raw, f)

    return path
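A hypothetical call (placeholder URL and path); the function returns the resolved pathlib.Path of the written file:

saved = download('https://example.com/big_file.dat', '~/Downloads/big_file.dat')
print(saved)  # absolute path of the written file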
# Define progress bar function
def print_progressbar(total, current, barsize=60):
    progress = int(current * barsize / total)
    completed = str(int(current * 100 / total)) + '%'
    print('[', chr(9608) * progress, ' ', completed, '.' * (barsize - progress), '] ',
          str(current) + '/' + str(total), sep='', end='\r', flush=True)

# Sample code
total = 6000
barsize = 60
print_frequency = max(min(total // barsize, 100), 1)
print("Start Task..", flush=True)
for i in range(1, total + 1):
    if i % print_frequency == 0 or i == 1:
        print_progressbar(total, i, barsize)
print("\nFinished", flush=True)
# Snapshot of the progress bar:
The lines below are for illustration only; in the command prompt you will see a single progress bar showing incremental progress.
[ 0%............................................................] 1/6000
[██████████ 16%..................................................] 1000/6000
[████████████████████ 33%........................................] 2000/6000
[██████████████████████████████ 50%..............................] 3000/6000
[████████████████████████████████████████ 66%....................] 4000/6000
[██████████████████████████████████████████████████ 83%..........] 5000/6000
[████████████████████████████████████████████████████████████ 100%] 6000/6000
Just some improvements on Rich Jones's answer:
import re

import requests
from clint.textui import progress

def get_filename(cd):
    """
    Get filename from content-disposition
    """
    if not cd:
        return None
    fname = re.findall('filename=(.+)', cd)
    if len(fname) == 0:
        return None
    return fname[0].replace('"', "")

def stream_download_file(url, output, chunk_size=1024, session=None, verbose=False):
    if session:
        file = session.get(url, stream=True)
    else:
        file = requests.get(url, stream=True)

    file_name = get_filename(file.headers.get('content-disposition'))
    filepath = "{}/{}".format(output, file_name)

    if verbose:
        print("Downloading {}".format(file_name))

    with open(filepath, 'wb') as f:
        total_length = int(file.headers.get('content-length'))
        for chunk in progress.bar(file.iter_content(chunk_size=chunk_size),
                                  expected_size=(total_length / chunk_size) + 1):
            if chunk:
                f.write(chunk)
                f.flush()

    if verbose:
        print("Finished")
I came up with a solution that looks a bit nicer, based on tqdm. My implementation is based on Endophage's answer.
The effect:
# import the download_file definition from the next cell first.
>>> download_file(url, 'some_data.dat')
Downloading some_data.dat.
7%|█▎ | 195.31MB/2.82GB: [00:04<01:02, 49.61MB/s]
The implementation:
import time
import math
import requests
from tqdm import tqdm

def download_file(url, filename, update_interval=500, chunk_size=4096):
    def memory2str(mem):
        sizes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
        power = int(math.log(mem, 1024))
        size = sizes[power]
        for _ in range(power):
            mem /= 1024
        if power > 0:
            return f'{mem:.2f}{size}'
        else:
            return f'{mem}{size}'

    with open(filename, 'wb') as f:
        response = requests.get(url, stream=True)
        total_length = response.headers.get('content-length')
        if total_length is None:
            f.write(response.content)
        else:
            print(f'Downloading {filename}.', flush=True)
            downloaded, total_length = 0, int(total_length)
            total_size = memory2str(total_length)
            bar_format = '{percentage:3.0f}%|{bar:20}| {desc} [{elapsed}<{remaining}' \
                         '{postfix}]'
            if update_interval * chunk_size * 100 >= total_length:
                update_interval = 1
            with tqdm(total=total_length, bar_format=bar_format) as bar:
                counter = 0
                now_time, now_size = time.time(), downloaded
                for data in response.iter_content(chunk_size=chunk_size):
                    f.write(data)
                    downloaded += len(data)
                    counter += 1
                    bar.update(len(data))
                    if counter % update_interval == 0:
                        elapsed = time.time() - now_time
                        runtime_downloaded = downloaded - now_size
                        now_time, now_size = time.time(), downloaded

                        cur_size = memory2str(downloaded)
                        speed_size = memory2str(runtime_downloaded / elapsed)
                        bar.set_description(f'{cur_size}/{total_size}')
                        bar.set_postfix_str(f'{speed_size}/s')
                        counter = 0
A simple solution with the wget and tqdm Python libraries that shows progress in megabytes and the remaining time:
MB: 37%|███▋ | 2044.8/5588.7 [02:57<04:30, 13.11it/s]
Install the libraries: pip3 install wget tqdm
Import libraries
import wget
from tqdm import tqdm
Wrapper class for tqdm
class ProgressBar:
    def __init__(self):
        self.progress_bar = None

    def __call__(self, current_bytes, total_bytes, width):
        current_mb = round(current_bytes / 1024 ** 2, 1)
        total_mb = round(total_bytes / 1024 ** 2, 1)
        if self.progress_bar is None:
            self.progress_bar = tqdm(total=total_mb, desc="MB")
        delta_mb = current_mb - self.progress_bar.n
        self.progress_bar.update(delta_mb)
How to use it
wget.download(url, dst_filepath, ProgressBar())
Here is the "GOAT progress bar" implementation from George Hotz:
import requests
from tqdm import tqdm

# url assumed defined
r = requests.get(url, stream=True)
progress_bar = tqdm(total=int(r.headers.get('content-length', 0)), unit='B', unit_scale=True, desc=url)
dat = b''.join(x for x in r.iter_content(chunk_size=16384) if progress_bar.update(len(x)) or True)
cc: https://github.com/geohot/tinygrad/commit/7118602c976d264d97af3c1c8b97d72077616d07
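Unrolled for readability, the one-liner is equivalent to the loop below; the `or True` in the original keeps the generator filter from dropping chunks, since tqdm's update() can return a falsy value:

chunks = []
for x in r.iter_content(chunk_size=16384):
    progress_bar.update(len(x))  # advance the bar; return value is ignored here
    chunks.append(x)
dat = b''.join(chunks)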
You can stream a download as shown here -> Stream a Download.
You can also stream uploads.
Most importantly, the body of a streamed request is not downloaded until you access response.content; you can iterate it instead with just a couple of lines:
for line in r.iter_lines():
    if line:
        print(line)
Stream Requests
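To round this off, here is a minimal sketch of a streamed file download that never touches response.content (URL and filename are placeholders):

import requests

with requests.get('https://example.com/large_file.iso', stream=True) as r:  # placeholder URL
    r.raise_for_status()
    with open('large_file.iso', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):  # body is fetched lazily, chunk by chunk
            f.write(chunk)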