Python progress bar and downloads - python

I have a Python script that launches a URL that is a downloadable file. Is there some way to have Python display the download progress as oppose to launching the browser?

I've just written a super simple (slightly hacky) approach to this for scraping PDFs off a certain site. Note, it only works correctly on Unix based systems (Linux, mac os) as PowerShell does not handle "\r":
import sys
import requests
link = "http://indy/abcde1245"
file_name = "download.data"
with open(file_name, "wb") as f:
print("Downloading %s" % file_name)
response = requests.get(link, stream=True)
total_length = response.headers.get('content-length')
if total_length is None: # no content length header
f.write(response.content)
else:
dl = 0
total_length = int(total_length)
for data in response.iter_content(chunk_size=4096):
dl += len(data)
f.write(data)
done = int(50 * dl / total_length)
sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50-done)) )
sys.stdout.flush()
It uses the requests library so you'll need to install that. This outputs something like the following into your console:
>Downloading download.data
>[=============                            ]
The progress bar is 52 characters wide in the script (2 characters are simply the [] so 50 characters of progress). Each = represents 2% of the download.

You can use the 'clint' package (written by the same author as 'requests') to add a simple progress bar to your downloads like this:
from clint.textui import progress
r = requests.get(url, stream=True)
path = '/some/path/for/file.txt'
with open(path, 'wb') as f:
total_length = int(r.headers.get('content-length'))
for chunk in progress.bar(r.iter_content(chunk_size=1024), expected_size=(total_length/1024) + 1):
if chunk:
f.write(chunk)
f.flush()
which will give you a dynamic output which will look like this:
[################################] 5210/5210 - 00:00:01
It should work on multiple platforms as well! You can also change the bar to dots or a spinner with .dots and .mill instead of .bar.
Enjoy!

Python 3 with TQDM
This is the suggested technique from the TQDM docs.
import urllib.request
from tqdm import tqdm
class DownloadProgressBar(tqdm):
def update_to(self, b=1, bsize=1, tsize=None):
if tsize is not None:
self.total = tsize
self.update(b * bsize - self.n)
def download_url(url, output_path):
with DownloadProgressBar(unit='B', unit_scale=True,
miniters=1, desc=url.split('/')[-1]) as t:
urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

There is an answer with requests and tqdm.
import requests
from tqdm import tqdm
def download(url: str, fname: str):
resp = requests.get(url, stream=True)
total = int(resp.headers.get('content-length', 0))
# Can also replace 'file' with a io.BytesIO object
with open(fname, 'wb') as file, tqdm(
desc=fname,
total=total,
unit='iB',
unit_scale=True,
unit_divisor=1024,
) as bar:
for data in resp.iter_content(chunk_size=1024):
size = file.write(data)
bar.update(size)
Gist: https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51

Another good option is wget:
import wget
wget.download('http://download.geonames.org/export/zip/US.zip')
The output will look like this:
11% [........ ] 73728 / 633847
Source: https://medium.com/#petehouston/download-files-with-progress-in-python-96f14f6417a2

You can also use click. It has a good library for progress bar:
import click
with click.progressbar(length=total_size, label='Downloading files') as bar:
for file in files:
download(file)
bar.update(file.size)

Sorry for being late with an answer; just updated the tqdm docs:
https://github.com/tqdm/tqdm/#hooks-and-callbacks
Using urllib.urlretrieve and OOP:
import urllib
from tqdm.auto import tqdm
class TqdmUpTo(tqdm):
"""Provides `update_to(n)` which uses `tqdm.update(delta_n)`."""
def update_to(self, b=1, bsize=1, tsize=None):
"""
b : Blocks transferred so far
bsize : Size of each block
tsize : Total size
"""
if tsize is not None:
self.total = tsize
self.update(b * bsize - self.n) # will also set self.n = b * bsize
eg_link = "https://github.com/tqdm/tqdm/releases/download/v4.46.0/tqdm-4.46.0-py2.py3-none-any.whl"
eg_file = eg_link.split('/')[-1]
with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
desc=eg_file) as t: # all optional kwargs
urllib.urlretrieve(
eg_link, filename=eg_file, reporthook=t.update_to, data=None)
t.total = t.n
or using requests.get and file wrappers:
import requests
from tqdm.auto import tqdm
eg_link = "https://github.com/tqdm/tqdm/releases/download/v4.46.0/tqdm-4.46.0-py2.py3-none-any.whl"
eg_file = eg_link.split('/')[-1]
response = requests.get(eg_link, stream=True)
with tqdm.wrapattr(open(eg_file, "wb"), "write", miniters=1,
total=int(response.headers.get('content-length', 0)),
desc=eg_file) as fout:
for chunk in response.iter_content(chunk_size=4096):
fout.write(chunk)
You could of course mix & match techniques.

The tqdm package now includes a function designed to handle exactly this type of situation: wrapattr. You just wrap an object's read (or write) attribute, and tqdm handles the rest. Here's a simple download function that puts it all together with requests:
def download(url, filename):
import functools
import pathlib
import shutil
import requests
import tqdm
r = requests.get(url, stream=True, allow_redirects=True)
if r.status_code != 200:
r.raise_for_status() # Will only raise for 4xx codes, so...
raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
file_size = int(r.headers.get('Content-Length', 0))
path = pathlib.Path(filename).expanduser().resolve()
path.parent.mkdir(parents=True, exist_ok=True)
desc = "(Unknown total file size)" if file_size == 0 else ""
r.raw.read = functools.partial(r.raw.read, decode_content=True) # Decompress if needed
with tqdm.tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
with path.open("wb") as f:
shutil.copyfileobj(r_raw, f)
return path

# Define Progress Bar function
def print_progressbar(total, current, barsize=60):
progress = int(current*barsize/total)
completed = str(int(current*100/total)) + '%'
print('[', chr(9608)*progress, ' ', completed, '.'*(barsize-progress), '] ', str(i)+'/'+str(total), sep='', end='\r', flush=True)
# Sample Code
total = 6000
barsize = 60
print_frequency = max(min(total//barsize, 100), 1)
print("Start Task..", flush=True)
for i in range(1, total+1):
if i%print_frequency == 0 or i == 1:
print_progressbar(total, i, barsize)
print("\nFinished", flush=True)
# Snapshot of Progress Bar :
Below lines are for illustrations only. In command prompt you will see single progress bar showing incremental progress.
[ 0%............................................................] 1/6000
[██████████ 16%..................................................] 1000/6000
[████████████████████ 33%........................................] 2000/6000
[██████████████████████████████ 50%..............................] 3000/6000
[████████████████████████████████████████ 66%....................] 4000/6000
[██████████████████████████████████████████████████ 83%..........] 5000/6000
[████████████████████████████████████████████████████████████ 100%] 6000/6000

Just some improvements of #rich-jones's answer
import re
import request
from clint.textui import progress
def get_filename(cd):
"""
Get filename from content-disposition
"""
if not cd:
return None
fname = re.findall('filename=(.+)', cd)
if len(fname) == 0:
return None
return fname[0].replace('"', "")
def stream_download_file(url, output, chunk_size=1024, session=None, verbose=False):
if session:
file = session.get(url, stream=True)
else:
file = requests.get(url, stream=True)
file_name = get_filename(file.headers.get('content-disposition'))
filepath = "{}/{}".format(output, file_name)
if verbose:
print ("Downloading {}".format(file_name))
with open(filepath, 'wb') as f:
total_length = int(file.headers.get('content-length'))
for chunk in progress.bar(file.iter_content(chunk_size=chunk_size), expected_size=(total_length/chunk_size) + 1):
if chunk:
f.write(chunk)
f.flush()
if verbose:
print ("Finished")

I come up with a solution that looks a bit nicer based on tqdm. My implementation is based on the answer of #Endophage.
The effect:
# import the download_file definition from the next cell first.
>>> download_file(url, 'some_data.dat')
Downloading some_data.dat.
7%|█▎ | 195.31MB/2.82GB: [00:04<01:02, 49.61MB/s]
The implementation:
import time
import math
import requests
from tqdm import tqdm
def download_file(url, filename, update_interval=500, chunk_size=4096):
def memory2str(mem):
sizes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
power = int(math.log(mem, 1024))
size = sizes[power]
for _ in range(power):
mem /= 1024
if power > 0:
return f'{mem:.2f}{size}'
else:
return f'{mem}{size}'
with open(filename, 'wb') as f:
response = requests.get(url, stream=True)
total_length = response.headers.get('content-length')
if total_length is None:
f.write(response.content)
else:
print(f'Downloading {filename}.', flush=True)
downloaded, total_length = 0, int(total_length)
total_size = memory2str(total_length)
bar_format = '{percentage:3.0f}%|{bar:20}| {desc} [{elapsed}<{remaining}' \
'{postfix}]'
if update_interval * chunk_size * 100 >= total_length:
update_interval = 1
with tqdm(total=total_length, bar_format=bar_format) as bar:
counter = 0
now_time, now_size = time.time(), downloaded
for data in response.iter_content(chunk_size=chunk_size):
f.write(data)
downloaded += len(data)
counter += 1
bar.update(len(data))
if counter % update_interval == 0:
ellapsed = time.time() - now_time
runtime_downloaded = downloaded - now_size
now_time, now_size = time.time(), downloaded
cur_size = memory2str(downloaded)
speed_size = memory2str(runtime_downloaded / ellapsed)
bar.set_description(f'{cur_size}/{total_size}')
bar.set_postfix_str(f'{speed_size}/s')
counter = 0

Simple solution with wget and tqdm python libraries that shows progress in megabytes and remaining time:
MB: 37%|███▋ | 2044.8/5588.7 [02:57<04:30, 13.11it/s]
Install libraries pip3 install wget tqdm
Import libraries
import wget
from tqdm import tqdm
Wrapper class for tqdm
class ProgressBar:
def __init__(self):
self.progress_bar = None
def __call__(self, current_bytes, total_bytes, width):
current_mb = round(current_bytes / 1024 ** 2, 1)
total_mb = round(total_bytes / 1024 ** 2, 1)
if self.progress_bar is None:
self.progress_bar = tqdm(total=total_mb, desc="MB")
delta_mb = current_mb - self.progress_bar.n
self.progress_bar.update(delta_mb)
How to use it
wget.download(url, dst_filepath, ProgressBar())

Here is the "Goat Progress bar" implementation from George Hotz.
r = requests.get(url, stream=True)
progress_bar = tqdm(total=int(r.headers.get('content-length', 0)), unit='B', unit_scale=True, desc=url)
dat = b''.join(x for x in r.iter_content(chunk_size=16384) if progress_bar.update(len(x)) or True)
cc: https://github.com/geohot/tinygrad/commit/7118602c976d264d97af3c1c8b97d72077616d07

You can stream a downloads as it is here -> Stream a Download.
Also you can Stream Uploads.
The most important streaming a request is done unless you try to access the response.content
with just 2 lines
for line in r.iter_lines():
if line:
print(line)
Stream Requests

Related

Progress in bytes when reading CSV from URL with pandas

Because some of the CSV files that I need to read are very large (multiple GB), I am trying to implement a progress bar that indicates the number of bytes read out of the total when reading a CSV file from a URL with pandas.
I am trying to implement something like this:
from tqdm import tqdm
import requests
from sodapy import Socrata
import contextlib
import urllib
import pandas as pd
url = "https://public.tableau.com/views/PPBOpenDataDownloads/UseOfForce-All.csv?:showVizHome=no"
response = requests.get(url, params=None, stream=True)
response.raise_for_status()
total_size = int(response.headers.get('Content-Length', 0))
block_size = 1000
df = []
last_position = 0
cur_position = 1
with tqdm(desc=url, total=total_size,
unit='iB',
unit_scale=True,
unit_divisor=1024
) as bar:
with contextlib.closing(urllib.request.urlopen(url=url)) as rd:
# Create TextFileReader
reader = pd.read_csv(rd, chunksize=block_size)
for chunk in reader:
df.append(chunk)
# Here I would like to calculate the current file position: cur_position
bar.update(cur_position - last_position)
last_position = cur_position
Is there a way to get the file position from the pandas TextFileReader somehow? Perhaps something equivalent to ftell in C++ for TextFileReader?
Not thoroughly tested, but you can implement custom class with read() method where you read from requests response line by line and update the tqdm bar:
import requests
import pandas as pd
from tqdm import tqdm
url = "https://public.tableau.com/views/PPBOpenDataDownloads/UseOfForce-All.csv?:showVizHome=no"
class TqdmReader:
def __init__(self, resp):
total_size = int(resp.headers.get("Content-Length", 0))
self.resp = resp
self.bar = tqdm(
desc=resp.url,
total=total_size,
unit="iB",
unit_scale=True,
unit_divisor=1024,
)
self.reader = self.read_from_stream()
def read_from_stream(self):
for line in self.resp.iter_lines():
line += b"\n"
self.bar.update(len(line))
yield line
def read(self, n=0):
try:
return next(self.reader)
except StopIteration:
return ""
with requests.get(url, params=None, stream=True) as resp:
df = pd.read_csv(TqdmReader(resp))
print(len(df))
Prints:
https://public.tableau.com/views/PPBOpenDataDownloads/UseOfForce-All.csv?:showVizHome=no: 100%|██████████████████████████████████████████████████████████████████████████████| 2.09M/2.09M [00:00<00:00, 2.64MiB/s]
7975

Requests + Tqdm to a variable

I got this function right here from this question (ctrl+ f for "There is an answer with requests and tqdm"):
import requests
from tqdm import tqdm
def download(url: str, fname: str):
resp = requests.get(url, stream=True)
total = int(resp.headers.get('content-length', 0))
with open(fname, 'wb') as file, tqdm(
desc=fname,
total=total,
unit='b',
unit_scale=True,
unit_divisor=1024,
) as bar:
for data in resp.iter_content(chunk_size=1024):
size = file.write(data)
bar.update(size)
Basically it downloads a file and writes it to a file, and I wanted it to return a variable that rapresents the downloaded file, so I made this:
def download(url: str, fname: str):
import requests
from tqdm import tqdm
import os
resp = requests.get(url, stream=True)
total = int(resp.headers.get('content-length', 0))
with open(fname, 'wb') as file, tqdm(
desc=fname,
total=total,
unit='b',
unit_scale=True,
unit_divisor=1024,
) as bar:
for data in resp.iter_content(chunk_size=1024):
size = file.write(data)
bar.update(size)
with open(fname, "rb") as f:
returned = f.read()
os.remove(fname)
return returned
Now it saves the file, reads it and saves it to a variable, deletes the file and returns the variable.
Is there a way I can save it directly to a variable?
Well, you could just return a tqdm iterator, and do whatever you like with the chunks:
import requests
import tqdm
import io
def download(url: str):
resp = requests.get(url, stream=True)
total = int(resp.headers.get('content-length', 0))
with tqdm.tqdm(
desc=url,
total=total,
unit='b',
unit_scale=True,
unit_divisor=1024,
) as bar:
for chunk in resp.iter_content(chunk_size=65536):
bar.update(len(chunk))
yield chunk
bio = io.BytesIO()
for chunk in download('http://...'):
# Do something with the chunk; this just stores it in memory.
bio.write(chunk)
content = bio.getvalue() # Get the contents of the BytesIO() as a bytes.
Of course you can then refactor this to
import requests
import tqdm
import io
def download_as_bytes_with_progress(url: str) -> bytes:
resp = requests.get(url, stream=True)
total = int(resp.headers.get('content-length', 0))
bio = io.BytesIO()
with tqdm.tqdm(
desc=url,
total=total,
unit='b',
unit_scale=True,
unit_divisor=1024,
) as bar:
for chunk in resp.iter_content(chunk_size=65536):
bar.update(len(chunk))
bio.write(chunk)
return bio.getvalue()

Using TQDM almost doubles the file size of my GET request

I've been writing a code to download GRIB (weather) file of of the internet for future use. Right now, I'm only a the stage of downloading and writing in the right folder but for some reason when I ue TQDM for a progress bar, the file size almost doubles. Without the progress the file size is fine.
With the following code I get a 2.3MB file.
import datetime
fsearch = datetime.date.today().strftime('%Y%m%d00')
def sfc_pres():
id = fsearch
url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(id)
r = requests.get(url, allow_redirects=True)
stat=r.status_code
while stat:
if stat==200:
print('Fichier trouvé, téléchargement')
elif stat==404:
print('Fichier introuvable')
break
id = fname
with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(id) , 'wb') as f:
f.write(r.content)
If I use TQDM for a progress bar like so, I get a 4.5MB file.
import datetime
fsearch = datetime.date.today().strftime('%Y%m%d00')
fname = datetime.date.today().strftime('%d-%m-%Y')
def sfc_pres():
id = fsearch
url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(id)
r = requests.get(url, allow_redirects=True)
stat=r.status_code
while stat:
if stat==200:
print('Fichier trouvé, téléchargement')
elif stat==404:
print('Fichier introuvable')
break
from tqdm import tqdm
total_size_in_bytes= int(r.headers.get('content-length', 0))
block_size = 1024
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
id = fname
with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(id) , 'wb') as f:
f.write(r.content)
for data in r.iter_content(block_size):
progress_bar.update(len(data))
f.write(data)
progress_bar.close()
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
print("Échec du téléchargement")
My troubleshooting got me to know it was within the TQDM code but I can't find why...
If you're using r.iter_content you shouldn't also call f.write(r.content) - then you're writing the data twice (and lose the streaming behavior you're trying to get).

Python (3.5) - urllib.request.urlopen - Progress Bar Available?

I'm trying to search the world wide web for this answer, but I feel there answer may be no. I'm using Python 3.5 and a library called urllib.request with a method called urllib.request.urlopen(url) to open a link and download a file.
It would be nice to have some kind of measure of progress for this, as the file(s) are over 200MB. I'm looking at the API here, and don't see any kind of parameter with a hook.
Here's my code:
downloadURL = results[3] #got this from code earlier up
rel_path = account + '/' + eventID + '_' + title + '.mp4'
filename_abs_path = os.path.join(script_dir, rel_path)
print('>>> Downloading >>> ' + title)
# Download .mp4 from a url and save it locally under `file_name`:
with urllib.request.urlopen(downloadURL) as response, open(filename_abs_path, 'wb') as out_file:
shutil.copyfileobj(response, out_file)
Can anyone provide insight if they think I can potentially have a progress bar or would the only way be to use a different library? I'm looking to keep the code quite short and simple, I just need some indication of the file being downloaded. Thanks for any help provided!
If the response includes a content-length you can read the incoming data in blocks and calculate percent done. Unfortunately, web servers that "chunk" responses don't always provide a content length, so it doesn't always work. Here is an example to test.
import urllib.request
import sys
import io
try:
url = sys.argv[1]
except IndexError:
print("usage: test.py url")
exit(2)
resp = urllib.request.urlopen(url)
length = resp.getheader('content-length')
if length:
length = int(length)
blocksize = max(4096, length//100)
else:
blocksize = 1000000 # just made something up
print(length, blocksize)
buf = io.BytesIO()
size = 0
while True:
buf1 = resp.read(blocksize)
if not buf1:
break
buf.write(buf1)
size += len(buf1)
if length:
print('{:.2f}\r done'.format(size/length), end='')
print()
#tdelaney's answer is great, but in Python 3.8 you have to use getvalue() method instead of read():
import io, urllib.request
with urllib.request.urlopen(Url) as Response:
Length = Response.getheader('content-length')
BlockSize = 1000000 # default value
if Length:
Length = int(Length)
BlockSize = max(4096, Length // 20)
print("UrlLib len, blocksize: ", Length, BlockSize)
BufferAll = io.BytesIO()
Size = 0
while True:
BufferNow = Response.read(BlockSize)
if not BufferNow:
break
BufferAll.write(BufferNow)
Size += len(BufferNow)
if Length:
Percent = int((Size / Length)*100)
print(f"download: {Percent}% {Url}")
print("Buffer All len:", len(BufferAll.getvalue()))

PycURL RESUME_FROM

I can't seem to get the RESUME_FROM option to work. Here's some example code that I have been testing with:
import os
import pycurl
import sys
def progress(total, existing, upload_t, upload_d):
try:
frac = float(existing)/float(total)
except:
frac = 0
sys.stdout.write("\r%s %3i%%" % ("file", frac*100) )
url = "http://launchpad.net/keryx/stable/0.92/+download/keryx_0.92.4.tar.gz"
filename = url.split("/")[-1].strip()
def test(debug_type, debug_msg):
print "debug(%d): %s" % (debug_type, debug_msg)
c = pycurl.Curl()
c.setopt(pycurl.URL, url)
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
# Setup writing
if os.path.exists(filename):
f = open(filename, "ab")
c.setopt(pycurl.RESUME_FROM, os.path.getsize(filename))
else:
f = open(filename, "wb")
c.setopt(pycurl.WRITEDATA, f)
#c.setopt(pycurl.VERBOSE, 1)
c.setopt(pycurl.DEBUGFUNCTION, test)
c.setopt(pycurl.NOPROGRESS, 0)
c.setopt(pycurl.PROGRESSFUNCTION, progress)
c.perform()
It was actually resuming properly, however it appeared to be starting from the beginning again because the length from os.path.getsize(filename) was not added to existing in the progress function. Just a minor mistake! :)

Categories