Because some of the CSV files that I need to read are very large (multiple GB), I am trying to implement a progress bar that indicates the number of bytes read out of the total when reading a CSV file from a URL with pandas.
I am trying to implement something like this:
from tqdm import tqdm
import requests
from sodapy import Socrata
import contextlib
import urllib
import pandas as pd
# Question snippet: read a remote CSV in chunks while a tqdm bar is meant to
# show how many bytes of the file have been consumed so far.
url = "https://public.tableau.com/views/PPBOpenDataDownloads/UseOfForce-All.csv?:showVizHome=no"

# The streamed requests call is only used here to learn the advertised size.
response = requests.get(url, params=None, stream=True)
response.raise_for_status()
total_size = int(response.headers.get("Content-Length", 0))

block_size = 1000  # passed to pandas as `chunksize`
df = []
last_position = 0
cur_position = 1

with tqdm(
    desc=url, total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
) as bar, contextlib.closing(urllib.request.urlopen(url=url)) as rd:
    # With chunksize set, read_csv hands back a TextFileReader to iterate over.
    reader = pd.read_csv(rd, chunksize=block_size)
    for chunk in reader:
        df.append(chunk)
        # Open problem: cur_position should be set to the current position in
        # the stream here. It is never recomputed, so only the first update
        # moves the bar.
        bar.update(cur_position - last_position)
        last_position = cur_position
Is there a way to get the file position from the pandas TextFileReader somehow? Perhaps something equivalent to ftell in C++ for TextFileReader?
Not thoroughly tested, but you can implement a custom class with a read() method that reads from the requests response line by line and updates the tqdm bar:
import requests
import pandas as pd
from tqdm import tqdm
url = "https://public.tableau.com/views/PPBOpenDataDownloads/UseOfForce-All.csv?:showVizHome=no"
class TqdmReader:
    """Minimal file-like wrapper around a streamed `requests` response.

    Each `read()` call returns the next line of the response body and advances
    a tqdm bar by the number of bytes handed out, so a consumer such as
    `pandas.read_csv` shows download progress while it parses.

    Args:
        resp: A response object exposing `headers`, `url` and `iter_lines()`.
    """

    def __init__(self, resp):
        # A missing Content-Length falls back to 0.
        total_size = int(resp.headers.get("Content-Length", 0))
        self.resp = resp
        self.bar = tqdm(
            desc=resp.url,
            total=total_size,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
        )
        self.reader = self.read_from_stream()

    def read_from_stream(self):
        """Yield the body line by line, updating the bar for every line."""
        for line in self.resp.iter_lines():
            # A newline is appended to each line before it is counted and
            # handed to the consumer.
            line += b"\n"
            self.bar.update(len(line))
            yield line

    def read(self, n=0):
        """Return the next line as bytes, or b"" once the stream is exhausted.

        `n` is accepted for file-object compatibility but ignored: one whole
        line is returned per call.
        """
        try:
            return next(self.reader)
        except StopIteration:
            # Finalise the bar, and signal EOF with the same type (bytes) as
            # the data returned before it.
            self.bar.close()
            return b""
# Stream the response so pandas pulls the body through TqdmReader while parsing.
with requests.get(url, params=None, stream=True) as resp:
    progress_stream = TqdmReader(resp)
    df = pd.read_csv(progress_stream)

row_count = len(df)
print(row_count)
Prints:
https://public.tableau.com/views/PPBOpenDataDownloads/UseOfForce-All.csv?:showVizHome=no: 100%|██████████████████████████████████████████████████████████████████████████████| 2.09M/2.09M [00:00<00:00, 2.64MiB/s]
7975
I've been writing code to download a GRIB (weather) file off of the internet for future use. Right now, I'm only at the stage of downloading and writing to the right folder, but for some reason, when I use TQDM for a progress bar, the file size almost doubles. Without the progress bar the file size is fine.
With the following code I get a 2.3MB file.
import datetime
fsearch = datetime.date.today().strftime('%Y%m%d00')
def sfc_pres():
    """Download the surface-pressure GRIB2 file for the `fsearch` stamp.

    The URL is built from the module-level `fsearch` value. The file is only
    written to disk when the server answers 200; on 404 or any other status a
    message is printed and nothing is saved, so an error page never ends up
    stored as a .grib2 file.
    """
    url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(fsearch)
    r = requests.get(url, allow_redirects=True)
    stat = r.status_code

    if stat == 404:
        print('Fichier introuvable')
        return
    if stat != 200:
        print('Échec du téléchargement (HTTP {0})'.format(stat))
        return
    print('Fichier trouvé, téléchargement')

    # Build the dd-mm-YYYY label locally so this function does not depend on a
    # `fname` global that this snippet never defines.
    date_label = datetime.date.today().strftime('%d-%m-%Y')
    target = r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(date_label)
    with open(target, 'wb') as f:
        f.write(r.content)
If I use TQDM for a progress bar like so, I get a 4.5MB file.
import datetime
fsearch = datetime.date.today().strftime('%Y%m%d00')
fname = datetime.date.today().strftime('%d-%m-%Y')
def sfc_pres():
    """Download the surface-pressure GRIB2 file with a tqdm progress bar.

    The URL is built from the module-level `fsearch` stamp and the output file
    name from the module-level `fname` label. The body is streamed and written
    exactly once, chunk by chunk, while the bar is advanced by the size of
    each chunk.
    """
    from tqdm import tqdm

    url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(fsearch)
    # stream=True defers the body download so iter_content() can report
    # progress as the data actually arrives.
    r = requests.get(url, allow_redirects=True, stream=True)
    stat = r.status_code

    if stat == 404:
        print('Fichier introuvable')
        r.close()
        return
    if stat == 200:
        print('Fichier trouvé, téléchargement')

    total_size_in_bytes = int(r.headers.get('content-length', 0))
    block_size = 1024
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    target = r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(fname)
    try:
        with open(target, 'wb') as f:
            # Write only the streamed chunks. Also calling f.write(r.content)
            # would store the whole body a second time and double the file.
            for data in r.iter_content(block_size):
                progress_bar.update(len(data))
                f.write(data)
    finally:
        progress_bar.close()
        r.close()

    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("Échec du téléchargement")
My troubleshooting got me to know it was within the TQDM code but I can't find why...
If you're using r.iter_content you shouldn't also call f.write(r.content) - then you're writing the data twice (and lose the streaming behavior you're trying to get).
I can find some documentation explaining how to use the tqdm package, but I can't figure out from it how to produce a progress meter when downloading data online.
Below is an example code I copied from ResidentMario for downloading data
def download_file(url, filename):
    """
    Stream the body of `url` into `filename` in 1024-byte chunks.

    Empty (keep-alive) chunks are skipped. Returns `filename`.
    """
    response = requests.get(url, stream=True)
    with open(filename, 'wb') as out:
        # filter(None, ...) drops the empty keep-alive chunks.
        out.writelines(filter(None, response.iter_content(chunk_size=1024)))
    return filename
# Example call: save the CSV export under a local file name.
source_url = "https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD"
target_name = "NYPD Motor Vehicle Collisions.csv"
dat = download_file(source_url, target_name)
Could anyone show me how to use tqdm package here to show downloading progress?
Thanks
As of now I do something like this:
def download_file(url, filename):
    """
    Stream `url` to `filename` while showing a byte-based tqdm progress bar.

    Args:
        url: Address of the file to download.
        filename: Path of the local file to write.

    Returns:
        `filename`, unchanged.
    """
    chunkSize = 1024
    r = requests.get(url, stream=True)
    # Servers may omit Content-Length; with total=None tqdm still counts bytes
    # instead of the lookup raising KeyError.
    total = int(r.headers.get('Content-Length', 0)) or None
    # Using the bar as a context manager closes it even if the download fails.
    with open(filename, 'wb') as f, tqdm(unit="B", total=total) as pbar:
        for chunk in r.iter_content(chunk_size=chunkSize):
            if chunk:  # filter out keep-alive new chunks
                pbar.update(len(chunk))
                f.write(chunk)
    return filename
pbar.clear() and pbar.close()
Manually update the progress bar, useful for streams such as reading files.
https://github.com/tqdm/tqdm#returns
def download_file(url, filename):
    """
    Stream `url` to `filename`, updating a tqdm bar manually per chunk.

    The bar is cleared right after creation to hide the initial 0% line and is
    always closed, even when the transfer raises.

    Args:
        url: Address of the file to download.
        filename: Path of the local file to write.

    Returns:
        `filename`, unchanged.
    """
    r = requests.get(url, stream=True)
    # A missing Content-Length must not abort the download; total=None makes
    # tqdm show a plain byte counter.
    total = int(r.headers.get('Content-Length', 0)) or None
    with open(filename, 'wb') as f:
        pbar = tqdm(unit="B", unit_scale=True, unit_divisor=1024, total=total)
        pbar.clear()  # clear 0% info
        try:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    pbar.update(len(chunk))
                    f.write(chunk)
        finally:
            pbar.close()
    return filename
Thanks to silmaril, but the below works and makes more sense to me.
def download_file(url, filename):
    """
    Stream `url` to `filename`, showing progress as a count of 1024-byte chunks.

    Args:
        url: Address of the file to download.
        filename: Path of the local file to write.
    """
    r = requests.get(url, stream=True)
    filelength = int(r.headers.get('Content-Length', 0))
    # Ceiling division: a trailing partial chunk is still one bar step, so the
    # bar can end exactly at 100%. An unknown length becomes total=None.
    total_chunks = -(-filelength // 1024) or None
    with open(filename, 'wb') as f, tqdm(total=total_chunks) as pbar:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                pbar.update()
                f.write(chunk)
I have a Python script that launches a URL that is a downloadable file. Is there some way to have Python display the download progress as opposed to launching the browser?
I've just written a super simple (slightly hacky) approach to this for scraping PDFs off a certain site. Note, it only works correctly on Unix based systems (Linux, mac os) as PowerShell does not handle "\r":
import sys
import requests
# Download `link` to `file_name`, drawing a 50-character "=" bar on stdout.
link = "http://indy/abcde1245"
file_name = "download.data"
with open(file_name, "wb") as f:
    print("Downloading %s" % file_name)
    response = requests.get(link, stream=True)
    total_length = response.headers.get('content-length')
    # Treat a missing, empty or zero Content-Length as "size unknown" so the
    # percentage computation below can never divide by zero.
    total_length = int(total_length) if total_length else 0

    if total_length <= 0:  # nothing to measure progress against
        f.write(response.content)
    else:
        dl = 0
        for data in response.iter_content(chunk_size=4096):
            dl += len(data)
            f.write(data)
            done = int(50 * dl / total_length)
            # "\r" returns to the start of the line so the bar redraws in place.
            sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50-done)) )
            sys.stdout.flush()
It uses the requests library so you'll need to install that. This outputs something like the following into your console:
>Downloading download.data
>[============= ]
The progress bar is 52 characters wide in the script (2 characters are simply the [] so 50 characters of progress). Each = represents 2% of the download.
You can use the 'clint' package (written by the same author as 'requests') to add a simple progress bar to your downloads like this:
from clint.textui import progress
# Stream `url` to disk while clint's progress.bar draws one tick per chunk.
r = requests.get(url, stream=True)
path = '/some/path/for/file.txt'
with open(path, 'wb') as f:
    total_length = int(r.headers.get('content-length'))
    # One tick per 1024-byte chunk, plus one for a trailing partial chunk.
    expected_chunks = (total_length/1024) + 1
    tracked_chunks = progress.bar(r.iter_content(chunk_size=1024), expected_size=expected_chunks)
    for chunk in tracked_chunks:
        if not chunk:
            continue
        f.write(chunk)
        f.flush()
which will give you a dynamic output which will look like this:
[################################] 5210/5210 - 00:00:01
It should work on multiple platforms as well! You can also change the bar to dots or a spinner with .dots and .mill instead of .bar.
Enjoy!
Python 3 with TQDM
This is the suggested technique from the TQDM docs.
import urllib.request
from tqdm import tqdm
class DownloadProgressBar(tqdm):
    """tqdm bar that can be driven with absolute block counts."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to `b * bsize` units; adopt `tsize` as total if given."""
        if tsize is not None:
            self.total = tsize
        transferred = b * bsize
        # tqdm.update() expects a delta, so subtract what is already shown.
        self.update(transferred - self.n)
def download_url(url, output_path):
    """Fetch `url` into `output_path`, reporting progress via urlretrieve's hook."""
    # Label the bar with the last path segment of the URL.
    label = url.rsplit('/', 1)[-1]
    progress = DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=label)
    with progress as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)
There is an answer with requests and tqdm.
import requests
from tqdm import tqdm
def download(url: str, fname: str):
    """Stream `url` into the file `fname`, advancing a tqdm bar per chunk written."""
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get('content-length', 0))
    bar_options = {
        'desc': fname,
        'total': total,
        'unit': 'iB',
        'unit_scale': True,
        'unit_divisor': 1024,
    }
    # The file could equally be an io.BytesIO object.
    with open(fname, 'wb') as file, tqdm(**bar_options) as bar:
        for data in resp.iter_content(chunk_size=1024):
            # write() returns the number of bytes written; feed it to the bar.
            bar.update(file.write(data))
Gist: https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51
Another good option is wget:
import wget
# wget.download draws its own progress output while fetching the archive.
geonames_url = 'http://download.geonames.org/export/zip/US.zip'
wget.download(geonames_url)
The output will look like this:
11% [........ ] 73728 / 633847
Source: https://medium.com/@petehouston/download-files-with-progress-in-python-96f14f6417a2
You can also use click. It has a good library for progress bar:
import click
# One bar for the whole batch: it advances by each file's size after download.
progress_options = {'length': total_size, 'label': 'Downloading files'}
with click.progressbar(**progress_options) as bar:
    for file in files:
        download(file)
        bar.update(file.size)
Sorry for being late with an answer; just updated the tqdm docs:
https://github.com/tqdm/tqdm/#hooks-and-callbacks
Using urllib.urlretrieve and OOP:
import urllib
from tqdm.auto import tqdm
class TqdmUpTo(tqdm):
    """tqdm subclass adding `update_to`, an absolute-position update hook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """
        Set the bar to `b * bsize` units.

        b     : number of blocks transferred so far
        bsize : size of each block
        tsize : total size; when not None it replaces `self.total`
        """
        if tsize is not None:
            self.total = tsize
        position = b * bsize
        # update() takes a delta; afterwards self.n equals `position`.
        self.update(position - self.n)
# Download a release wheel with urlretrieve, using TqdmUpTo as the reporthook.
# On Python 3 urlretrieve lives in urllib.request; a bare `import urllib`
# does not provide `urllib.urlretrieve`.
import urllib.request

eg_link = "https://github.com/tqdm/tqdm/releases/download/v4.46.0/tqdm-4.46.0-py2.py3-none-any.whl"
eg_file = eg_link.split('/')[-1]
with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
              desc=eg_file) as t:  # all optional kwargs
    urllib.request.urlretrieve(
        eg_link, filename=eg_file, reporthook=t.update_to, data=None)
    # Pin the total to what was actually counted so the bar finishes at 100%.
    t.total = t.n
or using requests.get and file wrappers:
import requests
from tqdm.auto import tqdm
# Download via requests, wrapping the output file's write() so tqdm counts
# every byte written.
eg_link = "https://github.com/tqdm/tqdm/releases/download/v4.46.0/tqdm-4.46.0-py2.py3-none-any.whl"
eg_file = eg_link.split('/')[-1]
response = requests.get(eg_link, stream=True)

sink = open(eg_file, "wb")
expected_bytes = int(response.headers.get('content-length', 0))
with tqdm.wrapattr(sink, "write", miniters=1, total=expected_bytes,
                   desc=eg_file) as fout:
    for chunk in response.iter_content(chunk_size=4096):
        fout.write(chunk)
You could of course mix & match techniques.
The tqdm package now includes a function designed to handle exactly this type of situation: wrapattr. You just wrap an object's read (or write) attribute, and tqdm handles the rest. Here's a simple download function that puts it all together with requests:
def download(url, filename):
    """Download `url` to `filename` with a tqdm bar and return the resolved path.

    Parent directories of the target are created when missing. Any response
    status other than 200 raises.
    """
    import functools
    import pathlib
    import shutil
    import requests
    import tqdm

    response = requests.get(url, stream=True, allow_redirects=True)
    if response.status_code != 200:
        # raise_for_status() only raises for HTTP error statuses (4xx/5xx),
        # so every other non-200 status needs the explicit error below.
        response.raise_for_status()
        raise RuntimeError(f"Request to {url} returned status code {response.status_code}")
    file_size = int(response.headers.get('Content-Length', 0))

    target = pathlib.Path(filename).expanduser().resolve()
    target.parent.mkdir(parents=True, exist_ok=True)

    if file_size == 0:
        label = "(Unknown total file size)"
    else:
        label = ""

    # Force decode_content=True on every raw read (decompress if needed).
    response.raw.read = functools.partial(response.raw.read, decode_content=True)
    wrapped = tqdm.tqdm.wrapattr(response.raw, "read", total=file_size, desc=label)
    with wrapped as r_raw, target.open("wb") as out:
        shutil.copyfileobj(r_raw, out)

    return target
# Define Progress Bar function
def print_progressbar(total, current, barsize=60):
    """Print a one-line text progress bar that redraws itself in place.

    Args:
        total: Number of steps in the whole task (must be non-zero).
        current: Number of steps completed so far.
        barsize: Width of the bar body in characters.
    """
    progress = int(current*barsize/total)
    completed = str(int(current*100/total)) + '%'
    # The counter is built from `current`. Reading a loop variable `i` from
    # the caller's global scope fails or prints a stale value whenever the
    # caller's variable has a different name.
    counter = str(current) + '/' + str(total)
    # end='\r' moves the cursor back to the line start so the next call
    # overwrites this bar.
    print('[', chr(9608)*progress, ' ', completed, '.'*(barsize-progress), '] ', counter, sep='', end='\r', flush=True)
# Sample Code
# Demo driver: run 6000 steps and redraw the bar roughly once per bar cell.
total = 6000
barsize = 60
steps_per_cell = total//barsize
# Redraw at most every 100 steps and at least every step.
print_frequency = max(min(steps_per_cell, 100), 1)

print("Start Task..", flush=True)
for i in range(1, total+1):
    # Always draw the very first step so the bar shows up immediately.
    if i == 1 or i%print_frequency == 0:
        print_progressbar(total, i, barsize)
print("\nFinished", flush=True)
# Snapshot of Progress Bar :
The lines below are for illustration only. In the command prompt you will see a single progress bar showing incremental progress.
[ 0%............................................................] 1/6000
[██████████ 16%..................................................] 1000/6000
[████████████████████ 33%........................................] 2000/6000
[██████████████████████████████ 50%..............................] 3000/6000
[████████████████████████████████████████ 66%....................] 4000/6000
[██████████████████████████████████████████████████ 83%..........] 5000/6000
[████████████████████████████████████████████████████████████ 100%] 6000/6000
Just some improvements to @rich-jones's answer
import re
import request
from clint.textui import progress
def get_filename(cd):
    """
    Extract the file name from a Content-Disposition header value.

    Only the `filename=` parameter is read. A quoted value is taken up to its
    closing quote and an unquoted value up to the next `;`, so parameters that
    follow the name (for example `; size=120`) are not swallowed into it.

    Args:
        cd: The header value, or None/"" when the header is absent.

    Returns:
        The file name without quotes or surrounding whitespace, or None when
        no usable name is present.
    """
    if not cd:
        return None
    match = re.search(r'filename=(?:"([^"]*)"|([^;]+))', cd, flags=re.IGNORECASE)
    if match is None:
        return None
    quoted, bare = match.groups()
    name = quoted if quoted is not None else bare
    # A stray unmatched quote can survive in the bare form; drop it.
    name = name.replace('"', "").strip()
    return name or None
def stream_download_file(url, output, chunk_size=1024, session=None, verbose=False):
    """
    Stream `url` into the directory `output`, drawing a clint progress bar.

    Args:
        url: Address of the file to download.
        output: Directory the file is written into.
        chunk_size: Number of bytes requested per chunk.
        session: Optional object with a `get` method (e.g. a requests
            session); the `requests` module itself is used when omitted.
        verbose: Print a line before and after the download.
    """
    client = session if session else requests
    file = client.get(url, stream=True)

    file_name = get_filename(file.headers.get('content-disposition'))
    if not file_name:
        # No usable Content-Disposition: fall back to the last URL path
        # segment instead of writing to a file literally called "None".
        file_name = url.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1] or 'download'
    filepath = "{}/{}".format(output, file_name)

    if verbose:
        print ("Downloading {}".format(file_name))

    chunks = file.iter_content(chunk_size=chunk_size)
    content_length = file.headers.get('content-length')
    if content_length is not None:
        # The bar needs an expected size; without Content-Length the file is
        # still downloaded, just without a bar (previously int(None) raised).
        chunks = progress.bar(chunks, expected_size=(int(content_length)/chunk_size) + 1)

    with open(filepath, 'wb') as f:
        for chunk in chunks:
            if chunk:
                f.write(chunk)
                f.flush()

    if verbose:
        print ("Finished")
I came up with a solution that looks a bit nicer, based on tqdm. My implementation is based on the answer of @Endophage.
The effect:
# import the download_file definition from the next cell first.
>>> download_file(url, 'some_data.dat')
Downloading some_data.dat.
7%|█▎ | 195.31MB/2.82GB: [00:04<01:02, 49.61MB/s]
The implementation:
import time
import math
import requests
from tqdm import tqdm
def download_file(url, filename, update_interval=500, chunk_size=4096):
    """Stream `url` to `filename` with a tqdm bar showing size and speed.

    Args:
        url: Address of the file to download.
        filename: Path of the local file to write.
        update_interval: Number of chunks between refreshes of the
            "downloaded/total" description and the speed postfix.
        chunk_size: Number of bytes requested per chunk.
    """
    def memory2str(mem):
        """Format a byte count (int or float, >= 0) with a B..PB unit."""
        sizes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
        # Repeated division instead of math.log: it is exact at unit
        # boundaries, accepts 0, and cannot index past the largest unit.
        power = 0
        while mem >= 1024 and power < len(sizes) - 1:
            mem /= 1024
            power += 1
        if power > 0:
            return f'{mem:.2f}{sizes[power]}'
        return f'{mem}{sizes[power]}'

    with open(filename, 'wb') as f:
        response = requests.get(url, stream=True)
        total_length = response.headers.get('content-length')
        if total_length is None:
            # Size unknown: download in one go without a bar.
            f.write(response.content)
        else:
            print(f'Downloading {filename}.', flush=True)
            downloaded, total_length = 0, int(total_length)
            total_size = memory2str(total_length)
            bar_format = '{percentage:3.0f}%|{bar:20}| {desc} [{elapsed}<{remaining}' \
                         '{postfix}]'
            # For small files refresh the text on every chunk.
            if update_interval * chunk_size * 100 >= total_length:
                update_interval = 1
            with tqdm(total=total_length, bar_format=bar_format) as bar:
                counter = 0
                now_time, now_size = time.time(), downloaded
                for data in response.iter_content(chunk_size=chunk_size):
                    f.write(data)
                    downloaded += len(data)
                    counter += 1
                    bar.update(len(data))
                    if counter % update_interval == 0:
                        ellapsed = time.time() - now_time
                        runtime_downloaded = downloaded - now_size
                        now_time, now_size = time.time(), downloaded

                        cur_size = memory2str(downloaded)
                        # A coarse clock can report zero elapsed time between
                        # two refreshes; show 0 instead of dividing by zero.
                        speed = runtime_downloaded / ellapsed if ellapsed > 0 else 0
                        speed_size = memory2str(speed)
                        bar.set_description(f'{cur_size}/{total_size}')
                        bar.set_postfix_str(f'{speed_size}/s')

                        counter = 0
Simple solution with wget and tqdm python libraries that shows progress in megabytes and remaining time:
MB: 37%|███▋ | 2044.8/5588.7 [02:57<04:30, 13.11it/s]
Install libraries pip3 install wget tqdm
Import libraries
import wget
from tqdm import tqdm
Wrapper class for tqdm
class ProgressBar:
    """Callable that mirrors (current, total) byte counts onto a tqdm bar in MB."""

    def __init__(self):
        # The bar is created on the first callback, once the total is known.
        self.progress_bar = None

    def __call__(self, current_bytes, total_bytes, width):
        """Advance the bar to `current_bytes`; `width` is accepted but unused."""
        bytes_per_mb = 1024 ** 2
        current_mb = round(current_bytes / bytes_per_mb, 1)
        total_mb = round(total_bytes / bytes_per_mb, 1)

        if self.progress_bar is None:
            self.progress_bar = tqdm(total=total_mb, desc="MB")

        # tqdm.update() takes a delta, so subtract the position already shown.
        self.progress_bar.update(current_mb - self.progress_bar.n)
How to use it
# Hand a fresh ProgressBar to wget as its progress callback.
progress_callback = ProgressBar()
wget.download(url, dst_filepath, progress_callback)
Here is the "Goat Progress bar" implementation from George Hotz.
# Download the whole body into memory, advancing the bar once per chunk.
r = requests.get(url, stream=True)
expected_bytes = int(r.headers.get('content-length', 0))
progress_bar = tqdm(total=expected_bytes, unit='B', unit_scale=True, desc=url)

received = []
for x in r.iter_content(chunk_size=16384):
    progress_bar.update(len(x))
    received.append(x)
dat = b''.join(received)
cc: https://github.com/geohot/tinygrad/commit/7118602c976d264d97af3c1c8b97d72077616d07
You can stream a download as shown here -> Stream a Download.
You can also Stream Uploads.
Most importantly, the request stays streamed unless you access response.content,
and it takes just 2 lines:
# Print each non-empty line of the response body as it is iterated.
for line in filter(None, r.iter_lines()):
    print(line)
Stream Requests