machine learning/data mining for pattern data finding - python

I have some problems in my audio network (I work with IP audio): from time to time there are short gaps in my audio stream. I have a logger which records all the streams. I've written a small script with Python and ffmpeg (and borrowed some JavaScript for visualizing :)) to find the gaps in the logger's mp3 files. It's better than nothing, but I get a lot of false detections, and it's quite annoying to check all the results manually: the script finds from 20 to 200 gaps per hour, and usually only 1-10 of them are caused by an actual fault; all the others are short-term low audio levels in songs, speech, etc. I'm looking for a high-level machine learning/data mining mechanism to check the gaps automatically and leave only the ones I want. I can provide a lot of "true" gaps (arrays with data) and "false" gaps to train the machine, and after that I just want to feed it a data sample with a gap inside and have it tell me whether it looks like a "true" gap or not. What can you recommend for the fastest solution? Note that Python is the only thing I can write a little. :/ At the moment the gap-searching code is the following. It finds gaps with a duration of more than gap_min and less than gap_max seconds in an mp3 file or a folder of files.
import numpy as np
import subprocess, os, sys
import ntpath

tolerance = 150  # 100
gap_min = 0.007  # 0.021
gap_max = 0.035  # 0.03
sample_rate = 48000
gap_file_duration = 3  # duration of the output mp3 files with gaps, in seconds
ffmpeg_path = r'/Applications/ffmpeg'
temp_folder = r'/Users/user/Downloads/'
result_folder = r'/Users/user/Downloads/tmp/'
target_LUFS = -9  # in LUFS

def samples_to_timecode(samples):
    return '{0:02d}:{1:02d}:{2:02d}.{3:02d}'.format(int(samples / (3600 * sample_rate)),
                                                    int(samples / (60 * sample_rate) % 60),
                                                    int(samples / sample_rate % 60),
                                                    int(samples % sample_rate))

def timecode_to_samples(timecode):
    return sum(f * int(t) for f, t in zip((3600 * sample_rate, 60 * sample_rate, sample_rate, 1), timecode.split(':')))

def seconds_to_timecode(seconds):
    return '{0:02d}:{1:02d}:{2:03f}'.format(int(seconds / 3600),
                                            int(seconds / 60 % 60),
                                            seconds % 60)

def analyze_bin_file(source_file):
    print('Analyzing start...')
    data = np.memmap(source_file, dtype='h', mode='r')
    zero_indexes = np.where(np.logical_and(data >= -tolerance, data <= tolerance))
    gap_start = None
    gaps_array = []
    for i in range(len(zero_indexes[0]) - 1):
        if zero_indexes[0][i + 1] - zero_indexes[0][i] == 1:
            if gap_start is None:  # 'is None' so a gap starting at index 0 is not missed
                gap_start = i
        else:
            if gap_start is not None:
                if ((zero_indexes[0][i] - zero_indexes[0][gap_start]) >= (gap_min * sample_rate)) and ((zero_indexes[0][i] - zero_indexes[0][gap_start]) <= (gap_max * sample_rate)):
                    gaps_array.append([float(zero_indexes[0][gap_start]) / sample_rate,
                                       float(zero_indexes[0][i]) / sample_rate,
                                       samples_to_timecode(zero_indexes[0][gap_start]),
                                       round(float(zero_indexes[0][i] - zero_indexes[0][gap_start]) / sample_rate, 3)])
                    print('Gaps found: %s' % len(gaps_array))
                gap_start = None
    os.remove(source_file)  # For some reason this works badly on Windows. Comment this line out if it causes problems, but then you should delete the temporary bin files manually.
    print('Analyzing done!')
    return gaps_array

def execute_cmd(cmd):
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    return out.rstrip(), err.rstrip(), p.returncode

def prepare_bin_file(source_file):
    print('Start preparing binary file...')
    result_file_path = temp_folder + ntpath.basename(source_file) + '.bin'
    result = execute_cmd('{0} -i {1} -ar {4} -af volume={3} -ac 1 -map 0:a -c:a pcm_s16le -y -f data {2}'.format(ffmpeg_path,
                                                                                                                 source_file,
                                                                                                                 result_file_path,
                                                                                                                 volume,
                                                                                                                 sample_rate))
    if result[2] == 0:
        print('Preparing done!')
        return result_file_path
    else:
        print('Error occurred while preparing!')

def cut_gaps(mp3_file, gaps_array):
    print('Cutting file {0} start...'.format(mp3_file))
    result_files = []
    path_list = mp3_file.split(os.sep)
    for gap in range(len(gaps_array)):
        gap_start = seconds_to_timecode(gaps_array[gap][0] - float(gap_file_duration) / 2)
        gap_duration = gap_file_duration + gaps_array[gap][3]
        result = execute_cmd('{0} -y -i {1} -ss {2} -t {3} -c:a copy {4}'.format(ffmpeg_path,
                                                                                 mp3_file,
                                                                                 gap_start,
                                                                                 gap_duration,
                                                                                 result_folder + path_list[-2] + os.sep + 'mp3' + os.sep + ntpath.basename(mp3_file) + '.{0:03d}'.format(gap) + '.mp3'))
        result_files.append(ntpath.basename(mp3_file) + '.{0:03d}'.format(gap) + '.mp3')
        print('Cutting file {0} of {1} {2}'.format(gap + 1, len(gaps_array), 'OK' if (result[-1] == 0) else 'ERROR'))
    print('Cutting done!')
    return result_files

def make_report(source_file, gaps_array, cut_files):
    path_list = source_file.split(os.sep)
    report = open(result_folder + path_list[-2] + os.sep + ntpath.basename(source_file) + '.html', 'w')
    report.write('<!doctype html><html lang=""><head></head><body><script src="https://cdnjs.cloudflare.com/ajax/libs/wavesurfer.js/1.1.2/wavesurfer.min.js"></script>')
    report.write('<div>File {0} analyzing report<br>'.format(source_file))
    report.write('Searching parameters:<br>Gap minimum {0} second<br>Gap maximum {1} second<br>Tolerance value {2}<br>Analyze volume {3} dB<hr><hr></div>'.format(gap_min,
                                                                                                                                                                  gap_max,
                                                                                                                                                                  tolerance,
                                                                                                                                                                  volume))
    if len(gaps_array) > 0:
        for gap_no in range(len(gaps_array)):
            report.write('<div>Gap No {0}<br>Gap start {1}<br>Gap duration {2}ms</div>'.format(gap_no,
                                                                                               gaps_array[gap_no][2],
                                                                                               gaps_array[gap_no][3] * 1000))
            html = """
            <div id='waveform""" + str(gap_no) + """'></div>
            <div style='text-align: center'>
                <button class='btn btn-primary' onclick='wavesurfer""" + str(gap_no) + """.playPause()'>
                    <i class='glyphicon glyphicon-play'></i>
                    Play
                </button>
                <p class='row'>
                    <div class='col-xs-1'>
                        <i class='glyphicon glyphicon-zoom-in'></i>
                    </div>
                    <div class='col-xs-10'>
                        <input id='slider""" + str(gap_no) + """' type='range' min='1' max='4000' value='1' style='width: 100%' />
                    </div>
                    <div class='col-xs-1'>
                        <i class='glyphicon glyphicon-zoom-out'></i>
                    </div>
                </p>
            </div>
            """
            report.write(html)
            script = """
            <script>
                var wavesurfer""" + str(gap_no) + """ = WaveSurfer.create({
                    container: '#waveform""" + str(gap_no) + """',
                    waveColor: 'red',
                    progressColor: 'purple'
                });
                wavesurfer""" + str(gap_no) + """.load('./mp3/""" + cut_files[gap_no] + """');
                var slider""" + str(gap_no) + """ = document.querySelector('#slider""" + str(gap_no) + """');
                slider""" + str(gap_no) + """.oninput = function () {
                    var zoomLevel = Number(slider""" + str(gap_no) + """.value);
                    wavesurfer""" + str(gap_no) + """.zoom(zoomLevel);
                };
            </script>
            """
            report.write(script)
    else:
        report.write('<div>No gaps found!</div>')
    report.write('</body></html>')
    report.close()

def normalize_file(source):
    print('Analyzing integrated loudness...')
    result = execute_cmd('{0} -nostats -i {1} -filter_complex ebur128 -f null -'.format(ffmpeg_path, source))
    if result[-1] == 0:
        summary_index = str(result[1][-255:]).rfind('Summary:')
        summary_list = str(result[1][-255:][summary_index:]).split()
        I_LUFS = float(summary_list[summary_list.index('I:') + 1])
        gainLog = -(I_LUFS - target_LUFS)
        volume = 10 ** (gainLog / 20)
        print('Analyzing complete. I= {0} LUFS. Volume change value={1}.'.format(I_LUFS, volume))
    else:
        print('Error!')
        volume = 1  # fall back to unity gain if the loudness analysis failed
    return volume

def run(source):
    if os.path.isfile(source) or os.path.isdir(source):
        path_list = source.split(os.sep)
        if not os.path.isdir(result_folder + path_list[-2]):
            os.makedirs(result_folder + path_list[-2])
        if not os.path.isdir(result_folder + path_list[-2] + os.sep + 'mp3'):
            os.makedirs(result_folder + path_list[-2] + os.sep + 'mp3')
    else:
        print('Error! File or folder {0} not found!'.format(source))
        return
    if os.path.isfile(source):
        global volume
        volume = normalize_file(source)
        bin_file = prepare_bin_file(source)
        gaps_array = analyze_bin_file(bin_file)
        if len(gaps_array):
            cut_files = cut_gaps(source, gaps_array)
            make_report(source, gaps_array, cut_files)
        else:
            make_report(source, gaps_array, cut_files=[])
    elif os.path.isdir(source):
        for file in os.listdir(source):
            if file.endswith(".mp3"):
                print(source, file)
                run(source + os.sep + file)

src = r'/Users/user/Downloads/2016-08-02'
if len(sys.argv) > 1:
    run(sys.argv[1])
else:
    run(src)
The result is an HTML file with waveforms. It works properly only in Firefox.
False gaps:
[waveform image: example of a false gap]
True gaps:
[waveform image: example of a true gap]
UPDATE. Because the algorithm is very sensitive to the volume level, I've added volume normalization before analyzing the data. It doesn't apply to the output files; it just normalizes the data before it is analyzed.
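For the classification part, here is a minimal scikit-learn sketch, assuming each gap is described by a fixed-length window of samples around it. The window features, the RandomForestClassifier choice, and the labeled_windows/labels names are illustrative assumptions, not part of the script above; labeled_windows and labels stand for the "true"/"false" gap arrays mentioned in the question.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def gap_features(window):
    # summarize a sample window around a gap (hypothetical feature set)
    w = np.asarray(window, dtype=float)
    return [w.min(), w.max(), np.abs(w).mean(), w.std(),
            np.abs(np.diff(w)).mean()]  # roughness: how abruptly the level changes

# labeled_windows: list of sample arrays around known gaps (assumed to exist)
# labels: 1 for "true" gaps, 0 for "false" detections (assumed to exist)
X = np.array([gap_features(w) for w in labeled_windows])
y = np.array(labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
clf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_train, y_train)
print('held-out accuracy:', clf.score(X_test, y_test))

# later, for a freshly detected gap:
# clf.predict([gap_features(new_window)])  -> array([1]) for a "true" gap

A random forest on a handful of window statistics is usually a fast first baseline to try before reaching for anything audio-specific.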

Related

How to parse and search a list of inter-dependent strings?

Not sure how to word the title but I have this list:
Sink Input #1535
Driver: protocol-native.c
Owner Module: 10
Client: 21932
Sink: 0
Sample Specification: s16le 2ch 44100Hz
Channel Map: front-left,front-right
Format: pcm, format.sample_format = "\"s16le\"" format.rate = "44100" format.channels = "2" format.channel_map = "\"front-left,front-right\""
Corked: no
Mute: no
Volume: front-left: 32768 / 50% / -18.06 dB, front-right: 32768 / 50% / -18.06 dB
balance 0.00
Buffer Latency: 0 usec
Sink Latency: 23084 usec
Resample method: n/a
Properties:
media.name = "Simple DirectMedia Layer"
application.name = "ffplay"
with a whole bunch of other stuff following.
First I need to match on Sink Input # and record the digits that follow it up to the end of the line. Then I have to search for application.name = and record the program name that follows in quotes. Then the search has to repeat for multiple sinks and program names. Later I plan to return all Sink Input numbers for a given application name.
Current method uses brute force and high system resources. Is there a better method than this:
def sink_list(prog, func):
    ''' Return list of Firefox or ffplay input sink indices
    '''
    indices = []
    result = os.popen('pactl list short sink-inputs') \
        .read().strip().splitlines()
    # TODO: We could be doing one os.popen and grabbing all sinks at once
    if len(result) == 0:
        print('sink_list() found no input sinks at all.'
              ' Called by: ' + func)
        return indices
    for line in result:
        sink = line.split('\t')[0]
        app = os.popen('pactl list sink-inputs | grep "Sink Input #' +
                       sink + '" -A20 | grep application.name').read()
        # print("Searching for:", prog, " in:", app, " using input sink#:", sink)
        if prog in app:
            indices.append(sink)
            # print('indices', prog, ':', indices)
    if len(indices) == 0:
        print("sink_list() found no input sink for: '" + prog +
              "' called by: " + func)
        return indices
    # print("Found Input Sinks:", indices)
    return indices
Reply to comments
Input was requested:
''' Get old PID's and Input Sinks before ffplay '''
old_pid = pid_list( "ffplay", "play_start()" )
old_sink = sink_list( "ffplay", "play_start()" )
self.have_ffplay_input_sink = False # Each ffplay can have diff #
# Launch ffplay in the background. CANNOT query result, it stops bkgrnd
os.popen('ffplay -autoexit ' + '"' + self.current_song_path + '"' \
+ ' -nodisp 2>' + TMP_CURR_SONG + ' &')
''' Get New PID's and Input Sinks for ffplay '''
# Give time for `ffplay` to create pulseaudio sink.
root.after(100) # THIS IS UGLY, root.after is machine dependent!!!
if not self.top2_is_active: return # Play window closed?
new_pid = pid_list("ffplay", "play_start()")
new_sink = sink_list("ffplay", "play_start()")
self.top2_ffplay_pid = list_diff(new_pid, old_pid, "play_start()")
self.top2_ffplay_sink = list_diff(new_sink, old_sink, "play_start()")
I'll answer my own question in case it helps others.
This is the function I wrote which returns original requirements plus current volume:
def sink_master():
    all_lines = os.popen('pactl list sink-inputs').read().splitlines()
    all_sinks = []
    in_sink = False
    in_volume = False
    for line in all_lines:
        if in_sink is False and "Sink Input #" in line:
            this_sink = line.split('#')[1]
            in_sink = True
            continue
        if in_sink is True and in_volume is False and "Volume:" in line:
            this_volume = line.split('/')[1]
            this_volume = this_volume.replace(' ', '')
            this_volume = this_volume.replace('%', '')
            in_volume = True
            continue
        if in_sink is True and in_volume is True and "tion.name =" in line:
            this_name = line.split('=')[1]
            this_name = this_name.replace(' ', '')
            this_name = this_name.replace('"', '')
            in_sink = False
            in_volume = False
            all_sinks.append((this_sink, this_volume, this_name))
            continue
    print(all_sinks)
    return all_sinks
When you run it it returns a list of tuples:
[('1828', '100', 'Firefox'), ('1891', '50', 'ffplay'), ('1907', '100', 'ffplay')]
Each tuple contains:
Input Sink # used by pulseaudio (respected by ffplay)
Current volume (with spaces and % stripped)
Application name (with double quotes " stripped)
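For comparison, the same parse can be done with a single regular expression over one pactl call. This is only an illustrative sketch built from the field names visible in the output above, not code from the answer:

import os
import re

def sink_master_re():
    # one pactl call; one match per "Sink Input" record
    text = os.popen('pactl list sink-inputs').read()
    pattern = re.compile(
        r'Sink Input #(\d+)'                  # sink input index
        r'.*?Volume:[^/]*/\s*(\d+)%'          # first volume percentage
        r'.*?application\.name = "([^"]+)"',  # program name
        re.DOTALL)
    return [m.groups() for m in pattern.finditer(text)]

# sink_master_re() -> [('1828', '100', 'Firefox'), ('1891', '50', 'ffplay'), ...]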

Define a function to generate html pages in python

I'm trying to create several web pages that contain tables from a CSV file, and I tried to define a function to create the HTML pages rather than writing the same code many times.
Reading my CSV file:
infile = open("new.csv", "r")
data = []
for line in infile:
    cols = line.split(",")
    Oposition = cols[0]
    Winner = cols[1]
    Margin = cols[2]
    Ground = cols[3]
    Year = cols[4]
    pair = (Oposition, Winner, Margin, Ground, Year)
    data.append(pair)
infile.close()
So far my code is:
page = """<!DOCTYPE html>
<html>
<head>
<title>abc</title>
<style>
h1 {
    text-align: center;
}
body {
    background-image: url("2014.png");
    background-repeat: no-repeat;
    background-position: right top;
    background-attachment: fixed;
}
</style>
</head>
<body>
<h1>{{heading}}</h1>
{{paragraph}}
<p>Back to main page</p>
<table border="1"><tr><th>Oposition</th><th>Winner</th><th>Margin</th><th>Ground</th><th>Year</th></tr>
"""
out1 = page.format(heading = "2012 Stats", paragraph = "<p>aaaaaaa</p>")
out2 = page.format(heading = "2013 Stats", paragraph = "<p>bbbbb</P>")
out3 = page.format(heading = "2014 Stats", paragraph = "<p>cccc</P>")
out4 = page.format(heading = "2015 Stats", paragraph = "<p>ddddCSK</p>")

def write_html_file(filename, body):
    out = open(filename, "w")
    out.write(body)

write_html_file("2012.html", out1)
write_html_file("2013.html", out2)
write_html_file("2014.html", out3)
write_html_file("2015.html", out4)

for r in data:
    if ''.join(r[4].split()) == "2012":
        Oposition = r[0]
        Winner = r[1]
        Margin = r[2]
        Ground = r[3]
        Year = r[4]
        out1.write("<tr> <td>" + Oposition + '</td><td> ' + Winner + '</td><td> ' + Margin + '</td><td> ' + Ground + ' </td><td>' + Year + " </td></tr>")
    elif ''.join(r[4].split()) == "2013":
        Oposition = r[0]
        Winner = r[1]
        Margin = r[2]
        Ground = r[3]
        Year = r[4]
        out2.write("<tr> <td>" + Oposition + '</td><td> ' + Winner + '</td><td> ' + Margin + '</td><td> ' + Ground + ' </td><td>' + Year + " </td></tr>")
    elif ''.join(r[4].split()) == "2014":
        Oposition = r[0]
        Winner = r[1]
        Margin = r[2]
        Ground = r[3]
        Year = r[4]
        out3.write("<tr> <td>" + Oposition + '</td><td> ' + Winner + '</td><td> ' + Margin + '</td><td> ' + Ground + ' </td><td>' + Year + " </td></tr>")
    elif ''.join(r[4].split()) == "2015":
        Oposition = r[0]
        Winner = r[1]
        Margin = r[2]
        Ground = r[3]
        Year = r[4]
        out4.write("<tr> <td>" + Oposition + '</td><td> ' + Winner + '</td><td> ' + Margin + '</td><td> ' + Ground + ' </td><td>' + Year + " </td></tr>")

def output(a):
    a.write("</table> </body>\n")
    a.write("</html>\n")
    a.close()

output(out1)
output(out2)
output(out3)
output(out4)
I'm trying to make tables for the years 2012, 2013, 2014 and 2015 and create an HTML page containing each of them; I just can't figure it out.
Any help or other options? Much appreciated.
I get an error message saying:
---> 25 page1 = page.format(heading = "2012 Stats", paragraph = "<p>aaaa</p>")
KeyError: '\n text-align'
Short Answer
Python's built-in string format syntax treats anything enclosed in single curly braces as a replacement field. That's why '\n    text-align' is regarded as a key, whereas it was intended to be a CSS style.
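A quick illustration of that failure mode, and of the brace escaping mentioned below (my own minimal example):

template = "h1 { color: red; } {heading}"
# template.format(heading="Stats")  ->  KeyError: ' color'
# because the CSS braces are parsed as a replacement field too

escaped = "h1 {{ color: red; }} {heading}"
print(escaped.format(heading="Stats"))  # h1 { color: red; } Stats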
Solution
You can go ahead and escape your CSS snippets (by doubling the braces), but I would not recommend it, because the HTML text will not be readable and will be difficult to maintain.
Hence, please use a template engine. There are many template engines: Jinja2, Mako, etc. Since I am more familiar with the first one, let me show you how to get page1 working:
from jinja2 import Environment

env = Environment()
page_template = env.from_string(page)
page1 = page_template.render(heading="2012 Stats", paragraph="<p>aaaaaaa</p>")
And you will need to install jinja2:
$ pip install jinja2
Alternative solution
You can use my library pyexcel and pyexcel-text to get a html table rendered for you. The sample code is:
import pyexcel as p
sheet = p.get_sheet(file_name='new.csv')
sheet.colnames = ['Oposition', 'Winner', 'Margin', 'Ground', 'Year']
sheet.name = "2012 Stats"
print(sheet.html)
To run the above code, you need to install these two additional packages:
$ pip install pyexcel pyexcel-text
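Putting the pieces together for the original question, here is a sketch of generating one page per year with the Jinja2 approach above. The grouping logic and the template's loop are my own illustration; data is the list of tuples read from the CSV earlier.

from collections import defaultdict
from jinja2 import Environment

template_text = """<!DOCTYPE html>
<html>
<head><title>{{ heading }}</title></head>
<body>
<h1>{{ heading }}</h1>
<table border="1">
<tr><th>Oposition</th><th>Winner</th><th>Margin</th><th>Ground</th><th>Year</th></tr>
{% for row in rows %}
<tr>{% for cell in row %}<td>{{ cell }}</td>{% endfor %}</tr>
{% endfor %}
</table>
</body>
</html>"""

template = Environment().from_string(template_text)

# group the (Oposition, Winner, Margin, Ground, Year) tuples by year
by_year = defaultdict(list)
for row in data:
    by_year[row[4].strip()].append(row)

for year, rows in by_year.items():
    with open(year + ".html", "w") as out:
        out.write(template.render(heading=year + " Stats", rows=rows))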

How to aggregate values over a bigger than RAM gzip'ed csv file?

For starters, I am new to bioinformatics and especially to programming, but I have built a script that goes through a so-called VCF file (only the individuals are included; one column = one individual) and uses a search string to find out, for every variant (line), whether the individual is homozygous or heterozygous.
This script works, at least on small subsets, but I know it stores everything in memory. I would like to run it on very large zipped files (even whole genomes), but I do not know how to transform it into a script that does everything line by line (because I want to count over whole columns, I just do not see how to solve that).
So the output is 5 things per individual (total variants, number homozygous, number heterozygous, and proportions of homo- and heterozygotes). See the code below:
#!/usr/bin/env python
import re
import gzip

subset_cols = 'subset_cols_chr18.vcf.gz'
#nuc_div = 'nuc_div_chr18.txt'
gz_infile = gzip.GzipFile(subset_cols, "r")
#gz_outfile = gzip.GzipFile(nuc_div, "w")

# make a dictionary of the header line for easy retrieval of elements later on
headers = gz_infile.readline().rstrip().split('\t')
print headers
column_dict = {}
for header in headers:
    column_dict[header] = []
for line in gz_infile:
    columns = line.rstrip().split('\t')
    for i in range(len(columns)):
        c_header = headers[i]
        column_dict[c_header].append(columns[i])
#print column_dict
for key in column_dict:
    number_homozygotes = 0
    number_heterozygotes = 0
    for values in column_dict[key]:
        SearchStr = '(\d)/(\d):\d+,\d+:\d+:\d+:\d+,\d+,\d+'
        #this search string contains the regexp (this regexp was tested)
        Result = re.search(SearchStr, values)
        if Result:
            #here, it will skip the missing genotypes ./.
            variant_one = int(Result.group(1))
            variant_two = int(Result.group(2))
            if variant_one == 0 and variant_two == 0:
                continue
            elif variant_one == variant_two:
                #count +1 in case variant one and two are equal (so 0/0, 1/1, etc.)
                number_homozygotes += 1
            elif variant_one != variant_two:
                #count +1 in case variant one is not equal to variant two (so 1/0, 0/1, etc.)
                number_heterozygotes += 1
    print "%s homozygotes %s" % (number_homozygotes, key)
    print "%s heterozygotes %s" % (number_heterozygotes, key)
    variants = number_homozygotes + number_heterozygotes
    print "%s variants" % variants
    prop_homozygotes = (1.0 * number_homozygotes / variants) * 100
    prop_heterozygotes = (1.0 * number_heterozygotes / variants) * 100
    print "%s %% homozygous %s" % (prop_homozygotes, key)
    print "%s %% heterozygous %s" % (prop_heterozygotes, key)
Any help will be much appreciated so I can go on investigating large datasets,
thank you :)
The VCF file by the way looks something like this:
INDIVIDUAL_1 INDIVIDUAL_2 INDIVIDUAL_3
0/0:9,0:9:24:0,24,221 1/0:5,4:9:25:25,0,26 1/1:0,13:13:33:347,33,0
This is the header line with the individual ID names (I have 33 individuals in total, with more complicated ID tags; I've simplified here), and then I have a lot of these information lines with the same specific pattern. I am only interested in the first part with the slash, hence the regular expression.
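(Since only the part before the first colon matters, the genotype can also be pulled out without the full regular expression. A small sketch, assuming the fields always look like the sample above:)

def parse_genotype(field):
    # returns (allele_one, allele_two), or None for missing genotypes like ./.
    gt = field.split(':', 1)[0]          # e.g. '1/0' from '1/0:5,4:9:25:25,0,26'
    a, _, b = gt.partition('/')
    if a.isdigit() and b.isdigit():
        return int(a), int(b)
    return None

print(parse_genotype('1/0:5,4:9:25:25,0,26'))  # (1, 0)
print(parse_genotype('./.'))                   # None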
Disclosure: I work full-time on the Hail project.
Hi there! Welcome to programming and bioinformatics!
amirouche correctly identifies that you need some sort of "streaming" or "line-by-line" algorithm to handle data that is too large to fit in the RAM of your machine. Unfortunately, if you are limited to Python without libraries, you have to manually chunk the file and handle the parsing of the VCF.
The Hail project is a free, open-source tool for scientists with genetic data too big to fit in RAM, all the way up to too big to fit on one machine (i.e. tens of terabytes of compressed VCF data). Hail can take advantage of all the cores on one machine or all the cores on a cloud of machines. Hail runs on Mac OS X and most flavors of GNU/Linux. Hail exposes a statistical genetics domain-specific language which makes your question much shorter to express.
The Simplest Answer
The most faithful translation of your python code to Hail is this:
/path/to/hail importvcf -f YOUR_FILE.vcf.gz \
  annotatesamples expr -c \
  'sa.nCalled = gs.filter(g => g.isCalled).count(),
   sa.nHom = gs.filter(g => g.isHomRef || g.isHomVar).count(),
   sa.nHet = gs.filter(g => g.isHet).count()' \
  annotatesamples expr -c \
  'sa.pHom = sa.nHom / sa.nCalled,
   sa.pHet = sa.nHet / sa.nCalled' \
  exportsamples -c 'sample = s, sa.*' -o sampleInfo.tsv
I ran the above command on my dual-core laptop on a 2.0GB file:
# ls -alh profile225.vcf.bgz
-rw-r--r-- 1 dking 1594166068 2.0G Aug 25 15:43 profile225.vcf.bgz
# ../hail/build/install/hail/bin/hail importvcf -f profile225.vcf.bgz \
annotatesamples expr -c \
'sa.nCalled = gs.filter(g => g.isCalled).count(),
sa.nHom = gs.filter(g => g.isHomRef || g.isHomVar).count(),
sa.nHet = gs.filter(g => g.isHet).count()' \
annotatesamples expr -c \
'sa.pHom = sa.nHom / sa.nCalled,
sa.pHet = sa.nHet / sa.nCalled' \
exportsamples -c 'sample = s, sa.*' -o sampleInfo.tsv
hail: info: running: importvcf -f profile225.vcf.bgz
[Stage 0:=======================================================> (63 + 2) / 65]hail: info: Coerced sorted dataset
hail: info: running: annotatesamples expr -c 'sa.nCalled = gs.filter(g => g.isCalled).count(),
sa.nHom = gs.filter(g => g.isHomRef || g.isHomVar).count(),
sa.nHet = gs.filter(g => g.isHet).count()'
[Stage 1:========================================================>(64 + 1) / 65]hail: info: running: annotatesamples expr -c 'sa.pHom = sa.nHom / sa.nCalled,
sa.pHet = sa.nHet / sa.nCalled'
hail: info: running: exportsamples -c 'sample = s, sa.*' -o sampleInfo.tsv
hail: info: while importing:
file:/Users/dking/projects/hail-data/profile225.vcf.bgz import clean
hail: info: timing:
importvcf: 34.211s
annotatesamples expr: 6m52.4s
annotatesamples expr: 21.399ms
exportsamples: 121.786ms
total: 7m26.8s
# head sampleInfo.tsv
sample pHomRef pHet nHom nHet nCalled
HG00096 9.49219e-01 5.07815e-02 212325 11359 223684
HG00097 9.28419e-01 7.15807e-02 214035 16502 230537
HG00099 9.27182e-01 7.28184e-02 211619 16620 228239
HG00100 9.19605e-01 8.03948e-02 214554 18757 233311
HG00101 9.28714e-01 7.12865e-02 214283 16448 230731
HG00102 9.24274e-01 7.57260e-02 212095 17377 229472
HG00103 9.36543e-01 6.34566e-02 209944 14225 224169
HG00105 9.29944e-01 7.00564e-02 214153 16133 230286
HG00106 9.25831e-01 7.41687e-02 213805 17128 230933
Wow! Seven minutes for 2GB, that's slow! Unfortunately, this is because VCFs aren't a great format for data analysis!
Optimizing the Storage Format
Let's convert to Hail's optimized storage format, a VDS, and re-run the command:
# ../hail/build/install/hail/bin/hail importvcf -f profile225.vcf.bgz write -o profile225.vds
hail: info: running: importvcf -f profile225.vcf.bgz
[Stage 0:========================================================>(64 + 1) / 65]hail: info: Coerced sorted dataset
hail: info: running: write -o profile225.vds
[Stage 1:> (0 + 4) / 65]
[Stage 1:========================================================>(64 + 1) / 65]
# ../hail/build/install/hail/bin/hail read -i profile225.vds \
annotatesamples expr -c \
'sa.nCalled = gs.filter(g => g.isCalled).count(),
sa.nHom = gs.filter(g => g.isHomRef || g.isHomVar).count(),
sa.nHet = gs.filter(g => g.isHet).count()' \
annotatesamples expr -c \
'sa.pHom = sa.nHom / sa.nCalled,
sa.pHet = sa.nHet / sa.nCalled' \
exportsamples -c 'sample = s, sa.*' -o sampleInfo.tsv
hail: info: running: read -i profile225.vds
[Stage 1:> (0 + 0) / 4]SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
[Stage 1:============================================> (3 + 1) / 4]hail: info: running: annotatesamples expr -c 'sa.nCalled = gs.filter(g => g.isCalled).count(),
sa.nHom = gs.filter(g => g.isHomRef || g.isHomVar).count(),
sa.nHet = gs.filter(g => g.isHet).count()'
[Stage 2:========================================================>(64 + 1) / 65]hail: info: running: annotatesamples expr -c 'sa.pHom = sa.nHom / sa.nCalled,
sa.pHet = sa.nHet / sa.nCalled'
hail: info: running: exportsamples -c 'sample = s, sa.*' -o sampleInfo.tsv
hail: info: timing:
read: 2.969s
annotatesamples expr: 1m20.4s
annotatesamples expr: 21.868ms
exportsamples: 151.829ms
total: 1m23.5s
About five times faster! With regard to larger scale, running the same command on the Google cloud on a VDS representing the full VCF of the 1000 Genomes Project (2,535 whole genomes, about 315GB compressed) took 3m42s using 328 worker cores.
Using a Hail Built-in
Hail also has a sampleqc command which computes most of what you want (and more!):
../hail/build/install/hail/bin/hail read -i profile225.vds \
sampleqc \
annotatesamples expr -c \
'sa.myqc.pHomRef = (sa.qc.nHomRef + sa.qc.nHomVar) / sa.qc.nCalled,
sa.myqc.pHet= sa.qc.nHet / sa.qc.nCalled' \
exportsamples -c 'sample = s, sa.myqc.*, nHom = sa.qc.nHomRef + sa.qc.nHomVar, nHet = sa.qc.nHet, nCalled = sa.qc.nCalled' -o sampleInfo.tsv
hail: info: running: read -i profile225.vds
[Stage 0:> (0 + 0) / 4]SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
[Stage 1:============================================> (3 + 1) / 4]hail: info: running: sampleqc
[Stage 2:========================================================>(64 + 1) / 65]hail: info: running: annotatesamples expr -c 'sa.myqc.pHomRef = (sa.qc.nHomRef + sa.qc.nHomVar) / sa.qc.nCalled,
sa.myqc.pHet= sa.qc.nHet / sa.qc.nCalled'
hail: info: running: exportsamples -c 'sample = s, sa.myqc.*, nHom = sa.qc.nHomRef + sa.qc.nHomVar, nHet = sa.qc.nHet, nCalled = sa.qc.nCalled' -o sampleInfo.tsv
hail: info: timing:
read: 2.928s
sampleqc: 1m27.0s
annotatesamples expr: 229.653ms
exportsamples: 353.942ms
total: 1m30.5s
Installing Hail
Installing Hail is pretty easy and we have docs to help
you. Need more help? You can get
real-time support in the Hail users chat room or, if you prefer forums, the Hail
discourse (both are linked to from the home page, unfortunately I don't have
enough reputation to create real links).
The Near Future
In the very near future (less than one month from today), the Hail team will complete a Python API which will allow you to express the first snippet as:

result = importvcf("YOUR_FILE.vcf.gz")
    .annotatesamples('sa.nCalled = gs.filter(g => g.isCalled).count(),
                      sa.nHom = gs.filter(g => g.isHomRef || g.isHomVar).count(),
                      sa.nHet = gs.filter(g => g.isHet).count()')
    .annotatesamples('sa.pHom = sa.nHom / sa.nCalled,
                      sa.pHet = sa.nHet / sa.nCalled')

for x in result.sampleannotations:
    print("Sample " + x +
          " nCalled " + x.nCalled +
          " nHom " + x.nHom +
          " nHet " + x.nHet +
          " percent Hom " + x.pHom * 100 +
          " percent Het " + x.pHet * 100)

result.sampleannotations.write("sampleInfo.tsv")
EDIT: Added the output of head on the tsv file.
EDIT2: Latest Hail doesn't need biallelic for sampleqc
EDIT3: Note about scaling to the cloud with hundreds of cores
To be able to process a bigger-than-RAM dataset, you need to rework your algorithm to process the data line by line; right now you are processing every column.
But before that, you need a way to stream the rows from the gzip'ed file.
The following Python 3 code does that:
"""https://stackoverflow.com/a/40548567/140837"""
#!/usr/bin/env python3
import zlib
from mmap import PAGESIZE
CHUNKSIZE = PAGESIZE
# This is a generator that yields *decompressed* chunks from
# a gzip file. This is also called a stream or lazy list.
# It's done like so to avoid to have the whole file into memory
# Read more about Python generators to understand how it works.
# cf. `yield` keyword.
def gzip_to_chunks(filename):
decompressor = zlib.decompressobj(zlib.MAX_WBITS + 16)
with open(filename, 'rb') as f:
chunk = f.read(CHUNKSIZE)
while chunk:
out = decompressor.decompress(chunk)
yield out
chunk = f.read(CHUNKSIZE)
out = decompressor.flush()
yield out
# Again the following is a generator (see the `yield` keyword).
# What id does is iterate over an *iterable* of strings and yields
# rows from the file
# (hint: `gzip_to_chunks(filename)` returns a generator of strings)
# (hint: a generator is also an iterable)
# You can verify that by calling `chunks_to_rows` with a list of
# strings, where every strings is a chunk of the VCF file.
# (hint: a list is also an iterable)
# inline doc follows
def chunks_to_rows(chunks):
row = b'' # we will add the chars making a single row to this variable
for chunk in chunks: # iterate over the strings/chuncks yielded by gzip_to_chunks
for char in chunk: # iterate over all chars from the string
if char == b'\n'[0]: # hey! this is the end of the row!
yield row.decode('utf8').split('\t') # the row is complete, yield!
row = b'' # start a new row
else:
row += int.to_bytes(char, 1, byteorder='big') # Otherwise we are in the middle of the row
# at this point the program has read all the chunk
# at this point the program has read all the file without loading it fully in memory at once
# That said, there's maybe still something in row
if row:
yield row.decode('utf-8').split('\t') # yield the very last row if any
for e in chunks_to_rows(gzip_to_chunks('conceptnet-assertions-5.6.0.csv.gz')):
uid, relation, start, end, metadata = e
print(start, relation, end)
EDIT: reworked the answer to make it work on ConceptNet's gzipped TSV file.
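For what it's worth, the standard library can also stream a gzipped file line by line directly. Here is a sketch of the original per-column tallies done that way; the simplified regular expression (which only checks the leading genotype) and the running-totals dict are my own illustration:

import gzip
import re

genotype_re = re.compile(r'(\d)/(\d):')

with gzip.open('subset_cols_chr18.vcf.gz', 'rt') as infile:
    headers = infile.readline().rstrip().split('\t')
    counts = {h: [0, 0] for h in headers}   # [n_hom, n_het] running totals per column
    for line in infile:                     # streams one decompressed row at a time
        for header, field in zip(headers, line.rstrip().split('\t')):
            m = genotype_re.match(field)
            if not m:                       # missing genotype, e.g. ./.
                continue
            a, b = int(m.group(1)), int(m.group(2))
            if a == b == 0:
                continue
            if a == b:
                counts[header][0] += 1      # homozygote
            else:
                counts[header][1] += 1      # heterozygote

for header, (n_hom, n_het) in counts.items():
    total = n_hom + n_het
    if total:
        print(header, n_hom, n_het, 100.0 * n_hom / total, 100.0 * n_het / total)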

How to make progress bar for a function in python [duplicate]

I wrote a simple console app to upload and download files from an FTP server using the ftplib.
I would like the app to show some visualization of its download/upload progress for the user; each time a data chunk is downloaded, I would like it to provide a progress update, even if it's just a numeric representation like a percentage.
Importantly, I want to avoid erasing all the text that's been printed to the console in previous lines (i.e. I don't want to "clear" the entire terminal while printing the updated progress).
This seems a fairly common task – how can I go about making a progress bar or similar visualization that outputs to my console while preserving prior program output?
Python 3
A Simple, Customizable Progress Bar
Here's an aggregate of many of the answers below that I use regularly (no imports required).
Note: All code in this answer was created for Python 3; see end of answer to use this code with Python 2.
# Print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

Sample Usage

import time

# A List of Items
items = list(range(0, 57))
l = len(items)

# Initial call to print 0% progress
printProgressBar(0, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
for i, item in enumerate(items):
    # Do stuff...
    time.sleep(0.1)
    # Update Progress Bar
    printProgressBar(i + 1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
Sample Output
Progress: |█████████████████████████████████████████████-----| 90.0% Complete
Update
There was discussion in the comments regarding an option that allows the progress bar to adjust dynamically to the terminal window width. While I don't recommend this, here's a gist that implements this feature (and notes the caveats).
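For reference, here is a minimal sketch of that dynamic-width idea, using shutil.get_terminal_size from the standard library rather than the gist (my own illustration; resizing mid-run can still leave leftover characters on the line):

import shutil
import sys

def draw_bar(iteration, total, prefix='Progress:'):
    # size the bar to whatever width the terminal currently reports
    columns = shutil.get_terminal_size().columns
    length = max(1, columns - len(prefix) - 10)   # leave room for ' |...| 100%'
    filled = int(length * iteration // total)
    bar = '█' * filled + '-' * (length - filled)
    sys.stdout.write('\r%s |%s| %3d%%' % (prefix, bar, 100 * iteration // total))
    sys.stdout.flush()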
Single-Call Version of The Above
A comment below referenced a nice answer posted in response to a similar question. I liked the ease of use it demonstrated and wrote a similar one, but opted to leave out the import of the sys module while adding in some of the features of the original printProgressBar function above.
Some benefits of this approach over the original function above include the elimination of an initial call to the function to print the progress bar at 0% and the use of enumerate becoming optional (i.e. it is no longer explicitly required to make the function work).
def progressBar(iterable, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iterable    - Required  : iterable object (Iterable)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    total = len(iterable)
    # Progress Bar Printing Function
    def printProgressBar (iteration):
        percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
        filledLength = int(length * iteration // total)
        bar = fill * filledLength + '-' * (length - filledLength)
        print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Initial Call
    printProgressBar(0)
    # Update Progress Bar
    for i, item in enumerate(iterable):
        yield item
        printProgressBar(i + 1)
    # Print New Line on Complete
    print()

Sample Usage

import time

# A List of Items
items = list(range(0, 57))

# A Nicer, Single-Call Usage
for item in progressBar(items, prefix = 'Progress:', suffix = 'Complete', length = 50):
    # Do stuff...
    time.sleep(0.1)
Sample Output
Progress: |█████████████████████████████████████████████-----| 90.0% Complete
Python 2
To use the above functions in Python 2, set the encoding to UTF-8 at the top of your script:
# -*- coding: utf-8 -*-
And replace the Python 3 string formatting in this line:
print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
With Python 2 string formatting:
print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = printEnd)
Writing '\r' will move the cursor back to the beginning of the line.
This displays a percentage counter:
import time
import sys
for i in range(100):
time.sleep(1)
sys.stdout.write("\r%d%%" % i)
sys.stdout.flush()
tqdm: add a progress meter to your loops in a second:
>>> import time
>>> from tqdm import tqdm
>>> for i in tqdm(range(100)):
...     time.sleep(1)
...
|###-------| 35/100 35% [elapsed: 00:35 left: 01:05, 1.00 iters/sec]
Write a \r to the console. That is a "carriage return", which causes any text printed after it to start at the beginning of the line. Something like:

def update_progress(progress):
    print '\r[{0}] {1}%'.format('#'*(progress/10), progress)

which will give you something like: [ ########## ] 100%
It is less than 10 lines of code.
The gist here: https://gist.github.com/vladignatyev/06860ec2040cb497f0f3
import sys

def progress(count, total, suffix=''):
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))
    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)
    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', suffix))
    sys.stdout.flush()  # As suggested by Rom Ruben
Try the click library written by the Mozart of Python, Armin Ronacher.
$ pip install click # both 2 and 3 compatible
To create a simple progress bar:
import click

with click.progressbar(range(1000000)) as bar:
    for i in bar:
        pass
This is what it looks like:
# [###-------------------------------] 9% 00:01:14
Customize to your heart's content:

import click, sys

with click.progressbar(range(100000), file=sys.stderr, show_pos=True, width=70,
                       bar_template='(_(_)=%(bar)sD(_(_| %(info)s',
                       fill_char='=', empty_char=' ') as bar:
    for i in bar:
        pass
Custom look:
(_(_)===================================D(_(_| 100000/100000 00:00:02
There are even more options, see the API docs:
click.progressbar(iterable=None, length=None, label=None, show_eta=True, show_percent=None, show_pos=False, item_show_func=None, fill_char='#', empty_char='-', bar_template='%(label)s [%(bar)s] %(info)s', info_sep=' ', width=36, file=None, color=None)
I realize I'm late to the game, but here's a slightly Yum-style (Red Hat) one I wrote (not going for 100% accuracy here, but if you're using a progress bar for that level of accuracy, then you're WRONG anyway):
import sys

def cli_progress_test(end_val, bar_length=20):
    for i in xrange(0, end_val):
        percent = float(i) / end_val
        hashes = '#' * int(round(percent * bar_length))
        spaces = ' ' * (bar_length - len(hashes))
        sys.stdout.write("\rPercent: [{0}] {1}%".format(hashes + spaces, int(round(percent * 100))))
        sys.stdout.flush()
Should produce something looking like this:
Percent: [############## ] 69%
... where the brackets stay stationary and only the hashes increase.
This might work better as a decorator. For another day...
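For what that could look like, here's a small sketch of the decorator idea (my own illustration, not part of the answer above): wrap a per-item worker function so that each call advances the bar.

import sys
import functools

def with_progress(total, bar_length=20):
    # hypothetical decorator: redraws a bar every time the wrapped function is called
    def decorate(func):
        state = {'done': 0}
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            state['done'] += 1
            filled = int(bar_length * state['done'] / total)
            sys.stdout.write('\rPercent: [%s%s] %d%%' % ('#' * filled,
                                                         ' ' * (bar_length - filled),
                                                         100 * state['done'] // total))
            sys.stdout.flush()
            if state['done'] == total:
                sys.stdout.write('\n')
            return result
        return wrapper
    return decorate

@with_progress(total=5)
def process(item):
    pass  # do the real per-item work here

for item in range(5):
    process(item)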
Check this library: clint
it has a lot of features including a progress bar:
from time import sleep
from random import random
from clint.textui import progress

if __name__ == '__main__':
    for i in progress.bar(range(100)):
        sleep(random() * 0.2)
    for i in progress.dots(range(100)):
        sleep(random() * 0.2)
this link provides a quick overview of its features
Here's a nice example of a progressbar written in Python: http://nadiana.com/animated-terminal-progress-bar-in-python
But if you want to write it yourself, you could use the curses module to make things easier :)
[edit]
Perhaps "easier" is not the word for curses, but if you want to create a full-blown CUI then curses takes care of a lot of stuff for you.
[edit]
Since the old link is dead I have put up my own version of a Python Progressbar, get it here: https://github.com/WoLpH/python-progressbar
import time, sys

for i in range(100 + 1):
    time.sleep(0.1)
    sys.stdout.write(('='*i)+(''*(100-i))+("\r [ %d"%i+"% ] "))
    sys.stdout.flush()
output
[ 29% ] ===================
Install tqdm (pip install tqdm) and use it as follows:

import time
from tqdm import tqdm

for i in tqdm(range(1000)):
    time.sleep(0.01)
That's a 10 seconds progress bar that'll output something like this:
47%|██████████████████▊ | 470/1000 [00:04<00:05, 98.61it/s]
and, just to add to the pile, here's an object you can use:
Add the following to a new file progressbar.py
import sys

class ProgressBar(object):
    CHAR_ON = '='
    CHAR_OFF = ' '

    def __init__(self, end=100, length=65):
        self._end = end
        self._length = length
        self._chars = None
        self._value = 0

    @property
    def value(self):
        return self._value

    @value.setter
    def value(self, value):
        self._value = max(0, min(value, self._end))
        if self._chars != (c := int(self._length * (self._value / self._end))):
            self._chars = c
            sys.stdout.write("\r {:3n}% [{}{}]".format(
                int((self._value / self._end) * 100.0),
                self.CHAR_ON * int(self._chars),
                self.CHAR_OFF * int(self._length - self._chars),
            ))
            sys.stdout.flush()

    def __enter__(self):
        self.value = 0
        return self

    def __exit__(self, *args, **kwargs):
        sys.stdout.write('\n')
Can be included in your program with:
import time
from progressbar import ProgressBar

count = 150
print("starting things:")
with ProgressBar(count) as bar:
    for i in range(count + 1):
        bar.value += 1
        time.sleep(0.01)
print("done")
print("done")
Results in:
starting things:
100% [=================================================================]
done
This may be "over the top", but is handy when used frequently.
Run this at the Python command line (not in any IDE or development environment):
>>> import threading
>>> for i in range(50+1):
...     threading._sleep(0.5)
...     print "\r%3d" % i, ('='*i)+('-'*(50-i)),
Works fine on my Windows system.
Try to install this package: pip install progressbar2:

import time
import progressbar

for i in progressbar.progressbar(range(100)):
    time.sleep(0.02)
progressbar2 on GitHub: https://github.com/WoLpH/python-progressbar
http://code.activestate.com/recipes/168639-progress-bar-class/ (2002)
http://code.activestate.com/recipes/299207-console-text-progress-indicator-class/ (2004)
http://pypi.python.org/pypi/progressbar (2006)
And a lot of tutorials waiting to be googled.
Based on the above answers and other similar questions about CLI progress bars, I think I got a general common answer to all of them. Check it at https://stackoverflow.com/a/15860757/2254146
In summary, the code is this:
import time, sys

# update_progress() : Displays or updates a console progress bar
## Accepts a float between 0 and 1. Any int will be converted to a float.
## A value under 0 represents a 'halt'.
## A value at 1 or bigger represents 100%
def update_progress(progress):
    barLength = 10  # Modify this to change the length of the progress bar
    status = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
        status = "error: progress var must be float\r\n"
    if progress < 0:
        progress = 0
        status = "Halt...\r\n"
    if progress >= 1:
        progress = 1
        status = "Done...\r\n"
    block = int(round(barLength * progress))
    text = "\rPercent: [{0}] {1}% {2}".format("#"*block + "-"*(barLength-block), progress*100, status)
    sys.stdout.write(text)
    sys.stdout.flush()
Looks like
Percent: [##########] 99.0%
I am using progress from reddit. I like it because it can print progress for every item in one line, and it shouldn't erase printouts from the program.
Edit: fixed link
A very simple solution is to put this code into your loop:
Put this at the top of your file:
import sys
Put this in the body of your loop:
sys.stdout.write("-") # prints a dash for each iteration of loop
sys.stdout.flush() # ensures bar is displayed incrementally
I recommend using tqdm - https://pypi.python.org/pypi/tqdm - which makes it simple to turn any iterable or process into a progress bar, and handles all messing about with terminals needed.
From the documentation: "tqdm can easily support callbacks/hooks and manual updates. Here’s an example with urllib"
import urllib
from tqdm import tqdm

def my_hook(t):
    """
    Wraps tqdm instance. Don't forget to close() or __exit__()
    the tqdm instance once you're done with it (easiest using `with` syntax).

    Example
    -------
    >>> with tqdm(...) as t:
    ...     reporthook = my_hook(t)
    ...     urllib.urlretrieve(..., reporthook=reporthook)
    """
    last_b = [0]

    def inner(b=1, bsize=1, tsize=None):
        """
        b : int, optional
            Number of blocks just transferred [default: 1].
        bsize : int, optional
            Size of each block (in tqdm units) [default: 1].
        tsize : int, optional
            Total size (in tqdm units). If [default: None] remains unchanged.
        """
        if tsize is not None:
            t.total = tsize
        t.update((b - last_b[0]) * bsize)
        last_b[0] = b
    return inner

eg_link = 'http://www.doc.ic.ac.uk/~cod11/matryoshka.zip'
with tqdm(unit='B', unit_scale=True, miniters=1,
          desc=eg_link.split('/')[-1]) as t:  # all optional kwargs
    urllib.urlretrieve(eg_link, filename='/dev/null',
                       reporthook=my_hook(t), data=None)
import sys
import time

def progressbar():
    for i in range(100):
        time.sleep(1)
        sys.stdout.write("%i\r" % i)
        sys.stdout.flush()

progressbar()

NOTE: if you run this in the interactive interpreter you get extra numbers printed out.
lol, I just wrote a whole thingy for this. Here's the code; keep in mind you can't use Unicode when doing block ASCII, I use cp437:

import os
import time

def load(left_side, right_side, length, delay):
    # 'delay' was named 'time' in the original, which shadowed the time module
    x = 0
    y = ""
    print "\r"
    while x < length:
        space = length - len(y)
        space = " " * space
        z = left_side + y + space + right_side
        print "\r", z,
        y += "█"
        time.sleep(delay)
        x += 1
    os.system('cls')  # the original called an undefined cls(); clearing the screen on Windows
and you call it like so
print "loading something awesome"
load("|", "|", 10, .01)
so it looks like this
loading something awesome
|█████ |
With the great advice above, I worked out a progress bar. However, I would like to point out some shortcomings:

1. Every time the progress bar is flushed, it starts on a new line:

print('\r[{0}]{1}%'.format('#' * progress * 10, progress))

like this:
[] 0%
[#]10%
[##]20%
[###]30%

2. The square bracket ']' and the percent number on the right side shift right as the '###' gets longer.
3. An error will occur if the expression 'progress / 10' cannot return an integer.

The following code fixes the problems above:

def update_progress(progress, total):
    print('\r[{0:10}]{1:>2}%'.format('#' * int(progress * 10 / total), progress), end='')
For python 3:
def progress_bar(current_value, total):
    increments = 50
    percentual = ((current_value / total) * 100)
    i = int(percentual // (100 / increments))
    text = "\r[{0: <{1}}] {2}%".format('=' * i, increments, percentual)
    print(text, end="\n" if percentual == 100 else "")
function from Greenstick for 2.7:
import sys

def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='#'):
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print '\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix),
    sys.stdout.flush()
    # Print New Line on Complete
    if iteration == total:
        print
Code for python terminal progress bar
import sys
import time

max_length = 5
at_length = max_length
empty = "-"
used = "%"

bar = empty * max_length

for i in range(0, max_length):
    at_length -= 1

    # setting empty and full spots
    bar = used * i
    bar = bar + empty * at_length

    # \r is carriage return (sets cursor position in terminal to start of line)
    # \0 character escape
    sys.stdout.write("[{}]\0\r".format(bar))
    sys.stdout.flush()

    # do your stuff here instead of time.sleep
    time.sleep(1)

sys.stdout.write("\n")
sys.stdout.flush()
The python module progressbar is a nice choice.
Here is my typical code:
import time
import progressbar

widgets = [
    ' ', progressbar.Percentage(),
    ' ', progressbar.SimpleProgress(format='(%(value_s)s of %(max_value_s)s)'),
    ' ', progressbar.Bar('>', fill='.'),
    ' ', progressbar.ETA(format_finished='- %(seconds)s -', format='ETA: %(seconds)s', ),
    ' - ', progressbar.DynamicMessage('loss'),
    ' - ', progressbar.DynamicMessage('error'),
    ' '
]
bar = progressbar.ProgressBar(redirect_stdout=True, widgets=widgets)
bar.start(100)
for i in range(100):
    time.sleep(0.1)
    bar.update(i + 1, loss=i / 100., error=i)
bar.finish()
I wrote a simple progress bar:

def bar(total, current, length=10, prefix="", filler="#", space=" ", oncomp="", border="[]", suffix=""):
    if len(border) != 2:
        print("parameter 'border' must include exactly 2 symbols!")
        return None
    print(prefix + border[0] + (filler * int(current / total * length) +
          (space * (length - int(current / total * length)))) + border[1], suffix, "\r", end="")
    if total == current:
        if oncomp:
            print(prefix + border[0] + space * int(((length - len(oncomp)) / 2)) +
                  oncomp + space * int(((length - len(oncomp)) / 2)) + border[1], suffix)
        if not oncomp:
            print(prefix + border[0] + (filler * int(current / total * length) +
                  (space * (length - int(current / total * length)))) + border[1], suffix)

As you can see, it has: length of the bar, prefix and suffix, filler, space, text in the bar at 100% (oncomp), and borders.

Here is an example:
from time import sleep, time

start_time = time()
for i in range(10):
    pref = str((i + 1) * 10) + "% "
    complete_text = "done in %s sec" % str(round(time() - start_time))
    sleep(1)
    bar(10, i + 1, length=20, prefix=pref, oncomp=complete_text)
out in progress:
30% [###### ]
out on complete:
100% [ done in 9 sec ]
Putting together some of the ideas I found here, and adding estimated time left:
import datetime, sys

start = datetime.datetime.now()
process_duration_samples = []
average_samples = 5

def print_progress_bar(iteration, total):
    global start, process_duration_samples
    end = datetime.datetime.now()
    process_duration = end - start
    start = end  # time each step, rather than the time since the program started
    if len(process_duration_samples) == 0:
        process_duration_samples = [process_duration] * average_samples
    process_duration_samples = process_duration_samples[1:] + [process_duration]
    average_process_duration = sum(process_duration_samples, datetime.timedelta()) / len(process_duration_samples)
    remaining_steps = total - iteration
    remaining_time_estimation = remaining_steps * average_process_duration
    bars_string = int(float(iteration) / float(total) * 20.)
    sys.stdout.write(
        "\r[%-20s] %d%% (%s/%s) Estimated time left: %s" % (
            '=' * bars_string,
            float(iteration) / float(total) * 100,
            iteration,
            total,
            remaining_time_estimation
        )
    )
    sys.stdout.flush()
    if iteration + 1 == total:
        print()

# Sample usage
for i in range(0, 300):
    print_progress_bar(i, 300)
Well here is code that works and I tested it before posting:
import sys

def prg(prog, fillchar, emptchar):
    fillt = 0
    emptt = 20
    if prog < 100 and prog > 0:
        prog2 = prog / 5
        fillt = fillt + prog2
        emptt = emptt - prog2
        sys.stdout.write("\r[" + str(fillchar) * fillt + str(emptchar) * emptt + "]" + str(prog) + "%")
        sys.stdout.flush()
    elif prog >= 100:
        prog = 100
        prog2 = prog / 5
        fillt = fillt + prog2
        emptt = emptt - prog2
        sys.stdout.write("\r[" + str(fillchar) * fillt + str(emptchar) * emptt + "]" + str(prog) + "%" + "\nDone!")
        sys.stdout.flush()
    elif prog < 0:
        prog = 0
        prog2 = prog / 5
        fillt = fillt + prog2
        emptt = emptt - prog2
        sys.stdout.write("\r[" + str(fillchar) * fillt + str(emptchar) * emptt + "]" + str(prog) + "%" + "\nHalted!")
        sys.stdout.flush()
Pros:
20 character bar (1 character for every 5 (number wise))
Custom fill characters
Custom empty characters
Halt (any number below 0)
Done (100 and any number above 100)
Progress count (0-100 (below and above used for special functions))
Percentage number next to bar, and it's a single line
Cons:
Supports integers only (it can be modified to support floats by making the division an integer division: just change prog2 = prog/5 to prog2 = int(prog/5))
Here's my Python 3 solution:
import time

for i in range(100):
    time.sleep(1)
    s = "{}% Complete".format(i)
    print(s, end=len(s) * '\b')

'\b' is a backspace character, one for each character in your string.
This does not work within the Windows cmd window.

Geotagging JPEGs with pyexiv2

I am geotagging JPEGs using the pyexiv2 Python module using code I found in another SO answer (see: What is the best way to geotag jpeg-images using Python?) and I have a question about the GPSTag value.
The code given in the answer has the following lines:
exiv_image["Exif.Image.GPSTag"] = 654
exiv_image["Exif.GPSInfo.GPSMapDatum"] = "WGS-84"
exiv_image["Exif.GPSInfo.GPSVersionID"] = '2 0 0 0'
I have looked at the Exiv2 documentation and found descriptions of GPSTag, GPSMapDatum, and GPSVersionID but am still confused about the value for GPSTag.
From the documentation it says:
A pointer to the GPS Info IFD. The Interoperability structure of the GPS Info IFD, like that of Exif IFD, has no image data.
This description does not really explain how to determine what value to use and I have not been able to find a better description of GPSTag online.
So my questions are:
Given a new image, how do you determine the value of Exif.Image.GPSTag?
Why is the code sample using a value of 654 (this may be answered by question one)?
Thanks for your help.
The best way to geotag photos using pyexiv2 is definitely with my program, GottenGeography ;-)
But seriously though, if you want to access GPS data from pyexiv2, the code looks like this:
GPS = 'Exif.GPSInfo.GPS'

try:
    self.latitude = dms_to_decimal(
        *self.exif[GPS + 'Latitude'].value +
        [self.exif[GPS + 'LatitudeRef'].value]
    )
    self.longitude = dms_to_decimal(
        *self.exif[GPS + 'Longitude'].value +
        [self.exif[GPS + 'LongitudeRef'].value]
    )
except KeyError:
    pass

try:
    self.altitude = float(self.exif[GPS + 'Altitude'].value)
    if int(self.exif[GPS + 'AltitudeRef'].value) > 0:
        self.altitude *= -1
except KeyError:
    pass
And writing looks like this:
self.exif[GPS + 'AltitudeRef'] = '0' if self.altitude >= 0 else '1'
self.exif[GPS + 'Altitude'] = Fraction(self.altitude)
self.exif[GPS + 'Latitude'] = decimal_to_dms(self.latitude)
self.exif[GPS + 'LatitudeRef'] = 'N' if self.latitude >= 0 else 'S'
self.exif[GPS + 'Longitude'] = decimal_to_dms(self.longitude)
self.exif[GPS + 'LongitudeRef'] = 'E' if self.longitude >= 0 else 'W'
self.exif[GPS + 'MapDatum'] = 'WGS-84'
With these support functions:
import fractions
import math

class Fraction(fractions.Fraction):
    """Only create Fractions from floats.

    >>> Fraction(0.3)
    Fraction(3, 10)
    >>> Fraction(1.1)
    Fraction(11, 10)
    """
    def __new__(cls, value, ignore=None):
        """Should be compatible with Python 2.6, though untested."""
        return fractions.Fraction.from_float(value).limit_denominator(99999)

def dms_to_decimal(degrees, minutes, seconds, sign=' '):
    """Convert degrees, minutes, seconds into decimal degrees.

    >>> dms_to_decimal(10, 10, 10)
    10.169444444444444
    >>> dms_to_decimal(8, 9, 10, 'S')
    -8.152777777777779
    """
    return (-1 if sign[0] in 'SWsw' else 1) * (
        float(degrees) +
        float(minutes) / 60 +
        float(seconds) / 3600
    )

def decimal_to_dms(decimal):
    """Convert decimal degrees into degrees, minutes, seconds.

    >>> decimal_to_dms(50.445891)
    [Fraction(50, 1), Fraction(26, 1), Fraction(113019, 2500)]
    >>> decimal_to_dms(-125.976893)
    [Fraction(125, 1), Fraction(58, 1), Fraction(92037, 2500)]
    """
    remainder, degrees = math.modf(abs(decimal))
    remainder, minutes = math.modf(remainder * 60)
    return [Fraction(n) for n in (degrees, minutes, remainder * 60)]
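As a standalone usage sketch (my own, assuming the 0.3-style pyexiv2 API with ImageMetadata; the file name and coordinates are placeholders), writing a position with the helpers above might look like:

import pyexiv2

metadata = pyexiv2.ImageMetadata('photo.jpg')   # placeholder file name
metadata.read()

latitude, longitude = 50.445891, -125.976893   # placeholder coordinates
metadata['Exif.GPSInfo.GPSLatitude'] = decimal_to_dms(latitude)
metadata['Exif.GPSInfo.GPSLatitudeRef'] = 'N' if latitude >= 0 else 'S'
metadata['Exif.GPSInfo.GPSLongitude'] = decimal_to_dms(longitude)
metadata['Exif.GPSInfo.GPSLongitudeRef'] = 'E' if longitude >= 0 else 'W'
metadata['Exif.GPSInfo.GPSMapDatum'] = 'WGS-84'
metadata.write()

As far as I can tell, exiv2 recomputes IFD offsets such as Exif.Image.GPSTag itself when it writes, so the 654 from the question shouldn't need to be set by hand.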
Although I am currently working on an alternative to pyexiv2 that uses GObject introspection to access the exiv2 library much more directly, called GExiv2, and I would love to have some feedback on it. Both gexiv2 and pyexiv2 are wrappers around the same exiv2 library, but the difference is that pyexiv2 is a very large project with lots of glue, only works in Python, and is on the brink of abandonment*; whereas gexiv2 is light and nimble, accessible from any programming language, and is well maintained thanks to its use by Shotwell.
Hope this helps!
* pyexiv2's author, Olivier Tilloy, has asked me for help with maintainership as he no longer has much time
My version, a little lengthy...
from fractions import Fraction
import pyexiv2

try:
    metadata = pyexiv2.metadata.ImageMetadata(image_file)
    metadata.read()
    thumb = metadata.exif_thumbnail
    try:
        latitude = metadata.__getitem__("Exif.GPSInfo.GPSLatitude")
        latitudeRef = metadata.__getitem__("Exif.GPSInfo.GPSLatitudeRef")
        longitude = metadata.__getitem__("Exif.GPSInfo.GPSLongitude")
        longitudeRef = metadata.__getitem__("Exif.GPSInfo.GPSLongitudeRef")
        latitude = str(latitude).split("=")[1][1:-1].split(" ")
        latitude = map(lambda f: str(float(Fraction(f))), latitude)
        latitude = latitude[0] + u"\u00b0" + latitude[1] + "'" + latitude[2] + '"' + " " + str(latitudeRef).split("=")[1][1:-1]
        longitude = str(longitude).split("=")[1][1:-1].split(" ")
        longitude = map(lambda f: str(float(Fraction(f))), longitude)
        longitude = longitude[0] + u"\u00b0" + longitude[1] + "'" + longitude[2] + '"' + " " + str(longitudeRef).split("=")[1][1:-1]
        latitude_value = dms_to_decimal(*metadata.__getitem__("Exif.GPSInfo.GPSLatitude").value + [metadata.__getitem__("Exif.GPSInfo.GPSLatitudeRef").value])
        longitude_value = dms_to_decimal(*metadata.__getitem__("Exif.GPSInfo.GPSLongitude").value + [metadata.__getitem__("Exif.GPSInfo.GPSLongitudeRef").value])
        print "--- GPS ---"
        print "Coordinates: " + latitude + ", " + longitude
        print "Coordinates: " + str(latitude_value) + ", " + str(longitude_value)
        print "--- GPS ---"
    except Exception, e:
        print "No GPS Information!"
        #print e
    # Check for thumbnail
    if thumb.data == "":
        print "No thumbnail!"
except Exception, e:
    print "Error processing image..."
    print e
