Extracting frequencies from a wav file python - python

I am familiar with python but new to numpy, so please pardon me if I am wrong.
I am trying to read a .wav file containing multiple frequencies (separated by silence). So far I've been able to read the values and find the parts of the file where there is sound. Then I am trying to compute the Discrete Fourier Transform and calculate the frequencies from it (ref: how to extract frequency associated with fft values in python)
However, I'm getting an error:
index 46392 is out of bounds for axis 0 with size 25
Here's my code:
import wave
import struct
import numpy as np
def isSilence(windowPosition):
    """Return True when the window starting at windowPosition is silent.

    The window is read from the module-level ``sound`` sequence and spans
    ``sound[windowPosition:windowPosition + windowSize + 1]`` (``windowSize``
    is also module-level).  The window counts as silent when the mean of its
    squared samples is at most 0.0001.

    #param windowPosition: index of the first sample of the window
    #return: True if the window is silent (or empty), False otherwise
    """
    window = sound[windowPosition:windowPosition + windowSize + 1]
    sampleCount = len(window)
    if sampleCount == 0:
        # Nothing left to inspect past the end of the data: treat as silence.
        return True
    # Work in floats so the mean is a true division even under Python 2,
    # where dividing two integers floors the result.
    energy = sum(float(x) * float(x) for x in window)
    # Divide by the number of samples actually summed; the slice holds
    # windowSize + 1 samples, and fewer near the end of the array.
    return energy / sampleCount <= 0.0001
#read from wav file
sound_file = wave.open('test.wav', 'r')
file_length = sound_file.getnframes()
data = sound_file.readframes(file_length)
sound_file.close()
#data = struct.unpack("<h", data)
data = struct.unpack('{n}h'.format(n=file_length), data)
sound = np.array(data)
#sound is now a list of values
#detect silence and notes
i=0
windowSize = 2205
windowPosition = 0
listOfLists = []
listOfLists.append([])
maxVal = len(sound) - windowSize
while True:
if windowPosition >= maxVal:
break
if not isSilence(windowPosition):
while not isSilence(windowPosition):
listOfLists[i].append(sound[windowPosition:windowPosition+ windowSize+1])
windowPosition += windowSize
listOfLists.append([]) #empty list
i += 1
windowPosition += windowSize
frequencies = []
#Calculating the frequency of each detected note by using DFT
for signal in listOfLists:
if not signal:
break
w = np.fft.fft(signal)
freqs = np.fft.fftfreq(len(w))
l = len(signal)
#imax = index of first peak in w
imax = np.argmax(np.abs(w))
fs = freqs[imax]
freq = imax*fs/l
frequencies.append(freq)
print frequencies
Edit: Here is the traceback:
Traceback (most recent call last):
File "final.py", line 61, in <module>
fs = freqs[imax]
IndexError: index 46392 is out of bounds for axis 0 with size 21

The problem was that I assumed listOfLists was a list of lists, but it was actually a list of lists of lists. The line:
listOfLists[i].append(sound[windowPosition:windowPosition+ windowSize+1])
was appending a whole list every time, whereas I assumed it was appending the individual elements to the existing list.
For instance, if listOfLists was:
[ [1,2,3] ]
Then, listOfLists[0].append([4,5,6]) would give:
[ [ [1,2,3],[4,5,6] ] ]
But I was expecting:
[ [1,2,3,4,5,6] ]
Replacing the problematic line with the code below worked for me:
for v in sound[windowPosition:windowPosition+windowSize+1]:
listOfLists[i].append(v)

Related

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed. Works for first two loops

Let me start by saying that I know this error message has posts about it, but I'm not sure what's wrong with my code. The block of code works just fine for the first two loops, but then fails. I've even tried removing the first two loops from the data to rule out issues in the 3rd loop, but no luck. I did have it set to print out the unsorted temporary list, and it just prints an empty array for the 3rd loop.
Sorry for the wall of comments in my code, but I'd rather have each line commented than cause confusion over what I'm trying to accomplish.
TL;DR: I'm trying to find and remove outliers from a list of data, but only for groups of entries that have the same number in column 0.
Pastebin with data
import numpy as np, csv, multiprocessing as mp, mysql.connector as msc, pandas as pd
import datetime
#Declare unsorted data array
d_us = []
#Declare temporary array for use in loop
tmp = []
#Declare sorted data array
d = []
#Declare Sum variable
tot = 0
#Declare Mean variable
m = 0
#declare sorted final array
sort = []
#Declare number of STDs
t = 1
#Declare Standard Deviation variable
std = 0
#Declare z-score variable
z_score
#Timestamp for output files
nts = datetime.datetime.now().timestamp()
#Create output file
with open(f"calib_temp-{nts}.csv", 'w') as ctw:
pass
#Read data from CSV
with open("test.csv", 'r', newline='') as drh:
fr_rh = csv.reader(drh, delimiter=',')
for row in fr_rh:
#append data to unsorted array
d_us.append([float(row[0]),float(row[1])])
#Sort array by first column
d = np.sort(d_us)
#Calculate the range of the data
l = round((d[-1][0] - d[0][0]) * 10)
#Declare the starting value
s = d[0][0]
#Declare the ending value
e = d[-1][0]
#Set the while loop counter
n = d[0][0]
#Iterate through data
while n <= e:
#Create array with difference column
for row in d:
if row[0] == n:
diff = round(row[0] - row[1], 1)
tmp.append([row[0],row[1],diff])
#Convert to numpy array
tmp = np.array(tmp)
#Sort numpy array
sort = tmp[np.argsort(tmp[:,2])]
#Calculate sum of differences
for row in tmp:
tot = tot + row[2]
#Calculate mean
m = np.mean(tot)
#Calculate Standard Deviation
std = np.std(tmp[:,2])
#Calculate outliers and write to output file
for y in tmp:
z_score = (y[2] - m)/std
if np.abs(z_score) > t:
with open(f"calib_temp-{nts}.csv", 'a', newline='') as ct:
c = csv.writer(ct, delimiter = ',')
c.writerow([y[0],y[1]])
#Reset Variables
tot = 0
m = 0
n = n + 0.1
tmp = []
std = 0
z_score = 0
Do this before the loop:
#Create output file
ct = open(f"calib_temp-{nts}.csv", 'w')
c = csv.writer(ct, delimiter = ',')
Then change the loop to this. Note that I have moved your initializations to the top of the loop, so you don't need to initialize them twice. Note the if tmp: line, which solves the numpy exception.
#Iterate through data
while n <= e:
tot = 0
m = 0
tmp = []
std = 0
z_score = 0
#Create array with difference column
for row in d:
if row[0] == n:
diff = round(row[0] - row[1], 1)
tmp.append([row[0],row[1],diff])
#Sort numpy array
if tmp:
#Convert to numpy array
tmp = np.array(tmp)
sort = tmp[np.argsort(tmp[:,2])]
#Calculate sum of differences
for row in tmp:
tot = tot + row[2]
#Calculate mean
m = np.mean(tot)
#Calculate Standard Deviation
std = np.std(tmp[:,2])
#Calculate outliers and write to output file
for y in tmp:
z_score = (y[2] - m)/std
if np.abs(z_score) > t:
c.writerow([y[0],y[1]])
#Reset Variables
n = n + 0.1

Why is my interpolation not working properly in my function?

I have a fairly long script that processes spectra, and along the way I need to interpolate some points. I used to have all this code written line by line without any functions, and it all worked properly, but now I'm converting it into two large functions so that I can call it on other models more easily in the future. Below is my code. (I have more code after the last line shown here that plots some things, but that's not relevant to my issue: I've tested this with a bunch of print lines and learned that the problem arises when I call the interpolation function inside my process function.)
import re
import numpy as np
import scipy.interpolate
# Required files and lists
filename = 'bpass_spectra.txt' # number of columns = 4
extinctionfile = 'ExtinctionLawPoints.txt' # R_V = 4.0
datalist = []
if filename == 'bpass_spectra.txt':
filetype = 4
else:
filetype = 1
if extinctionfile == 'ExtinctionLawPoints.txt':
R_V = 4.0
else:
R_V = 1.0 #to be determined
# Constants
h = 4.1357e-15 # Planck's constant [eV s]
c = float(3e8) # speed of light [m/s]
# Inputs
beta = 2.0 # power used in extinction law
R = 1.0 # star formation rate [Msun/yr]
z = 1.0 # redshift
M_gas = 1.0 # mass of gas
M_halo = 2e41 # mass of dark matter halo
# Read spectra file
f = open(filename, 'r')
rawlines = f.readlines()
met = re.findall('Z\s=\s(\d*\.\d+)', rawlines[0])
del rawlines[0]
for i in range(len(rawlines)):
newlist = rawlines[i].split(' ')
datalist.append(newlist)
# Read extinction curve data file
rawpoints = open(extinctionfile, 'r').readlines()
# Build an interpolating function from the extinction-curve lines in
# rawpoints and evaluate it at Elist[i].
#   R_V       -- the table is only parsed when this equals 4.0
#   rawpoints -- list of text lines read from the extinction-curve file
#   Elist     -- sequence of values at which the curve can be evaluated
#   i         -- index into Elist of the value to evaluate
# Reads the names c and h, which are not parameters (module-level constants).
def interpolate(R_V, rawpoints, Elist, i):
pointslist = []
if R_V == 4.0:
# NOTE(review): this loop reuses the name i, so once it has run (for a
# non-empty rawpoints) i no longer holds the index passed in by the caller.
for i in range(len(rawpoints)):
# Split each line into fields on whitespace that is followed by a
# non-whitespace character.
newlst = re.split('(?!\S)\s(?=\S)|(?!\S)\s+(?=\S)', rawpoints[i])
pointslist.append(newlst)
# Drop the first three parsed lines (presumably header lines -- confirm
# against the data file).
pointslist = pointslist[3:]
# Field 0 and field 4 of every remaining line, converted to float.
lambdalist = [float(item[0]) for item in pointslist]
k_abslist = [float(item[4]) for item in pointslist]
# x values for the interpolation: (c*h) divided by (field 0 * 1e-6).
xvallist = [(c*h)/(lamb*1e-6) for lamb in lambdalist]
k_interp = scipy.interpolate.interp1d(xvallist, k_abslist)
return k_interp(Elist[i])
# Processing function
# Fill klist with one value per spectrum entry, using either a power law or
# the supplied interpolate callable, then print two of its elements.
#   interpolate -- callable invoked as interpolate(R_V, rawpoints, Elist, i)
#   filetype    -- compared against 4 below
#   datalist    -- rows whose fields 0 and 1 are converted to float
#   beta        -- exponent used in the power-law branch
#   met         -- sequence whose first element is converted to the metallicity
# R, z, M_gas and M_halo are not used in the portion of the body shown here.
# R_V, rawpoints, h, c and np are not parameters; they are read from the
# enclosing (module) scope.
def process(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met):
speclist = []
if filetype == 4:
metallicity = float(met[0])
Elist = [float(item[0]) for item in datalist]
speclambdalist = [h*c*1e9/E for E in Elist]
met1list = [float(item[1]) for item in datalist]
speclist.extend(met1list)
# Pre-sized result lists, one slot per spectrum entry.
klist, Tlist = [None]*len(speclist), [None]*len(speclist)
# Both branches produce a list with one identical value repeated per entry;
# only the formula depends on which side of 0.0052 the metallicity falls.
if metallicity > 0.0052:
DGRlist = [50.0*np.exp(-2.21)*metallicity]*len(speclist) # dust to gas ratio
elif metallicity <= 0.0052:
DGRlist = [((50.0*metallicity)**3.15)*np.exp(-0.96)]*len(speclist)
for i in range(len(speclist)):
if Elist[i] <= 4.1357e-3: # frequencies <= 10^12 Hz
klist[i] = 0.1*(float(Elist[i])/(1000.0*h))**beta # extinction law [cm^2/g]
elif Elist[i] > 4.1357e-3: # frequencies > 10^12 Hz
klist[i] = interpolate(R_V, rawpoints, Elist, i) # interpolated function's value at Elist[i]
# Python 2 print statement; indexing klist[1000] requires at least 1001 entries.
print "KLIST (INTERPOLATION) ELEMENTS 0 AND 1000:", klist[0], klist[1000]
# Bare return: the function hands nothing back to the caller.
return
The output from the print line is KLIST (INTERPOLATION) ELEMENTS 0 AND 1000: 52167.31734159269 52167.31734159269.
When I run my old code without functions, I print klist[0] and klist[1000] like I do here and get different values for each. In this new code, I get back two values that are the same from this line. This shouldn't be the case, so it must not be interpolating correctly inside my function (maybe it's not performing it on each point correctly in the loop?). Does anyone have any insight? It would be unreasonable to post my entire code with all the used text files here (they're very large), so I'm not expecting anyone to run it, but rather examine how I use and call my functions.
Edit: Below is the original version of my code up to the interpolation point without the functions (which works).
import re
import numpy as np
import scipy.interpolate
filename = 'bpass_spectra.txt'
extinctionfile = 'ExtinctionLawPoints.txt' # from R_V = 4.0
pointslist = []
datalist = []
speclist = []
# Constants
h = 4.1357e-15 # Planck's constant [eV s]
c = float(3e8) # speed of light [m/s]
# Read spectra file
f = open(filename, 'r')
rawspectra = f.readlines()
met = re.findall('Z\s=\s(\d*\.\d+)', rawspectra[0])
del rawspectra[0]
for i in range(len(rawspectra)):
newlist = rawspectra[i].split(' ')
datalist.append(newlist)
# Read extinction curve data file
rawpoints = open(extinctionfile, 'r').readlines()
for i in range(len(rawpoints)):
newlst = re.split('(?!\S)\s(?=\S)|(?!\S)\s+(?=\S)', rawpoints[i])
pointslist.append(newlst)
pointslist = pointslist[3:]
lambdalist = [float(item[0]) for item in pointslist]
k_abslist = [float(item[4]) for item in pointslist]
xvallist = [(c*h)/(lamb*1e-6) for lamb in lambdalist]
k_interp = scipy.interpolate.interp1d(xvallist, k_abslist)
# Create new lists
Elist = [float(item[0]) for item in datalist]
speclambdalist = [h*c*1e9/E for E in Elist]
z1list = [float(item[1]) for item in datalist]
speclist.extend(z1list)
met = met[0]
klist = [None]*len(speclist)
Loutlist = [None]*len(speclist)
Tlist = [None]*len(speclist)
# Define parameters
b = 2.0 # power used in extinction law (beta)
R = 1.0 # star formation ratw [Msun/yr]
z = 1.0 # redshift
Mgas = 1.0 # mass of gas
Mhalo = 2e41 # mass of dark matter halo
if float(met) > 0.0052:
DGRlist = [50.0*np.exp(-2.21)*float(met)]*len(speclist)
elif float(met) <= 0.0052:
DGRlist = [((50.0*float(met))**3.15)*np.exp(-0.96)]*len(speclist)
for i in range(len(speclist)):
if float(Elist[i]) <= 4.1357e-3: # frequencies <= 10^12 Hz
klist[i] = 0.1*(float(Elist[i])/(1000.0*h))**b # extinction law [cm^2/g]
elif float(Elist[i]) > 4.1357e-3: # frequencies > 10^12 Hz
klist[i] = k_interp(Elist[i]) # interpolated function's value at Elist[i]
print "KLIST (INTERPOLATION) ELEMENTS 0 AND 1000:", klist[0], klist[1000]
The output from this print line is KLIST (INTERPOLATION) ELEMENTS 0 AND 1000 7779.275435560996 58253.589270674354.
You are passing i as an argument to interpolate, and then also using i as the loop variable within interpolate. Once the for i in range(len(rawpoints)) loop in interpolate has run, i holds the loop's final value, len(rawpoints)-1, regardless of what was passed in. The interpolate function will therefore always return the same value, k_interp(Elist[i]), which is equivalent to k_interp(Elist[len(rawpoints)-1]). You will need to either use a different variable within your loop (e.g. for not_i in range(len(rawpoints))), or use a different name for the index argument into Elist. Consider the following change to interpolate:
def interpolate(R_V, rawpoints, Elist, j):
    """Evaluate the tabulated extinction curve at Elist[j].

    The table is parsed from ``rawpoints``: every line is split into
    whitespace-separated fields, the first three lines are discarded, and
    fields 0 and 4 of the remaining lines become the interpolation table.
    Uses the module-level constants ``c`` and ``h`` to convert field 0 into
    the x values of the table.

    #param R_V: must be 4.0, the only value for which a table is built
    #param rawpoints: list of text lines read from the extinction-curve file
    #param Elist: sequence of values at which the curve can be evaluated
    #param j: index into Elist of the value to evaluate
    #return: the interpolated value at Elist[j]
    #raise ValueError: if R_V is not 4.0
    """
    if R_V != 4.0:
        # No table is built for any other R_V, so fail with a clear message
        # instead of tripping over an undefined k_interp further down.
        raise ValueError('interpolate only supports R_V == 4.0, got %r' % (R_V,))

    # Raw string: \S and \s are regex classes, not string escape sequences.
    # Compiled once rather than re-parsed for every line.
    fieldSplitter = re.compile(r'(?!\S)\s(?=\S)|(?!\S)\s+(?=\S)')
    pointslist = [fieldSplitter.split(line) for line in rawpoints][3:]

    lambdalist = [float(item[0]) for item in pointslist]
    k_abslist = [float(item[4]) for item in pointslist]
    xvallist = [(c*h)/(lamb*1e-6) for lamb in lambdalist]
    k_interp = scipy.interpolate.interp1d(xvallist, k_abslist)
    return k_interp(Elist[j])

TypeError: a bytes-like object is required, not 'str', laspy

I am new to programming and want to convert a LAS file into a grid file using laspy. It keeps giving the error
"TypeError: a bytes-like object is required, not 'str'".
I know fmt is a string, so I tried fmt = '%1.2f'.encode() to turn it into bytes, but got the same error.
from laspy.file import File
import numpy as np
source = "/655-7878.las"
target = "/lidar.asc"
cell = 1.0
NODATA = 0
las = File(source, mode = "r")
#xyz min and max
min = las.header.min
max = las.header.max
#Get the x axis distance
xdist = max[0] - min[0]
#Get the y axis distance
ydist = max[1] - min[1]
#Number of columns for our grid
cols = int((xdist)/cell)
#Number of rows for our grid
rows = int((ydist)/cell)
cols += 1
rows += 1
#Track how many elevation
#values we aggregate
count = np.zeros((rows, cols)).astype(np.float32)
#Aggregate elevation values
zsum = np.zeros((rows, cols)).astype(np.float32)
#Y resolution is negative
ycell = -1 * cell
#Project x,y values to grid
projx =(las.x -min[0]) / cell
projy = (las.y - min[1])/ ycell
#Cas to integers and clip for use as index
ix = projx.astype(np.int32)
iy = projy.astype(np.int32)
#Loop through x,y,z arrays, add to grid shape and aggregate values for averaging
for x,y,z in np.nditer([ix, iy, las.z]):
count[y, x] +=1
zsum[y, x]+=z
# Change 0 values to 1 to avoid numpy warnings and NaN values in array
nonzero = np.where(count>0, count, 1)
#Average our z values
zavg = zsum/nonzero
#Interpolate 0 values in array to avoid any holes in the grid
mean = np.ones((rows, cols)) * np.mean(zavg)
left = np.roll(zavg, -1,1)
lavg = np.where(left>0, left, mean)
right = np.roll(zavg, 1, 1)
ravg = np.where(right>0, right, mean)
interpolate = (lavg + ravg)/2
fill = np.where(zavg>0, zavg, interpolate)
#Create ASCII DEM header
header = "ncols %s\n" % fill.shape[1]
header += "nrows %s\n" % fill.shape[0]
header += "xllcorner %s\n" % min[0]
header += "yllcorner %s\n" % min[1]
header += "cellsize %s\n" % cell
header += "NODATA_value %s\n" % NODATA
#Open the output file, add the header, save the array
with open(target, "wb") as f:
f.write(header)
# The fmt string ensures we output floats
#That have at least one number but only two decimal places
np.savetxt(f, fill, fmt = '%1.2f')`
Can someone please help me to sort it out.
f.write(bytes(header, 'UTF-8'))
If you are using Python 3 and open a file in binary mode (with 'b'), you can't write strings to the file, only raw binary data. If you have a string you want to write to the file, you should either open the file in text mode (without 'b') or convert the string to a bytearray() first.
so writing to file would look like this:
with open(target, "wb") as f:
f.write(bytearray(header,'utf-8'))

sum( array, 1) giving 'nan' in Python

First of all, I know nan stands for "not a number", but I am not sure how I am getting an invalid number in my code. I am using a Python script that reads a list of vectors (x, y, z) from a file and converts it to a long array of values; if I don't use the file and instead make a for loop that generates random numbers, I don't get any nans.
After this I am using Newton's law of gravity, F = GMm/r^2, to calculate the positions of stars, and that data then gets sent through a socket server to the C# visualization software that I developed for watching simulations. Unfortunately, my Python script that does the calculating has been nothing but troublesome to get working.
poslist = []
plist = []
mlist = []
lineList = []
coords = []
with open("Hyades Vectors.txt", "r") as text_file:
content = text_file.readlines()
#remove /n
for i in range(len(content)):
for char in "\n":
line = content[i].replace(char,"")
lineList.append(line)
lines = array(lineList)
#split " " within each line
for i in range(len(lines)):
coords.append(lines[i].split(" "))
coords = array(coords)
#convert coords string to integer
for i in range(len(coords)):
x = np.float(coords[i,0])
y = np.float(coords[i,1])
z = np.float(coords[i,2])
poslist.append((x,y,z))
pos = array(poslist)
quite often it is sending nan's after the second time going through this loop
vcm = sum(p)/sum(m) #velocity of centre mass
p = p-m*vcm #make total initial momentum equal zero
Myr = 8.4
dt = 1
pos = pos-(p/m)*(dt/2.) #initial half-step
finished = False
while not finished: # or NBodyVis.Oppenned() == False
r = pos-pos[:,newaxis] #all pairs of star-to-star vectors
for n in range(Nstars):
r[n,n] = 1e6 #otherwise the self-forces are infinite
rmag = sqrt(sum(square(r),-1)) #star-to star scalar distances
F = G*m*m[:,newaxis]*r/rmag[:,:,newaxis]**3 # all force pairs
for n in range(Nstars):
F[n,n] = 5 # no self-forces
p = p+sum(F,1)*dt #sum(F,1) is where i get a nan!!!!!!!!!!!!!!!!
pos -= (p/m)*dt
if Time <= 0:
finished = True
else:
Time -= 1
What am I doing wrong? I don't fully understand nans, but I can't have them: if my visualization software reads a nan, nothing will appear in the visuals. I know that the problem is in sum(F,1); I printed everything step by step until I got a nan, and that is where it appears. But how can summing produce a nan? Here is part of the text file that I am reading:
51.48855 4.74229 -85.24499
121.87149 11.44572 -140.79644
59.81673 68.8417 18.76767
31.95567 37.23007 6.59515
29.81066 34.76371 6.18374
41.35333 49.52844 14.12314
32.10481 38.46982 7.96628
48.13239 60.4019 37.45474
26.37793 34.53385 15.9054
76.02468 103.98826 25.96607
51.52072 71.17618 32.09829
please help

Looping back to the next column in a csv

I'm trying to get a script to run on each individual column of a csv file. I've figured out how to tell Python which column I would like to run the script on, but I want it to analyze column one, output the results, then move to column two, and so on through the file. What I want is something like an "if etc goto etc" command. I've found how to do this with simple one-liners, but I have a larger script. Any help would be great, as I'm sure I'm just missing something. For example, could I loop back to where I define my data (h=data) but tell it to choose the next column? Here is my script.
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
import pylab
from scipy import linalg
import sys
import scipy.interpolate as interpolate
import scipy.optimize as optimize
a=raw_input("Data file name? ") #Name of the data file including the directory, must be .csv
datafile = open(a, 'r')
data = []
for row in datafile:
data.append(row.strip().split(',')) #opening and organizing the csv file
print('Data points= ', len(data))
print data
c=raw_input("Is there a header row? y/n?") #Remove header line if present
if c is ('y'):
del data[0]
data2=data
print('Raw data= ', data2)
else:
print('Raw data= ', data)
'''
#if I wanted to select a column
b=input("What column to analyze?") #Asks what column depth data is in
if b is 1:
h=[[rowa[i] for rowa in data] for i in range(1)] #first row
'''
h=data # all columns
g=reduce(lambda x,y: x+y,h) #prepares data for calculations
a=map(float, g)
a.sort()
print ('Organized data= ',a)
def GRLC(values):
    '''
    Calculate the Gini index, Gini coefficient, Robin Hood index and the
    points of the Lorenz curve, based on the instructions given in
    www.peterrosenmai.com/lorenz-curve-graphing-tool-and-gini-coefficient-calculator
    The Lorenz curve is returned as two parallel lists of percentages,
    [[x1, x2, ...], [y1, y2, ...]].
    #param values: Non-empty list of numeric values, in any order
    #return: [Gini index, Gini coefficient, Robin Hood index, [Lorenz curve]]
    '''
    n = len(values)
    assert(n > 0), 'Empty list of values'

    # Cumulative totals of the values in ascending order, with a leading 0 so
    # that cumm[k] is the sum of the k smallest values. A running total keeps
    # this linear instead of re-summing a growing slice on every step.
    cumm = [0]
    runningTotal = 0
    for value in sorted(values):
        runningTotal += value
        cumm.append(runningTotal)
    grandTotal = float(cumm[n])

    # Lorenz points: x is the share of the population, y the share of the
    # total held by that part of the population, both in percent.
    LorenzPoints = [[], []]
    sumYs = 0            # Sum of all y values
    robinHoodIdx = -1    # Largest gap x - y seen along the curve
    for k in range(n + 1):
        x = 100.0 * k/n
        y = 100.0 * (cumm[k]/grandTotal)
        LorenzPoints[0].append(x)
        LorenzPoints[1].append(y)
        sumYs += y
        maxX_Y = x - y
        if maxX_Y > robinHoodIdx:
            robinHoodIdx = maxX_Y

    giniIdx = 100 + (100 - 2 * sumYs)/n    # Gini index
    return [giniIdx, giniIdx/100, robinHoodIdx, LorenzPoints]
result = GRLC(a)
print 'Gini Index', result[0]
print 'Gini Coefficient', result[1]
print 'Robin Hood Index', result[2]
I'm ignoring all of that GRLC function and just solving the looping question. Give this a try. It uses while True: to loop forever (you can just break out by ending the program; Ctrl+C in Windows, depends on OS). Just load the data from the csv once then each time it loops, you can re-build some variables. If you have questions please ask. Also, I didn't test it as I don't have all the NumPy packages installed :)
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
import pylab
from scipy import linalg
import sys
import scipy.interpolate as interpolate
import scipy.optimize as optimize
def GRLC(values):
    '''
    Calculate the Gini index, Gini coefficient, Robin Hood index and the
    points of the Lorenz curve, based on the instructions given in
    www.peterrosenmai.com/lorenz-curve-graphing-tool-and-gini-coefficient-calculator
    The Lorenz curve is returned as two parallel lists of percentages,
    [[x1, x2, ...], [y1, y2, ...]].
    #param values: Non-empty list of numeric values, in any order
    #return: [Gini index, Gini coefficient, Robin Hood index, [Lorenz curve]]
    '''
    n = len(values)
    assert(n > 0), 'Empty list of values'

    # Cumulative totals of the values in ascending order, with a leading 0 so
    # that cumm[k] is the sum of the k smallest values. A running total keeps
    # this linear instead of re-summing a growing slice on every step.
    cumm = [0]
    runningTotal = 0
    for value in sorted(values):
        runningTotal += value
        cumm.append(runningTotal)
    grandTotal = float(cumm[n])

    # Lorenz points: x is the share of the population, y the share of the
    # total held by that part of the population, both in percent.
    LorenzPoints = [[], []]
    sumYs = 0            # Sum of all y values
    robinHoodIdx = -1    # Largest gap x - y seen along the curve
    for k in range(n + 1):
        x = 100.0 * k/n
        y = 100.0 * (cumm[k]/grandTotal)
        LorenzPoints[0].append(x)
        LorenzPoints[1].append(y)
        sumYs += y
        maxX_Y = x - y
        if maxX_Y > robinHoodIdx:
            robinHoodIdx = maxX_Y

    giniIdx = 100 + (100 - 2 * sumYs)/n    # Gini index
    return [giniIdx, giniIdx/100, robinHoodIdx, LorenzPoints]
#Name of the data file including the directory, must be .csv
a=raw_input("Data file name? ")
datafile = open(a.strip(), 'r')
data = []
#opening and organizing the csv file
for row in datafile:
data.append(row.strip().split(','))
#Remove header line if present
c=raw_input("Is there a header row? y/n?")
if c.strip().lower() == ('y'):
del data[0]
while True :
#if I want the first column, that's index 0.
b=raw_input("What column to analyze?")
# Validate that the column input data is correct here. Otherwise it might be out of range, etc.
# Maybe try this. You might want more smarts in there, depending on your intent:
b = int(b.strip())
# If you expect the user to inpt "2" to mean the second column, you're going to use index 1 (list indexes are 0 based)
h=[[rowa[b-1] for rowa in data] for i in range(1)]
# prepares data for calculations
g=reduce(lambda x,y: x+y,h)
a=map(float, g)
a.sort()
print ('Organized data= ',a)
result = GRLC(a)
print 'Gini Index', result[0]
print 'Gini Coefficient', result[1]
print 'Robin Hood Index', result[2]

Categories