Bootstrap - Confidence Interval Calculation - python

I am trying to implement bootstrap to estimate CI for statistics. Here is the code I have written
import numpy as np
import numpy.random as npr
import pylab
def bootstrap(data, num_samples, statistic, alpha):
"""Returns bootstrap estimate of 100.0*(1-alpha) CI for statistic."""
num_samples = len(data)
idx = npr.randint(min(data), max(data), num_samples)
samples = data[idx]
stat = np.sort(statistic(samples, 1))
return (stat[int((alpha/2.0)*num_samples)],
stat[int((1-alpha/2.0)*num_samples)])
X,Y = np.loadtxt('data/ABC.txt',
unpack =True,
delimiter =',',
skiprows = 1)
The text file contains 2 columns and I need to calculate the confidence interval for both columns.
My first thought is to convert the columns into an array and calculate the high and low 95% CI. I was thinking of something like this:
data = np.array([X,Y])
low, high = bootstrap(X, len(data), np.mean, 0.05)
low1, high1 = bootstrap(Y, len(data), np.mean, 0.05)
But I am not sure if this the correct way of calculating confidence interval. Can someone help me with this?
Thank you in advance!

Instead of :
idx = npr.randint(min(data), max(data), num_samples)
Use:
idx=np.random.choice(data,size=len(data),replace=True)

Related

How can I get the start and end indices of a note in a volume graph?

I am trying to make a program, that tells me when a note has been pressed.
I have the following notes exported as a .wav file (The C Major Scale 4 times with different rhythms, dynamics and in different octaves):
I can get the volumes of my sound file using the following code:
from scipy.io import wavfile
def get_volume(file):
sr, data = wavfile.read(file)
if data.ndim > 1:
data = data[:, 0]
return data
volumes = get_volume("FILE")
Here are some information about the output:
Max: 27851
Min: -25664
Mean: -0.7569383391943734
A Sample from the array: [ -7987 -8615 -8983 -9107 -9019 -8750 -8324 -7752 -7033 -6156
-5115 -3920 -2610 -1245 106 1377 2520 3515 4364 5077
5659 6113 6441 6639 6708 6662 6518 6288 5962 5525
4963 4265 3420 2418 1264 -27 -1429 -2901 -4388 -5814
-7101 -8186 -9028 -9614 -9955 -10077 -10012 -9785 -9401 -8846]
And here is what I get when I plot the volumes array (x is the index, y is the volume):
I want to get the indices of the start and end of the notes like the ones in the image (Did it by hand not accurate):
When I looked at the data I realized, that it is a 1d array and I also noticed, that when a note gets louder or quiter it is not smooth. It is like a ZigZag, but there is still a trend. So basically I can't just get the gradients (slope) of each point. So I though about grouping notes into batches and getting the average gradient there and thus doing the calculations with it, like so:
def get_average_gradient(arr):
# Calculates average gradient
return sum([i - (sum(arr) / len(arr)) for i in arr]) / len(arr)
def get_note_start_end(arr_size, batch_size, arr):
# Finds start and end indices
ranges = []
curr_range = [0]
prev_slope = curr_slope = "NO SLOPE"
has_ended = False
for i, j in enumerate(arr):
if j > 0:
curr_slope = "INCREASING"
elif j < 0:
curr_slope = "DECREASING"
else:
curr_slope = "NO SLOPE"
if prev_slope == "DECREASING" and not has_ended:
if i == len(arr) - 1 or arr[i + 1] < 0:
if curr_slope != "DECREASING":
curr_range.append((i + 1) * batch_size + batch_size)
ranges.append(curr_range)
curr_range = [(i + 1) * batch_size + batch_size + 1]
has_ended = True
if has_ended and curr_slope == "INCREASING":
has_ended = False
prev_slope = curr_slope
ranges[-1][-1] = arr_size - 1
return ranges
def get_notes(batch_size, arr):
# Gets the gradients of the batches
out = []
for i in range(0, len(arr), batch_size):
if i + batch_size > len(arr):
gradient = get_average_gradient(arr[i:])
else:
gradient = get_average_gradient(arr[i: i+batch_size])
# print(gradient, i)
out.append(gradient)
return get_note_start_end(len(arr), batch_size, out)
notes = get_notes(128, volumes)
The problem with this is, that if the batch size is too small, then it returns the indices of small peaks, which aren't a note on their own. If the batch size is too big then the program misses the start and end indices.
I also tried to get the notes, by using the silence.
Here is the code I used:
from pydub import AudioSegment, silence
audio = intro = AudioSegment.from_wav("C - Major - Test.wav")
dBFS = audio.dBFS
notes = silence.detect_nonsilent(audio, min_silence_len=50, silence_thresh=dBFS-10)
This worked the best, but it still wasn't good enough. Here is what I got:
It some notes pretty well, but it wasn't able to identify notes accurately if the notes themselves didn't become very quite before a different one was played (Like in the second scale and in the fourth scale).
I have been thinking about this problem for days and I have basically tried most if not all of the good(?) ideas I had. I am new to analysing audio files. Maybe I am using the wrong data to do what I want to do. Maybe I need to use the frequency data (I tried getting it, but couldn't make sense of it)
Frequency code:
from scipy.fft import *
from scipy.io import wavfile
import matplotlib.pyplot as plt
def get_freq(file, start_time, end_time):
sr, data = wavfile.read(file)
if data.ndim > 1:
data = data[:, 0]
else:
pass
# Fourier Transform
N = len(data)
yf = rfft(data)
xf = rfftfreq(N, 1 / sr)
return xf, yf
FILE = "C - Major - Test.wav"
plt.plot(*get_freq(FILE, 0, 10))
plt.show()
And the frequency graph:
And here is the .wav file:
https://drive.google.com/file/d/1CERH-eovu20uhGoV1_O3B2Ph-4-uXpiP/view?usp=sharing
Any help is appreciated :)
think this is what you need:
first you convert negative numbers into positive ones and smooth the line to eliminate noise, to find the lower peaks yo work with the negative values.
from scipy.io import wavfile
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
import numpy as np
from scipy.signal import savgol_filter
def get_volume(file):
sr, data = wavfile.read(file)
if data.ndim > 1:
data = data[:, 0]
return data
v1 = abs(get_volume("test.wav"))
#Smooth the curve
volumes=savgol_filter(v1,10000 , 3)
lv=volumes*-1
#find peaks
peaks,_ = find_peaks(volumes,distance=8000,prominence=300)
lpeaks,_= find_peaks(lv,distance=8000,prominence=300)
# plot them
plt.plot(volumes)
plt.plot(peaks,volumes[peaks],"x")
plt.plot(lpeaks,volumes[lpeaks],"o")
plt.plot(np.zeros_like(volumes), "--", color="gray")
plt.show()
Plot with your test file, x marks the high peaks and o the lower peaks
This article presents two python libraries (Aubio, librosa) to achieve what you need and includes examples of how to use them: How to Use Python to Detect Music Onsets by Lynn Zheng

Generate values in separate dataframe

I trying to generate random data with Pandas.
Data is need to be stored in two columns. The first column needs to contain categorical variables (from Stratum_1 until Stratum_19) each of these stratums can contain a random number of values.
Second column needs to have data in the range between 1 to 180000000 with a standard deviation of 453210, a mean of 170000, and a number of rows 100000.
I try to
categorical = {'name': ['Stratum_1','Stratum_2','Stratum_3','Stratum_4','Stratum_5','Stratum_6','Stratum_7','Stratum_8','Stratum_9',
'Stratum_10','Stratum_11','Stratum_12','Stratum_13','Stratum_14','Stratum_15','Stratum_16','Stratum_17','Stratum_18','Stratum_19']}
desired_mean = 170000
desired_std_dev = 453210
df = pd.DataFrame(np.random.randint(0,180000000,size=(100000, 1)),columns=list('1'))
I tried with this code above but don't know how to implement categorical and numerical values together with desired mean and standard deviation. So can anybody help how to solve this problem and generate?
I decided to use the gamma distribution to generate your desired sample after thinking that the given parameters are not suitable for the normal distribution.
Code
import numpy as np
import pandas as pd
# desired parameters
n_rows = 100000
lower, upper = 1, 180000000
mu, sigma = 170000, 453210
# amount of shift
delta = lower
# parameters for the gamma distribution
shape = ((mu - delta) / sigma) ** 2
scale = sigma**2 / (mu - delta)
# Create a dataframe
categories = {'name': [f'Stratum_{i}' for i in range(1, 19 + 1)]}
df = pd.DataFrame(categories).sample(n=n_rows, replace=True).reset_index(drop=True)
# Generate samples along with your desired parameters
generator = np.random.default_rng()
while True:
df['value'] = generator.gamma(shape=shape, scale=scale, size=n_rows) + delta
if df.value.max() <= upper:
break
# Show statistics
print(df.describe())
Output
value
count
100,000
mean
169,403 (Target: 170,000)
std
449,668 (Target: 453,210)
min
1
25%
39.4267
50%
5529.28
75%
105,748
max
9.45114e+06
Try:
import numpy as np
categorical = {'name': ['Stratum_1','Stratum_2','Stratum_3','Stratum_4','Stratum_5','Stratum_6','Stratum_7','Stratum_8','Stratum_9',
'Stratum_10','Stratum_11','Stratum_12','Stratum_13','Stratum_14','Stratum_15','Stratum_16','Stratum_17','Stratum_18','Stratum_19']}
desired_mean = 170000
desired_std_dev = 453210
df = pd.DataFrame({'num':np.random.normal(170000, 453210,size=(300000, 1)).reshape(-1), 'cat':np.random.choice(categorical['name'], 300000)})
df[(0<df['num'])&(df['num']<180000000)].sample(100000)
result:

Gaussian fit to noisey data using curve_fit

I am having issues fitting a Gaussian to my data. Currently the output for my code looks like
this. Where orange is the data, blue is the gaussian fit and green is an in-built gaussian fitter however I do not wish to use it as it never quite begins at zero and I do not have access to the code. I would like my output to look something like this where the drawn in red is the gaussian fit.
I have tried reading about the curve_fit documentation however at best I get a fit that looks like this which fits over all the data, however, this is undesirable as I am only interested in the central peak which is my main issue - I do not know how to get curve_fit to fit a gaussian on the central peak like in the second image.
I have considered using a weights function like np.random.choice() or looking at the data file's maximum value and then looking at the second derivative at either side of the central peak to see where there are changes in inflection but am unsure how best to implement this.
How would I best go about this? I have done a lot of googling but cant quite get my head around changing curve_fit to suit my needs.
Cheers for any pointers!
This is a data file.
https://drive.google.com/file/d/1qrAkD74U6L46GoGnvMiUHdPuLEToS6Pv/view?usp=sharing
This is my code:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from matplotlib.pyplot import figure
plt.close('all')
fpathB4 = 'E:\.1. Work - Current Projects + Old Projects\Current Projects\PF 4MHz Laser System\.8. 1050 SYSTEM\AC traces'
fpath = fpathB4.replace('\\','/') + ('/')
filename = '300'
with open(fpath+filename) as f:
dataraw = f.readlines()
FWHM = dataraw[8].split(':')[1].split()[0]
FWHM = np.float(FWHM)
print("##### For AC file -", filename, "#####")
print("Auto-co guess -", FWHM, "ps")
pulseduration = FWHM/np.sqrt(2)
pulseduration = str(pulseduration)
dataraw = dataraw[15:]
print("Pulse duration -", pulseduration, "ps" + "\n")
time = np.array([])
acf1 = np.array([]) #### DATA
fit = np.array([]) #### Gaussian fit
for k in dataraw:
data = k.split()
time = np.append(time, np.float(data[0]))
acf1= np.append(acf1, np.float(data[1]))
fit = np.append(fit, np.float(data[2]))
n = len(time)
y = acf1.copy()
x = time.copy()
mean = sum(x*y)/n
sigma = sum(y*(x-mean)**2)/n
def gaus(x,a,x0,sigma):
return a*np.exp(-(x-x0)**2/(2*sigma**2))
popt,pcov = curve_fit(gaus,x,y,p0=[1,mean,sigma])
plt.plot(x,gaus(x,*popt)/np.max(gaus(x,*popt)))
figure(num=1, figsize=(8, 3), dpi=96, facecolor='w', edgecolor='k') # figsize = (length, height)
plt.plot(time, acf1/np.max(acf1), label = 'Data - ' + filename, linewidth = 1)
plt.plot(time, fit/np.max(fit), label = '$FWHM_{{\Delta t}}$ (ps) = ' + pulseduration)
plt.autoscale(enable = True, axis = 'x', tight = True)
plt.title("Auto-Correlation Data")
plt.xlabel("Time (ps)")
plt.ylabel("Intensity (a.u.)")
plt.legend()
I think the problem might be that the data are not completely Gaussian-like. It seems you have some kind of Airy/sinc function due to the time resolution of your acquisition instrument. Still, if you are only interested in the center you can still fit it using a single gaussian:
import fitwrap as fw
import pandas as pd
df = pd.read_csv('300', skip_blank_lines=True, skiprows=13, sep='\s+')
def gaussian_no_offset(x, x0=2, sigma=1, amp=300):
return amp*np.exp(-(x-x0)**2/sigma**2)
fw.fit(gaussian_no_offset, df.time, df.acf1)
x0: 2.59158 +/- 0.00828 (0.3%) initial:2
sigma: 0.373 +/- 0.0117 (3.1%) initial:1
amp: 355.02 +/- 9.65 (2.7%) initial:300
If you want to be slightly more precise I can think of a sinc squared function for the peak and a broad gaussian offset. The fit seems nicer, but it really depends on what the data actually represents...
def sinc(x, x0=2.5, amp=300, width=1, amp_g=20, sigma=3):
return amp*(np.sinc((x-x0)/width))**2 + amp_g*np.exp(-(x-x0)**2/sigma**2)
fw.fit(sinc, df.time, df.acf1)
x0: 2.58884 +/- 0.0021 (0.1%) initial:2.5
amp: 303.84 +/- 3.7 (1.2%) initial:300
width: 0.49211 +/- 0.00565 (1.1%) initial:1
amp_g: 81.32 +/- 2.11 (2.6%) initial:20
sigma: 1.512 +/- 0.0351 (2.3%) initial:3
I'd add a constant to the Gaussian equation, and limit the range of that in the bounds parameter of curve fit, so that the graph isn't raised higher.
So your equation would be:
def gaus(y0,x,a,x0,sigma):
return y0 + a*np.exp(-(x-x0)**2/(2*sigma**2))
and the curve_fit bounds would be something like this:
curve_fit(..... ,bounds = [[0,a_min, x0_min, sigma_min],[0.1, a_max, x0_max, sigma_max]])

Distribution plot is showing flat pdf

I tried to plot the Probability Density Function (PDF) plot of my data after finding the best parameters, but the plot is showing a flat line instead of a curve.
Is it a matter of scaling?
Is it a problem of Continuous or Discrete data? Data file is available here
The purpose here is to get the best distribution fittings and then plot PDF function.
My data values are so small like: 0.21, 1.117 .etc. The data statistics and PDF plots are shown below:
My script is given below:
from time import time
from datetime import datetime
start_time = datetime.now()
import pandas as pd
pd.options.display.float_format = '{:.4f}'.format
import numpy as np
import pickle
import scipy
import scipy.stats
import matplotlib.pyplot as plt
data= pd.read_csv("line_RXC_data.csv",usecols=['R'],parse_dates=True, squeeze=True)
df=data
y_std=df
# del yy
import warnings
warnings.filterwarnings("ignore")
# Create an index array (x) for data
y=df
#
# Create an index array (x) for data
x = np.arange(len(y))
size = len(y)
#simple visualisation of the data
plt.hist(y)
plt.title("Histogram of resistance ")
plt.xlabel("Resistance data visualization ")
plt.ylabel("Frequency")
plt.show()
y_df = pd.DataFrame(y)
tt=y_df.describe()
print(tt)
dist_names = [
'foldcauchy',
'beta',
'expon',
'exponnorm',
'norm',
'lognorm',
'dweibull',
'pareto',
'gamma'
]
x = np.arange(len(df))
size = len(df)
y_std = df
y=df
chi_square = []
p_values = []
# Set up 50 bins for chi-square test
# Observed data will be approximately evenly distrubuted aross all bins
percentile_bins = np.linspace(0,100,51)
percentile_cutoffs = np.percentile(y_std, percentile_bins)
observed_frequency, bins = (np.histogram(y_std, bins=percentile_cutoffs))
cum_observed_frequency = np.cumsum(observed_frequency)
# Loop through candidate distributions
for distribution in dist_names:
s1 = time()
# Set up distribution and get fitted distribution parameters
dist = getattr(scipy.stats, distribution)
# print("1")
param = dist.fit(y_std)
# print("2")
# Obtain the KS test P statistic, round it to 5 decimal places
p = scipy.stats.kstest(y_std, distribution, args=param)[1]
p = np.around(p, 5)
p_values.append(p)
# print("3")
# Get expected counts in percentile bins
# This is based on a 'cumulative distrubution function' (cdf)
cdf_fitted = dist.cdf(percentile_cutoffs, *param[:-2], loc=param[-2],
scale=param[-1])
# print("4")
expected_frequency = []
for bin in range(len(percentile_bins)-1):
expected_cdf_area = cdf_fitted[bin+1] - cdf_fitted[bin]
expected_frequency.append(expected_cdf_area)
# calculate chi-squared
expected_frequency = np.array(expected_frequency) * size
cum_expected_frequency = np.cumsum(expected_frequency)
ss = sum (((cum_expected_frequency - cum_observed_frequency) ** 2) / cum_observed_frequency)
chi_square.append(ss)
print(f"chi_square {distribution} time: {time() - s1}")
# print("std of predicted probability : ", np.std(cum_observed_frequency))
# Collate results and sort by goodness of fit (best at top)
results = pd.DataFrame()
results['Distribution'] = dist_names
results['chi_square'] = chi_square
results['p_value'] = p_values
results.sort_values(['chi_square'], inplace=True)
# Report results
print ('\nDistributions sorted by goodness of fit:')
print ('----------------------------------------')
print (results)
#%%
# Divide the observed data into 100 bins for plotting (this can be changed)
number_of_bins = 100
bin_cutoffs = np.linspace(np.percentile(y,0), np.percentile(y,99),number_of_bins)
# Create the plot
plt.figure(figsize=(7, 4))
h = plt.hist(y, bins = bin_cutoffs, color='0.70')
# Get the top three distributions from the previous phase
number_distributions_to_plot = 5
dist_names = results['Distribution'].iloc[0:number_distributions_to_plot]
#%%
# Create an empty list to stroe fitted distribution parameters
parameters = []
# Loop through the distributions ot get line fit and paraemters
for dist_name in dist_names:
# Set up distribution and store distribution paraemters
dist = getattr(scipy.stats, dist_name)
param = dist.fit(y)
parameters.append(param)
# Get line for each distribution (and scale to match observed data)
pdf_fitted = dist.pdf(x, *param[:-2], loc=param[-2], scale=param[-1])
scale_pdf = np.trapz (h[0], h[1][:-1]) / np.trapz (pdf_fitted, x)
pdf_fitted *= scale_pdf
# Add the line to the plot
plt.plot(pdf_fitted, label=dist_name)
# Set the plot x axis to contain 99% of the data
# This can be removed, but sometimes outlier data makes the plot less clear
plt.xlim(0,np.percentile(y,99))
# Add legend and display plotfig = plt.figure(figsize=(8,5))
plt.legend()
plt.title(u'Data distribution charateristics) \n' )
plt.xlabel(u'Resistance')
plt.ylabel('Frequency )')
plt.show()
# Store distribution paraemters in a dataframe (this could also be saved)
dist_parameters = pd.DataFrame()
dist_parameters['Distribution'] = (
results['Distribution'].iloc[0:number_distributions_to_plot])
dist_parameters['Distribution parameters'] = parameters
# Print parameter results
print ('\nDistribution parameters:')
print ('------------------------')
for index, row in dist_parameters.iterrows():
print ('\nDistribution:', row[0])
print ('Parameters:', row[1] )
If you look at the following categorical frequency analysis, you'll see that you have only 15 distinct values spread across the range with large gaps in between—not a continuum of values. Half the observations have the value 0.211, with another ~36% occurring at the value 1.117, ~8% at 0.194, and ~4% at 0.001. I think it's a mistake to treat this as continuous data.

Pandas finding local max and min

I have a pandas data frame with two columns one is temperature the other is time.
I would like to make third and fourth columns called min and max. Each of these columns would be filled with nan's except where there is a local min or max, then it would have the value of that extrema.
Here is a sample of what the data looks like, essentially I am trying to identify all the peaks and low points in the figure.
Are there any built in tools with pandas that can accomplish this?
The solution offered by fuglede is great but if your data is very noisy (like the one in the picture) you will end up with lots of misleading local extremes. I suggest that you use scipy.signal.argrelextrema() method. The .argrelextrema() method has its own limitations but it has a useful feature where you can specify the number of points to be compared, kind of like a noise filtering algorithm. for example:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.signal import argrelextrema
# Generate a noisy AR(1) sample
np.random.seed(0)
rs = np.random.randn(200)
xs = [0]
for r in rs:
xs.append(xs[-1] * 0.9 + r)
df = pd.DataFrame(xs, columns=['data'])
n = 5 # number of points to be checked before and after
# Find local peaks
df['min'] = df.iloc[argrelextrema(df.data.values, np.less_equal,
order=n)[0]]['data']
df['max'] = df.iloc[argrelextrema(df.data.values, np.greater_equal,
order=n)[0]]['data']
# Plot results
plt.scatter(df.index, df['min'], c='r')
plt.scatter(df.index, df['max'], c='g')
plt.plot(df.index, df['data'])
plt.show()
Some points:
you might need to check the points afterward to ensure there are no twine points very close to each other.
you can play with n to filter the noisy points
argrelextrema returns a tuple and the [0] at the end extracts a numpy array
Assuming that the column of interest is labelled data, one solution would be
df['min'] = df.data[(df.data.shift(1) > df.data) & (df.data.shift(-1) > df.data)]
df['max'] = df.data[(df.data.shift(1) < df.data) & (df.data.shift(-1) < df.data)]
For example:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Generate a noisy AR(1) sample
np.random.seed(0)
rs = np.random.randn(200)
xs = [0]
for r in rs:
xs.append(xs[-1]*0.9 + r)
df = pd.DataFrame(xs, columns=['data'])
# Find local peaks
df['min'] = df.data[(df.data.shift(1) > df.data) & (df.data.shift(-1) > df.data)]
df['max'] = df.data[(df.data.shift(1) < df.data) & (df.data.shift(-1) < df.data)]
# Plot results
plt.scatter(df.index, df['min'], c='r')
plt.scatter(df.index, df['max'], c='g')
df.data.plot()
using Numpy
ser = np.random.randint(-40, 40, 100) # 100 points
peak = np.where(np.diff(ser) < 0)[0]
or
double_difference = np.diff(np.sign(np.diff(ser)))
peak = np.where(double_difference == -2)[0]
using Pandas
ser = pd.Series(np.random.randint(2, 5, 100))
peak_df = ser[(ser.shift(1) < ser) & (ser.shift(-1) < ser)]
peak = peak_df.index
You can do something similar to Foad's .argrelextrema() solution, but with the Pandas .rolling() function:
# Find local peaks
n = 5 #rolling period
local_min_vals = df.loc[df['data'] == df['data'].rolling(n, center=True).min()]
local_max_vals = df.loc[df['data'] == df['data'].rolling(n, center=True).max()]
plt.scatter(local_min_vals.index, local_min_vals, c='r')
plt.scatter(local_max_vals.index, local_max_vals, c='g')

Categories