Distribution plot is showing flat pdf - python

I tried to plot the Probability Density Function (PDF) plot of my data after finding the best parameters, but the plot is showing a flat line instead of a curve.
Is it a matter of scaling?
Is it a problem of Continuous or Discrete data? Data file is available here
The purpose here is to get the best distribution fittings and then plot PDF function.
My data values are small, e.g. 0.21, 1.117, etc. The data statistics and PDF plots are shown below:
My script is given below:
from time import time
from datetime import datetime
start_time = datetime.now()
import pandas as pd
pd.options.display.float_format = '{:.4f}'.format
import numpy as np
import pickle
import scipy
import scipy.stats
import matplotlib.pyplot as plt
data= pd.read_csv("line_RXC_data.csv",usecols=['R'],parse_dates=True, squeeze=True)
# del yy
import warnings
# Create an index array (x) for data
x = np.arange(len(y))
size = len(y)
#simple visualisation of the data
plt.title("Histogram of resistance ")
plt.xlabel("Resistance data visualization ")
y_df = pd.DataFrame(y)
dist_names = [
x = np.arange(len(df))
size = len(df)
y_std = df
chi_square = []
p_values = []
# Set up 50 bins for chi-square test
# Observed data will be approximately evenly distrubuted aross all bins
percentile_bins = np.linspace(0,100,51)
percentile_cutoffs = np.percentile(y_std, percentile_bins)
observed_frequency, bins = (np.histogram(y_std, bins=percentile_cutoffs))
cum_observed_frequency = np.cumsum(observed_frequency)
# Loop through candidate distributions
for distribution in dist_names:
s1 = time()
# Set up distribution and get fitted distribution parameters
dist = getattr(scipy.stats, distribution)
# print("1")
param = dist.fit(y_std)
# print("2")
# Obtain the KS test P statistic, round it to 5 decimal places
p = scipy.stats.kstest(y_std, distribution, args=param)[1]
p = np.around(p, 5)
# print("3")
# Get expected counts in percentile bins
# This is based on a 'cumulative distrubution function' (cdf)
cdf_fitted = dist.cdf(percentile_cutoffs, *param[:-2], loc=param[-2],
# print("4")
expected_frequency = []
for bin in range(len(percentile_bins)-1):
expected_cdf_area = cdf_fitted[bin+1] - cdf_fitted[bin]
# calculate chi-squared
expected_frequency = np.array(expected_frequency) * size
cum_expected_frequency = np.cumsum(expected_frequency)
ss = sum (((cum_expected_frequency - cum_observed_frequency) ** 2) / cum_observed_frequency)
print(f"chi_square {distribution} time: {time() - s1}")
# print("std of predicted probability : ", np.std(cum_observed_frequency))
# Collate results and sort by goodness of fit (best at top)
results = pd.DataFrame()
results['Distribution'] = dist_names
results['chi_square'] = chi_square
results['p_value'] = p_values
results.sort_values(['chi_square'], inplace=True)
# Report results
print ('\nDistributions sorted by goodness of fit:')
print ('----------------------------------------')
print (results)
# Divide the observed data into bins for plotting (this can be changed).
# Note: `number_of_bins` edges produce `number_of_bins - 1` bins.
number_of_bins = 100
bin_cutoffs = np.linspace(np.percentile(y, 0), np.percentile(y, 99), number_of_bins)

# Create the plot
plt.figure(figsize=(7, 4))
h = plt.hist(y, bins=bin_cutoffs, color='0.70')
counts, edges = h[0], h[1]
# Area under the histogram (count * bin width); each fitted PDF is rescaled to it
hist_area = np.sum(counts * np.diff(edges))

# The PDF has to be evaluated at data *values*. Evaluating it at the sample
# indices 0..len(y)-1 (the old `x = np.arange(len(y))`) puts almost every
# evaluation point far outside the range of the data, which is what produced
# the flat line.
pdf_x = np.linspace(edges[0], edges[-1], 1000)

# Get the top distributions from the previous phase
number_distributions_to_plot = 5
dist_names = results['Distribution'].iloc[0:number_distributions_to_plot]

# Fitted parameters, one entry per plotted distribution (same order as dist_names)
parameters = []

# Loop through the distributions to get the fitted line and parameters
for dist_name in dist_names:
    # Set up distribution and store distribution parameters
    dist = getattr(scipy.stats, dist_name)
    param = dist.fit(y)
    parameters.append(param)

    # Get line for each distribution (and scale to match observed data)
    pdf_fitted = dist.pdf(pdf_x, *param[:-2], loc=param[-2], scale=param[-1])
    # Trapezoidal area written out explicitly (np.trapz is gone in NumPy 2)
    pdf_area = np.sum(0.5 * (pdf_fitted[1:] + pdf_fitted[:-1]) * np.diff(pdf_x))
    if not np.isfinite(pdf_area) or pdf_area <= 0:
        # A degenerate fit (e.g. on strongly discrete data) cannot be rescaled
        print(f"Skipping {dist_name}: fitted PDF is degenerate over the plotted range")
        continue

    # Add the line to the plot, against the data-value grid
    plt.plot(pdf_x, pdf_fitted * hist_area / pdf_area, label=dist_name)

# Set the plot x axis to contain 99% of the data
# This can be removed, but sometimes outlier data makes the plot less clear
plt.xlim(edges[0], edges[-1])

# Add legend and display plot
plt.title(u'Data distribution charateristics) \n' )
plt.ylabel('Frequency )')
plt.legend()
plt.show()

# Store distribution parameters in a dataframe (this could also be saved)
dist_parameters = pd.DataFrame()
dist_parameters['Distribution'] = dist_names
dist_parameters['Distribution parameters'] = parameters

# Print parameter results
print ('\nDistribution parameters:')
print ('------------------------')
for _, row in dist_parameters.iterrows():
    print ('\nDistribution:', row['Distribution'])
    print ('Parameters:', row['Distribution parameters'] )

If you look at the following categorical frequency analysis, you'll see that you have only 15 distinct values spread across the range with large gaps in between—not a continuum of values. Half the observations have the value 0.211, with another ~36% occurring at the value 1.117, ~8% at 0.194, and ~4% at 0.001. I think it's a mistake to treat this as continuous data.


Does the resolution of a timeseries affect the estimate of the power-spectrum when using np.fft.rfft and does it comply with parseval's theorem?

I have been trying to estimate the power spectrum of a timeseries using fourier transform. I have tried to do this using two variations of the spectral density estimate using np.fft.rfft. The two functions are the following:
def TracePSD_1st(x, dt):
    """
    Estimate the power spectrum of a real time series (orthonormal FFT scaling).

    x : timeseries, np.array (or anything np.asarray accepts)
    dt: 1/sampling frequency

    Returns (freqs, power): the strictly positive frequencies of the one-sided
    spectrum and |rfft(x, norm='ortho')|**2 at exactly those frequencies.
    """
    x = np.asarray(x)
    B_pow = np.abs(np.fft.rfft(x, norm='ortho'))**2
    freqs = np.fft.rfftfreq(len(x), dt)
    # Drop the zero-frequency bin from BOTH arrays with the same mask.
    # Filtering only `freqs` shifted every power value onto the next-higher
    # frequency. rfftfreq is already ascending, so no sort is needed.
    positive = freqs > 0
    return freqs[positive], B_pow[positive]
def TracePSD_2nd(x, dt):
    """
    Estimate the power spectral density of a real time series.

    x : timeseries, np.array (or anything np.asarray accepts)
    dt: 1/sampling frequency

    Returns (freqs, psd): the strictly positive frequencies of the one-sided
    spectrum and |rfft(x)|**2 * dt / N at exactly those frequencies.
    """
    x = np.asarray(x)
    N = len(x)
    yf = np.fft.rfft(x)
    B_pow = abs(yf) ** 2 / N * dt
    # rfft output must be paired with rfftfreq. fftfreq describes the
    # two-sided layout, so it has a different length and ordering.
    freqs = np.fft.rfftfreq(N, dt)
    # Drop the zero-frequency bin from BOTH arrays with the same mask so each
    # PSD value stays attached to its own frequency.
    positive = freqs > 0
    return freqs[positive], B_pow[positive]
The issue arises when I try to downsample my original timeseries and re-estimate the spectrum. The first method gives a different PSD depending on the resolution, while the second one gives a very similar result.
The results I am getting when using these two functions are shown below:
The weird thing is that the PSD estimated using the first method is in rough accordance with Parseval's theorem while the second one is not.
Any suggestions of what the correct method is? Or an improved version is needed?
I append here a piece of code to reproduce the figures I just showed using a timeseries corresponding to fractional brownian motion ( you will need to pip install fbm)
from fbm import fbm
# create a sythetic timeseries using a fractional brownian motion !( In case you don't have fbm-> pip install fbm)
start_time = datetime.datetime.now()
# Create index for timeseries
end_time = datetime.datetime.now()+ pd.Timedelta('1H')
freq = '10ms'
index = pd.date_range(
start = start_time,
end = end_time,
freq = freq
# Generate a fBm realization
fbm_sample = fbm(n=len(index), hurst=0.75, length=1, method='daviesharte')
# Create a dataframe to resample the timeseries.
df_b = pd.DataFrame({'DateTime': index, 'Br':fbm_sample[:-1]}).set_index('DateTime')
#Original version of timeseries
y = df_b.Br
# Resample the synthetic timeseries
x = df_b.Br.resample(str(int(resolution))+"ms").mean()
# Estimate the sampling rate
dtx = (x.dropna().index.to_series().diff()/np.timedelta64(1, 's')).median()
dty = (y.dropna().index.to_series().diff()/np.timedelta64(1, 's')).median()
# Estimate PSD using first method
resy = TracePSD_1st(y, dty)
resx = TracePSD_1st(x, dtx)
# Estimate PSD using second method
resya = TracePSD_2nd(y, dty)
resxa = TracePSD_2nd(x, dtx)
fig, ax =plt.subplots(1, 3, figsize=(30,10), sharex=True, sharey=True )
ax[0].loglog(resy[0], resy[1], label ='Original timeseries, 1st method')
ax[0].loglog(resx[0], resx[1], label ='Downsampled timeseries, 1st method')
ax[0].text(5*1e-4, 1e-8, r'$\frac{Power_{Real}}{Power_{Fourier}}$ = '+ str(round(sum(abs(y**2))/ sum(abs(resy[1])) ,2)), fontsize =20)
y = df_b.Br
x = df_b.Br.resample(str(int(resolution))+"ms").mean()
dtx = (x.dropna().index.to_series().diff()/np.timedelta64(1, 's')).median()
dty = (y.dropna().index.to_series().diff()/np.timedelta64(1, 's')).median()
ax[1].loglog(resya[0], resya[1], label ='Original timeseries, 2nd method')
ax[1].loglog(resxa[0], resxa[1], label ='Downsampled timeseries, 2nd method')
ax[1].text(5*1e-4, 1e-8, r'$\frac{Power_{Real}}{Power_{Fourier}}$ = '+ str(round(sum(abs(y**2))/ sum(abs(resya[1])) ,2)), fontsize =20)
ax[2].loglog(resy[0], resy[1], label ='Original timeseries, 1st method')
ax[2].loglog(resya[0], resya[1], label ='Original timeseries, 2nd method')
for i in range(3):
ax[i].set_xlabel(r'$Frequency \ [Hz]$')

Update a python function for a 2d array usage

I want to update a Python function that was originally written to process a 1-D array so that it also works on 2-D data. I tried different approaches but still got errors. If I flatten my 2-D data, it loses its meaning, since it is voice data. Below are made-up data and the function to reproduce the error.
x = np.random.normal(0,1,(40,2))
print(cpp_function(x=signal, fs=44100, pitch_range=[75, 300], trendline_quefrency_range=[0.001, 0.05]))
def _box_smooth(values, box_pts):
    """Moving average of a 1-D array with a flat window of `box_pts` points."""
    box = np.ones(box_pts) / box_pts
    return np.convolve(values, box, mode='same')


def cpp_function(x, fs, pitch_range, trendline_quefrency_range, smooth=False, time_smooth_len=None, quefrency_smooth_len=None):
    """
    Computes cepstral peak prominence for a given signal

    Parameters
    ----------
    x: ndarray
        The audio signal. Either 1-D (samples,) or 2-D (samples, channels);
        a 2-D input is processed column by column.
    fs: integer
        The sampling frequency
    pitch_range: list of 2 elements
        The pitch range where a peak is searched for
    trendline_quefrency_range: list of 2 elements
        The quefrency range for which the amplitudes will be modelled by a straight line
    smooth: bool
        Whether to smooth the cepstrum with moving averages before the peak search
    time_smooth_len, quefrency_smooth_len: integer or None
        Window lengths of the two successive moving averages applied when
        `smooth` is set; a length of None skips that pass.

    Returns
    -------
    The cepstral peak prominence of the audio signal: a scalar for 1-D input,
    an array with one value per column for 2-D input.

    Raises
    ------
    ValueError
        If `x` is not 1-D or 2-D, is too short, or if one of the requested
        quefrency ranges contains no usable cepstrum bins.
    """
    x = np.asarray(x)
    if x.ndim == 2:
        # Each column is an independent channel: run the 1-D analysis on it
        return np.array([
            cpp_function(channel, fs, pitch_range, trendline_quefrency_range,
                         smooth, time_smooth_len, quefrency_smooth_len)
            for channel in x.T
        ])
    if x.ndim != 1:
        raise ValueError(f"x must be 1-D or 2-D, got {x.ndim} dimensions")

    n_samples = len(x)
    if n_samples < 2:
        raise ValueError("x needs at least 2 samples to define a frequency step")

    # Cepstrum
    windowed = np.hamming(n_samples) * x
    spectrum = 20 * np.log10(np.abs(np.fft.rfft(windowed)))
    ceps = 20 * np.log10(np.abs(np.fft.rfft(spectrum)))

    # Smoothing: two successive moving averages over the cepstrum
    if smooth:
        for box_pts in (time_smooth_len, quefrency_smooth_len):
            if box_pts is not None:
                ceps = _box_smooth(ceps, box_pts)

    # Quefrency
    dt = 1 / fs
    freq_vector = np.fft.rfftfreq(n_samples, d=dt)
    df = freq_vector[1] - freq_vector[0]
    quefrency_vector = np.fft.rfftfreq(2 * ceps.size - 2, df)

    # Selecting part of cepstrum
    peak_mask = ((quefrency_vector >= 1 / pitch_range[1])
                 & (quefrency_vector <= 1 / pitch_range[0]))
    # For trend line
    trend_mask = ((quefrency_vector >= trendline_quefrency_range[0])
                  & (quefrency_vector <= trendline_quefrency_range[1]))
    if not peak_mask.any():
        raise ValueError("no cepstrum bins fall inside pitch_range; use a longer signal")
    if trend_mask.sum() < 2:
        raise ValueError("fewer than 2 cepstrum bins fall inside trendline_quefrency_range")

    # Linear regression (ordinary least-squares straight line)
    slope, intercept = np.polyfit(quefrency_vector[trend_mask], ceps[trend_mask], 1)

    # Prominence = peak height above the trend line at the peak's quefrency
    peak_index = np.argmax(ceps[peak_mask])
    peak_value = ceps[peak_mask][peak_index]
    peak_quefrency = quefrency_vector[peak_mask][peak_index]
    cpp = peak_value - (slope * peak_quefrency + intercept)
    return cpp

Getting a smooth Poisson Distribution over a Histogram with Small Number of Bins

I have a Poisson distribution of a background count which mostly contains counts equal to zero, I've fitted a Poisson distribution to this data and gotten the following result:
I have another dataset from a source which has higher count rates, in this case it works fine:
Here's my (inelegant) code in full;
mean_values = []
# obtaining results:
for a in data_arrays:
dataset = globals()[a]
cps_vals = dataset[:,1]
max_cps = int(max(cps_vals))
mean_name = a +"_mean"
std_name = a + "_std"
serr_name = a + "_serr"
mean = globals()[mean_name] = np.mean(cps_vals)
globals()[std_name] = np.std(cps_vals,ddof=1)
globals()[serr_name] = globals()[std_name]/np.sqrt(len(cps_vals)) ## I used globals() so I could call in e.g. the background serr as the variable bg_serr.
print(a,"mean:",globals()[mean_name],"sqrt(mean):",np.sqrt(globals()[mean_name]),"std:",globals()[std_name],"serr:",globals()[serr_name],"sqrt(lambda)/sigma =",np.sqrt(globals()[mean_name])/globals()[std_name])
# plotting with Poisson:
bin_edges = np.arange(0, max_cps+1.1, 1)
histogram = plt.hist(cps_vals,density=True,bins=bin_edges)
plt.xlabel("Counts Per Second")
plt.ylabel("Probability of Occurence")
pops = histogram[0]
bins = histogram[1]
maxidx = np.argmax(pops)
maxpop = pops[maxidx]
maxbin = np.max(bins)
most_populated_bin = bins[maxidx]
plt.plot(np.arange(0, maxbin), poisson.pmf(np.arange(0,maxbin),
This is the relevant line for the Poisson plot:
plt.plot(np.arange(0, maxbin), poisson.pmf(np.arange(0,maxbin), np.mean(cps_vals)),c="black")
If I try to make the np.arange spacing smaller, I get ringing in the Poisson curves:
I think this is because it needs integer values of counts?
How can I produce a smooth Gaussian curve for the background count? The one I'm getting doesn't look right.
mu = 15
r = poisson.rvs(mu, size=100000)
plt.hist(r, bins=np.linspace(0, 35, 36), alpha=0.5, label='counting process', ec='black', align='left')
plt.plot(poisson.pmf(np.linspace(0, 35, 36),mu)*100000)

Averaging several time-series together with confidence interval (with test code)

Sounds very complicated but a simple plot will make it easy to understand:
I have three curves of cumulative sum of some values over time, which are the blue lines.
I want to average (or somehow combine in a statistically correct way) the three curves into one smooth curve and add confidence interval.
I tried one simple solution - combining all the data into one curve, average it with the "rolling" function in pandas, getting the standard deviation for it. I plotted those as the purple curve with the confidence interval around it.
The problem with my real data, as illustrated in the plot above, is that the curve isn't smooth at all. There are also sharp jumps in the confidence interval, which is not a good representation of the 3 separate curves, as there are no jumps in them.
Is there a better way to represent the 3 different curves in one smooth curve with a nice confidence interval?
I supply a test code, tested on python 3.5.1 with numpy and pandas (don't change the seed in order to get the same curves).
There are some constraints: increasing the number of points for the "rolling" function isn't a solution for me, because some of my data is too short for that.
Test code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
## data generation - cumulative analysis over time
df1_time = pd.DataFrame(np.random.uniform(0,1000,size=50), columns=['time'])
df1_values = pd.DataFrame(np.random.randint(0,10000,size=100), columns=['vals'])
df1_combined_sorted = pd.concat([df1_time, df1_values], axis = 1).sort_values(by=['time'])
df1_combined_sorted_cumulative = np.cumsum(df1_combined_sorted['vals'])
df2_time = pd.DataFrame(np.random.uniform(0,1000,size=50), columns=['time'])
df2_values = pd.DataFrame(np.random.randint(1000,13000,size=100), columns=['vals'])
df2_combined_sorted = pd.concat([df2_time, df2_values], axis = 1).sort_values(by=['time'])
df2_combined_sorted_cumulative = np.cumsum(df2_combined_sorted['vals'])
df3_time = pd.DataFrame(np.random.uniform(0,1000,size=50), columns=['time'])
df3_values = pd.DataFrame(np.random.randint(0,4000,size=100), columns=['vals'])
df3_combined_sorted = pd.concat([df3_time, df3_values], axis = 1).sort_values(by=['time'])
df3_combined_sorted_cumulative = np.cumsum(df3_combined_sorted['vals'])
## combining the three curves
# Stack the cumulative sums and their timestamps in the same order
# (curve 1, 2, 3) so that row i of both series refers to the same sample.
df_all_vals_cumulative = pd.concat([
    df1_combined_sorted_cumulative,
    df2_combined_sorted_cumulative,
    df3_combined_sorted_cumulative,
]).reset_index(drop=True)
df_all_time = pd.concat([
    df1_combined_sorted['time'],
    df2_combined_sorted['time'],
    df3_combined_sorted['time'],
]).reset_index(drop=True)
# Side-by-side frame with columns 'time' and 'vals'
df_all = pd.concat([df_all_time, df_all_vals_cumulative], axis = 1)
## creating confidence intervals
df_all_sorted = df_all.sort_values(by=['time'])
ma = df_all_sorted.rolling(10).mean()
mstd = df_all_sorted.rolling(10).std()
## plotting
plt.fill_between(df_all_sorted['time'], ma['vals'] - 2 * mstd['vals'],
ma['vals'] + 2 * mstd['vals'],color='b', alpha=0.2)
plt.plot(df_all_sorted['time'],ma['vals'], c='purple')
plt.plot(df1_combined_sorted['time'], df1_combined_sorted_cumulative, c='blue')
plt.plot(df2_combined_sorted['time'], df2_combined_sorted_cumulative, c='blue')
plt.plot(df3_combined_sorted['time'], df3_combined_sorted_cumulative, c='blue')
First of all, your sample code could be re-written to make better use of pd. For example
## data generation - cumulative analysis over time
def get_data(max_val, max_time=1000):
    """Generate one random cumulative curve.

    Draws 50 uniform times in [0, max_time) and 100 random integers in
    [0, max_val), pairs them row by row, and returns a frame sorted by time
    with columns 'time', 'vals' and 'cumulative' (running sum of 'vals').
    """
    times = pd.DataFrame(np.random.uniform(0, max_time, size=50), columns=['time'])
    vals = pd.DataFrame(np.random.randint(0, max_val, size=100), columns=['vals'])
    df = (
        pd.concat([times, vals], axis=1)
        # Only the first 50 values have a time. Drop the unpaired rows so
        # values without a timestamp do not leak into the cumulative sum.
        .dropna()
        .sort_values(by=['time'])
        .reset_index(drop=True)
    )
    df['cumulative'] = df.vals.cumsum()
    return df
# one dataframe per curve, built in the order 10000, 13000, 4000
dfs = tuple(get_data(upper) for upper in (10000, 13000, 4000))
df1, df2, df3 = dfs
# stack all curves into one frame ordered by time
df_all = pd.concat(dfs, ignore_index=True).sort_values(by=['time'])
# render function
def render(window=10):
    """Plot the individual curves, their pooled rolling mean and a +/- 2 std band.

    window: number of consecutive rows of the module-level `df_all` used for
        the rolling mean and rolling standard deviation.

    Reads the module-level `dfs` and `df_all`; draws on the current
    matplotlib figure and returns nothing.
    """
    # compute rolling means and confidence intervals
    rolling = df_all.cumulative.rolling(window)
    mean_val = rolling.mean()
    std_val = rolling.std()
    min_val = mean_val - 2*std_val
    max_val = mean_val + 2*std_val

    # individual curves
    for df in dfs:
        plt.plot(df.time, df.cumulative, c='blue')

    # pooled estimate, drawn once on top of the individual curves
    plt.plot(df_all.time, mean_val, c='r')
    plt.fill_between(df_all.time, min_val, max_val, color='blue', alpha=.2)
The reason your curves aren't that smooth may be that your rolling window is not large enough. You can increase the window size to get smoother graphs. For example, render(20) gives:
while render(30) gives:
Although, the better way might be imputing each of df['cumulative'] to the entire time window and compute the mean/confidence interval on these series. With that in mind, we can modify the code as follows:
## data generation - cumulative analysis over time
def get_data(max_val, max_time=1000):
    """Generate one random cumulative curve indexed by time.

    Draws 50 uniform times in [0, max_time) and 100 random integers in
    [0, max_val), keeps the rows that have both, and returns a frame indexed
    by 'time' (ascending) with columns 'vals' and 'cumulative' (running sum
    of 'vals').
    """
    times = pd.DataFrame(np.random.uniform(0, max_time, size=50), columns=['time'])
    vals = pd.DataFrame(np.random.randint(0, max_val, size=100), columns=['vals'])
    # note that we set time as index of the returned data
    df = (
        pd.concat([times, vals], axis=1)
        .dropna()  # the last 50 values have no time and are discarded
        .set_index('time')
        .sort_index()
    )
    df['cumulative'] = df.vals.cumsum()
    return df
dfs = tuple(get_data(upper) for upper in (10000, 13000, 4000))
df1, df2, df3 = dfs

# rename column for later plotting: each curve gets its own cumulative column
for i, df in enumerate(dfs):
    df.rename(columns={'cumulative':f'cummulative_{i}'}, inplace=True)

# concatenate the dataframes with common time index
# (every row carries a value for exactly one of the three cumulative columns)
df_all = pd.concat(dfs,sort=False).sort_index()

# interpolate each cumulative column linearly in time, so every curve has a
# value at every time stamp. Without this step each row holds a single
# non-NaN curve and the row-wise standard deviation is NaN everywhere.
curves = df_all.iloc[:,1:].interpolate(method='index')

# plot graphs
mean_val = curves.mean(axis=1)
std_val = curves.std(axis=1)
min_val = mean_val - 2*std_val
max_val = mean_val + 2*std_val
fig, ax = plt.subplots(1,1,figsize=(16,9))
plt.plot(df_all.index, mean_val, c='purple')
plt.fill_between(df_all.index, min_val, max_val, color='blue', alpha=.2)
and we get:

how to isolate data that are 2 and 3 sigma deviated from mean and then mark them in a plot in python?

I am reading from a dataset which looks like the following when plotted in matplotlib, and I then take the best-fit curve using linear regression.
The sample of data looks like following:
# ID X Y px py pz M R
1.04826492772e-05 1.04828050287e-05 1.048233088e-05 0.000107002791008 0.000106552433081 0.000108704469007 387.02 4.81947797625e+13
1.87380963036e-05 1.87370588085e-05 1.87372620448e-05 0.000121616280029 0.000151924707761 0.00012371156585 428.77 6.54636174067e+13
3.95579877816e-05 3.95603773653e-05 3.95610756809e-05 0.000163470663023 0.000265203868883 0.000228031803626 470.74 8.66961875758e+13
My code looks the following:
# Regression Function
def regress(x, y):
    """Return a tuple of predicted y values and parameters for linear regression.

    x, y: equal-length 1-D sequences of numbers.

    Returns (y_pred, p): `y_pred` is the fitted line evaluated at `x`, and `p`
    is the result object returned by scipy.stats.linregress.
    """
    # Imported here so the function does not depend on a module-level `sp` alias
    from scipy import stats

    x = np.asarray(x, dtype=float)
    p = stats.linregress(x, y)
    # Evaluate the line directly rather than through the deprecated
    # `scipy.polyval` alias of numpy.polyval.
    y_pred = p.slope * x + p.intercept
    return y_pred, p
# plotting z
xz, yz = M, Y_z # data, non-transformed
y_pred, _ = regress(xz, np.log(yz)) # change here # transformed input
plt.semilogy(xz, yz, marker='o',color ='b', markersize=4,linestyle='None', label="l.o.s within R500")
plt.semilogy(xz, np.exp(y_pred), "b", label = 'best fit') # transformed output
However I can see a lot upward scatter in the data and the best fit curve is affected by those. So first I want to isolate the data points which are 2 and 3 sigma away from my mean data, and mark them with circle around them.
Then take the best fit curve considering only the points which fall within 1 sigma of my mean data
Is there a good function in python which can do that for me?
Also in addition to that may I also isolate the data from my actual dataset, like if the third row in the sample input represents 2 sigma deviation may I have that row as an output too to save later and investigate more?
Your help is most appreciated.
Here's some code that goes through the data in a given number of windows, calculates statistics in said windows, and separates data in well- and misbehaved lists.
Hope this helps.
from scipy import stats
from scipy import polyval
import numpy as np
import matplotlib.pyplot as plt
num_data = 10000
fake_data_x = np.sort(12.8+np.random.random(num_data))
fake_data_y = np.exp(fake_data_x) + np.random.normal(0,scale=50000,size=num_data)
# Regression Function
def regress(x, y):
    """Return a tuple of predicted y values and parameters for linear regression.

    x, y: equal-length 1-D sequences of numbers.

    Returns (y_pred, p): `y_pred` is the fitted line evaluated at `x`, and `p`
    is the result object returned by stats.linregress.
    """
    x = np.asarray(x, dtype=float)
    p = stats.linregress(x, y)
    # Evaluate the line directly rather than through the deprecated
    # `scipy.polyval` alias of numpy.polyval.
    y_pred = p.slope * x + p.intercept
    return y_pred, p
# plotting z
xz, yz = fake_data_x, fake_data_y # data, non-transformed
y_pred, _ = regress(xz, np.log(yz)) # change here # transformed input
plt.semilogy(xz, yz, marker='o',color ='b', markersize=4,linestyle='None', label="l.o.s within R500")
plt.semilogy(xz, np.exp(y_pred), "b", label = 'best fit') # transformed output
num_bin_intervals = 10 # approx number of averaging windows
window_boundaries = np.linspace(min(fake_data_x),max(fake_data_x),int(len(fake_data_x)/num_bin_intervals)) # window boundaries
y_good = [] # list to collect the "well-behaved" y-axis data
x_good = [] # list to collect the "well-behaved" x-axis data
y_outlier = []
x_outlier = []
for i in range(len(window_boundaries)-1):
# create a boolean mask to select the data within the averaging window
window_indices = (fake_data_x<=window_boundaries[i+1]) & (fake_data_x>window_boundaries[i])
# separate the pieces of data in the window
fake_data_x_slice = fake_data_x[window_indices]
fake_data_y_slice = fake_data_y[window_indices]
# calculate the mean y_value in the window
y_mean = np.mean(fake_data_y_slice)
y_std = np.std(fake_data_y_slice)
# choose and select the outliers
y_outliers = fake_data_y_slice[np.abs(fake_data_y_slice-y_mean)>=2*y_std]
x_outliers = fake_data_x_slice[np.abs(fake_data_y_slice-y_mean)>=2*y_std]
# choose and select the good ones
y_goodies = fake_data_y_slice[np.abs(fake_data_y_slice-y_mean)<2*y_std]
x_goodies = fake_data_x_slice[np.abs(fake_data_y_slice-y_mean)<2*y_std]
# extend the lists with all the good and the bad
