Less noisy graph and extra humps in Python

Here is the data file:
https://jsfiddle.net/83ygso6u/
Sorry for posting it in jsfiddle... didn't know where else to host it.
Anyway the second column should be ignored.
Here is the code and graph:
import pylab as plb
import math
from pylab import *
import matplotlib.pyplot as plt
data = plb.loadtxt('title_of_datafile.txt')
x = data[:,0]*1000
y= data[:,2]
plt.figure()
plt.title('Some_Title',fontsize=35, y=1.05)
plt.xlabel('Frequency (Hz)',fontsize=30)
plt.ylabel('dBu',fontsize=30)
plt.plot(x,y,'k-', label='Data')
plt.xticks(fontsize = 25, y=-0.008)
plt.yticks(fontsize = 25, x=-0.008)
plt.show()
So you can see this signal is quite noisy, but it does have two distinct peaks at around 4500 Hz and 5500 Hz.
I have been searching around the net and haven't really come across anything that will help me.
How can I extract these peaks and/or clean up the signal in Python?

Well I managed to find a solution. Here is the script with the resulting plot.
Script:
import pylab as plb
import math
from pylab import *
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from scipy import signal
import peakutils
from peakutils.plot import plot as pplot
data = plb.loadtxt('data_file_name')
x = data[:,0]*1000
y= data[:,2]
y1 = sp.signal.medfilt(y,431) # remove noise from the signal with a median filter
indexes = peakutils.indexes(y1, thres=0.00005, min_dist=1400) #determine peaks
x_new = x[indexes]
plt.figure()
plt.subplot(1,2,1)
plt.title('some_title_1',fontsize=35, y=1.05)
plt.xlabel('Frequency (Hz)',fontsize=30)
plt.ylabel('Signal (dBu)',fontsize=30)
plt.plot(x,y,'r-', label='Raw Data')
plt.plot(x,y1,'b-', label='Cleaned up Signal')
plt.plot(x_new[3:6],y1[indexes][3:6],'k^',markersize=10, label='Peaks')
plt.xticks(fontsize = 25, y=-0.008)
plt.yticks(fontsize = 25, x=-0.008)
plt.legend(loc=1,prop={'size':30})
plt.subplot(1,2,2)
for i,j in zip(x_new[3:6], y1[indexes][3:6]):
    plt.annotate(str(i) + " Hz", xy=(i, j + 0.5), fontsize=15)
plt.title('some_title_2',fontsize=35, y=1.05)
plt.xlabel('Frequency (Hz)',fontsize=30)
plt.ylabel('Signal (dBu)',fontsize=30)
plt.plot(x,y,'r-', label='Data')
plt.plot(x,y1,'b-')
plt.plot(x_new[3:6],y1[indexes][3:6],'k^',markersize=10)
plt.xticks(fontsize = 25, y=-0.008)
plt.yticks(fontsize = 25, x=-0.008)
plt.xlim([3000, 6000])
plt.ylim([-90, -75])
plt.subplots_adjust(wspace = 0.6) # space between the two side-by-side subplots
plt.show()
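As an alternative to peakutils, newer SciPy releases (1.1 and later) ship scipy.signal.find_peaks, which can do the same job. A minimal sketch along the same lines; the prominence value is a guess and would need tuning to this data:
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import medfilt, find_peaks
data = np.loadtxt('data_file_name')
x = data[:,0]*1000
y = data[:,2]
y1 = medfilt(y, 431)                                          # same median-filter window as above
peaks, props = find_peaks(y1, distance=1400, prominence=1.0)  # distance is in samples
plt.plot(x, y, 'r-', label='Raw Data')
plt.plot(x, y1, 'b-', label='Cleaned up Signal')
plt.plot(x[peaks], y1[peaks], 'k^', markersize=10, label='Peaks')
plt.legend()
plt.show()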

Related

Fit of intensity distribution does not work

I'm sitting here and don't know how to fit the right function for my intensity distribution from a double-slit experiment. I have tried a lot, but I don't know how to make it work. The x, y data are more than 1000 values.
Here is my plot:
And here's how it should look:
And this is my code:
import matplotlib.patches as mp
import matplotlib.pyplot as plt
import numpy as np
from scipy import optimize
from scipy.optimize import curve_fit
import pandas as pd
import math
data = pd.read_csv('TEM00-Doppelspalt-Short.txt',sep='\s+',header=None)
data = pd.DataFrame(data)
x = data[1]
y = data[2]
def expf(i0,g,k,y0,d):
    return i0*((np.sin(g*(k-y0)))/(g*(k-y0)))**2*np.cos(d*(k-y0))**2
popt, pcov =curve_fit(expf, x, y, p0 = (13, 20, 2, 4))
g,k,y0,d = popt
plt.figure(figsize = (8,6), dpi = 600)
plt.xlabel(r'Wavelength [$\mu$m]',fontsize=12)
plt.ylabel('Value [Cnts]', fontsize=12)
plt.plot(x, y,'ko')
plt.plot(x, expf(x,g,k,y0,d))
a_patch=mp.Patch(color='k', label="$TEM_{00}$ Doubleslit ShortMode")
plt.legend(handles=[a_patch],loc="upper left")
plt.show()
Here is my datafile:
Data File of Intensity
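For what it's worth, scipy.optimize.curve_fit always passes the independent variable as the first argument of the model function, so in the code above the x values end up bound to i0 and only four parameters are fitted. A minimal sketch of how the model could be reordered; the parameter names and the starting guess p0 are placeholders, not values from the original post:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
data = pd.read_csv('TEM00-Doppelspalt-Short.txt', sep=r'\s+', header=None)
x = data[1]
y = data[2]
def intensity(x, i0, g, y0, d):
    # sinc**2 diffraction envelope times cos**2 interference term;
    # np.sinc(z) = sin(pi*z)/(pi*z), so this avoids the 0/0 at x = y0
    envelope = np.sinc(g * (x - y0) / np.pi) ** 2
    return i0 * envelope * np.cos(d * (x - y0)) ** 2
popt, pcov = curve_fit(intensity, x, y, p0=(1.0, 20, 2, 4))  # rough placeholder guess
plt.plot(x, y, 'ko')
plt.plot(x, intensity(x, *popt))
plt.show()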

plot data as linear model in 3D with matplotlib

I want to plot some data like in the link below.
What can I do when I have more than "TV & Radio" in the OLS formula and I only want to plot these two against "Sales"? If I do it like in the code below (link), it shows me an error that the other predictors are not defined (everything except TV & Radio).
Thanks for the help!
https://stackoverflow.com/a/26434204/14208684
Here is the code of the link:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
from matplotlib import cm
csv = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
model = sm.ols(formula='Sales ~ TV + Radio', data = csv)
fit = model.fit()
fit.summary()
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x_surf = np.arange(0, 350, 20) # generate a mesh
y_surf = np.arange(0, 60, 4)
x_surf, y_surf = np.meshgrid(x_surf, y_surf)
exog = pd.core.frame.DataFrame({'TV': x_surf.ravel(), 'Radio': y_surf.ravel()})
out = fit.predict(exog = exog)
ax.plot_surface(x_surf, y_surf,
                out.reshape(x_surf.shape),
                rstride=1,
                cstride=1,
                color='None',
                alpha=0.4)
ax.scatter(csv['TV'], csv['Radio'], csv['Sales'],
           c='blue',
           marker='o',
           alpha=1)
ax.set_xlabel('TV')
ax.set_ylabel('Radio')
ax.set_zlabel('Sales')
plt.show()
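If the fitted formula contains more predictors than TV and Radio (say Newspaper as well, as in the Advertising data set), fit.predict still needs a value for every term in the formula. One way to plot only TV and Radio against Sales is to hold the remaining predictors fixed at their means, so the surface varies only with the two variables of interest. A minimal sketch along those lines; the Newspaper column name is an assumption based on that data set:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
csv = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
model = sm.ols(formula='Sales ~ TV + Radio + Newspaper', data=csv)
fit = model.fit()
x_surf, y_surf = np.meshgrid(np.arange(0, 350, 20), np.arange(0, 60, 4))
exog = pd.DataFrame({'TV': x_surf.ravel(),
                     'Radio': y_surf.ravel(),
                     # hold every predictor that is not plotted at its mean
                     'Newspaper': np.full(x_surf.size, csv['Newspaper'].mean())})
out = fit.predict(exog=exog)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(x_surf, y_surf, np.asarray(out).reshape(x_surf.shape), alpha=0.4)
ax.scatter(csv['TV'], csv['Radio'], csv['Sales'], c='blue', marker='o')
ax.set_xlabel('TV')
ax.set_ylabel('Radio')
ax.set_zlabel('Sales')
plt.show()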

How can I do a histogram with 1D gaussian mixture with sklearn?

I would like to do a histogram with a 1D Gaussian mixture, as in the picture.
Thanks Meng for the picture.
My histogram is this:
I have a file with a lot of data (4,000,000 numbers) in a single column:
1.727182
1.645300
1.619943
1.709263
1.614427
1.522313
And I'm using the following script, with the modifications that Meng and Justice Lord have made:
from matplotlib import rc
from sklearn import mixture
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import matplotlib.ticker as tkr
import scipy.stats as stats
x = open("prueba.dat").read().splitlines()
f = np.ravel(x).astype(float)
f = f.reshape(-1,1)
g = mixture.GaussianMixture(n_components=3,covariance_type='full')
g.fit(f)
weights = g.weights_
means = g.means_
covars = g.covariances_
plt.hist(f, bins=100, histtype='bar', density=True, ec='red', alpha=0.5)
plt.plot(f,weights[0]*stats.norm.pdf(f,means[0],np.sqrt(covars[0])), c='red')
plt.rcParams['agg.path.chunksize'] = 10000
plt.grid()
plt.show()
And when I run the script, I get the following plot:
So, I have no idea how to place the start and end of all the Gaussians that must be there. I'm new to Python and I'm confused about how to use the modules. Please, can you help me and guide me on how I can do this plot?
Thanks a lot
Although this is a reasonably old thread, I would like to provide my take on it. I believe my answer can be more comprehensible to some. Moreover, I include a test to check whether or not the desired number of components makes statistical sense via the BIC criterion.
# import libraries (some are for cosmetics)
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator)
import astropy
from scipy.stats import norm
from sklearn.mixture import GaussianMixture as GMM
import matplotlib as mpl
mpl.rcParams['axes.linewidth'] = 1.5
mpl.rcParams.update({'font.size': 15, 'font.family': 'STIXGeneral', 'mathtext.fontset': 'stix'})
# create the data as in #Meng's answer
x = np.concatenate((np.random.normal(5, 5, 1000), np.random.normal(10, 2, 1000)))
x = x.reshape(-1, 1)
# first of all, let's confirm the optimal number of components
bics = []
min_bic = 0
counter=1
for i in range(10): # test the AIC/BIC metric between 1 and 10 components
    gmm = GMM(n_components = counter, max_iter=1000, random_state=0, covariance_type = 'full')
    labels = gmm.fit(x).predict(x)
    bic = gmm.bic(x)
    bics.append(bic)
    if bic < min_bic or min_bic == 0:
        min_bic = bic
        opt_bic = counter
    counter = counter + 1
# plot the evolution of BIC/AIC with the number of components
fig = plt.figure(figsize=(10, 4))
ax = fig.add_subplot(1,2,1)
# Plot 1
plt.plot(np.arange(1,11), bics, 'o-', lw=3, c='black', label='BIC')
plt.legend(frameon=False, fontsize=15)
plt.xlabel('Number of components', fontsize=20)
plt.ylabel('Information criterion', fontsize=20)
plt.xticks(np.arange(0,11, 2))
plt.title('Opt. components = '+str(opt_bic), fontsize=20)
# Since the optimal value is n=2 according to both BIC and AIC, let's write down:
n_optimal = opt_bic
# create GMM model object
gmm = GMM(n_components = n_optimal, max_iter=1000, random_state=10, covariance_type = 'full')
# find useful parameters
gmm.fit(x)
mean = gmm.means_
covs = gmm.covariances_
weights = gmm.weights_
# create necessary things to plot
x_axis = np.arange(-20, 30, 0.1)
y_axis0 = norm.pdf(x_axis, float(mean[0][0]), np.sqrt(float(covs[0][0][0])))*weights[0] # 1st gaussian
y_axis1 = norm.pdf(x_axis, float(mean[1][0]), np.sqrt(float(covs[1][0][0])))*weights[1] # 2nd gaussian
ax = fig.add_subplot(1,2,2)
# Plot 2
plt.hist(x, density=True, color='black', bins=np.arange(-100, 100, 1))
plt.plot(x_axis, y_axis0, lw=3, c='C0')
plt.plot(x_axis, y_axis1, lw=3, c='C1')
plt.plot(x_axis, y_axis0+y_axis1, lw=3, c='C2', ls='dashed')
plt.xlim(-10, 20)
#plt.ylim(0.0, 2.0)
plt.xlabel(r"X", fontsize=20)
plt.ylabel(r"Density", fontsize=20)
plt.subplots_adjust(wspace=0.3)
plt.show()
plt.close('all')
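A side note on the component-selection loop above: it only records the BIC, but the AIC mentioned in its comment can be collected in exactly the same way if you want to compare both. A minimal sketch, reusing the GMM alias and the data x from above:
aics = []
for n in range(1, 11):
    gmm = GMM(n_components=n, max_iter=1000, random_state=0, covariance_type='full')
    gmm.fit(x)
    aics.append(gmm.aic(x))  # Akaike information criterion for this fit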
It's all about reshape.
First, you need to reshape f.
For pdf, reshape before using stats.norm.pdf. Similarly, sort and reshape before plotting.
from matplotlib import rc
from sklearn import mixture
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import matplotlib.ticker as tkr
import scipy.stats as stats
# x = open("prueba.dat").read().splitlines()
# create the data
x = np.concatenate((np.random.normal(5, 5, 1000),np.random.normal(10, 2, 1000)))
f = np.ravel(x).astype(float)
f = f.reshape(-1,1)
g = mixture.GaussianMixture(n_components=3,covariance_type='full')
g.fit(f)
weights = g.weights_
means = g.means_
covars = g.covariances_
plt.hist(f, bins=100, histtype='bar', density=True, ec='red', alpha=0.5)
f_axis = f.copy().ravel()
f_axis.sort()
plt.plot(f_axis,weights[0]*stats.norm.pdf(f_axis,means[0],np.sqrt(covars[0])).ravel(), c='red')
plt.rcParams['agg.path.chunksize'] = 10000
plt.grid()
plt.show()
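To draw every fitted component rather than just the first one, you can loop over the mixture parameters and overlay each weighted density, plus their sum. A minimal sketch on the same synthetic data; with the real prueba.dat file the bin count and number of components may need adjusting:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn import mixture
f = np.concatenate((np.random.normal(5, 5, 1000), np.random.normal(10, 2, 1000))).reshape(-1, 1)
g = mixture.GaussianMixture(n_components=3, covariance_type='full')
g.fit(f)
f_axis = np.linspace(f.min(), f.max(), 1000)
plt.hist(f.ravel(), bins=100, density=True, ec='red', alpha=0.5)
total = np.zeros_like(f_axis)
for w, m, c in zip(g.weights_, g.means_, g.covariances_):
    pdf = w * stats.norm.pdf(f_axis, m[0], np.sqrt(c[0][0]))
    total += pdf
    plt.plot(f_axis, pdf)        # each individual weighted component
plt.plot(f_axis, total, 'k--')   # sum of all components
plt.show()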

Python 2.7: Trouble fitting a Poisson distributed histogram; curve drops off quickly

I have a discrete set of data which looks Poisson distributed between 0 and 90. I'm trying to curve fit the data. My code is the following:
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
import numpy as np
from scipy.stats import poisson
from scipy.optimize import curve_fit
from scipy.misc import factorial
plt.figure(figsize = (10, 10))
entries, bin_edges, patches = plt.hist(data, bins = 90, range = [1, 90], normed = True)
bin_middles = 0.5*(bin_edges[1:] + bin_edges[:-1])
def poisson(k, lamb):
    return (lamb**k/factorial(k)) * np.exp(-lamb)
params, cov = curve_fit(poisson, bin_middles, entries)
x_plot = np.linspace(1, 90, 90)
plt.plot(x_plot, poisson(x_plot, *params), 'b--', lw = 2)
plt.axvline(np.mean(data), linestyle = '--', color = 'g')
plt.ylim(0, max(entries)+max(entries)/4)
plt.xlim([0, max(x_plot)])
I can't identify why the curve is sitting to the left and the values are extremely small. Thanks for any help!
This is not really an answer, but I can't get the picture into the comments. I'm getting a nice fit on some synthetic data with your code (added proper imports):
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.misc import factorial
%matplotlib inline
plt.figure(figsize = (10, 10))
#make up some synthetic data
data = np.random.poisson(3, 5000000)
entries, bin_edges, patches = plt.hist(data, bins = 90, range = [1, 90], normed = True)
bin_middles = 0.5*(bin_edges[1:] + bin_edges[:-1])
def poisson(k, lamb):
    return (lamb**k/ factorial(k)) * np.exp(-lamb)
params, cov = curve_fit(poisson, bin_middles, entries)
x_plot = np.linspace(1, 30, 30)
plt.plot(x_plot, poisson(x_plot, *params), 'r', lw = 2)
Your data are definitely not very Poissonian (relationship between peak and width is wrong, plus the very slow dropoff to the right), but I can't find the reason why your plot is so far off, either.
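For reference, scipy.misc.factorial has since moved to scipy.special.factorial in current SciPy. Also, for a plain Poisson fit you can skip curve_fit entirely, because the maximum-likelihood estimate of lambda is simply the sample mean; scipy.stats.poisson then gives the pmf at integer counts. A minimal sketch on synthetic data:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson
data = np.random.poisson(3, 500000)                        # synthetic counts
plt.figure(figsize=(10, 10))
plt.hist(data, bins=np.arange(0, 92) - 0.5, density=True)  # one bin per integer count
lamb = data.mean()             # maximum-likelihood estimate of the Poisson rate
k = np.arange(0, 31)
plt.plot(k, poisson.pmf(k, lamb), 'b--', lw=2)
plt.axvline(lamb, linestyle='--', color='g')
plt.show()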

Python - Matplotlib: normalize axis when plotting a Probability Density Function

I'm using Python and some of its extensions to get and plot the Probability Density Function. While I manage to plot it, in its shape at least, I don't manage to scale the axes properly.
import decimal
import numpy as np
import scipy.stats as stats
import pylab as pl
import matplotlib.pyplot as plt
from decimal import *
from scipy.stats import norm
lines=[]
fig, ax = plt.subplots(1, 1)
mean, var, skew, kurt = norm.stats(moments='mvsk')
#Here I omit some lines that fill the list with values
Long = len(lines)
Maxim = max(lines) #MaxValue
Minim = min(lines) #MinValue
av = np.mean(lines) #Average
StDev = np.std(lines) #Standard Dev.
x = np.linspace(Minim, Maxim, Long)
ax.plot(x, norm.pdf(x, av, StDev),'r-', lw=3, alpha=0.9, label='norm pdf')
weights = np.ones_like(lines)/len(lines)
ax.hist(lines, weights = weights, normed=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
The result is:
While I would like to have it expressed:
- On the x-axis, centered at 0 and expressed in terms of the standard deviation
- On the y-axis, related to the histogram and to percentages (normalized to 1)
For the x-axis, as in the image below.
And like this last image for the y-axis.
I've managed to scale the y-axis in a histogram by plotting it individually with the instruction weights = weights and setting it into the plot, but I can't do it here. I include it in the code, but it actually does nothing in this case.
Any help would be appreciated.
The y-axis is normed in such a way that the area under the curve is one, and adding equal weights for every data point makes no sense if you normalize anyway with normed=True.
First you need to shift your data to 0:
lines -= np.mean(lines)
Then plot it.
This should be a working minimal example:
import numpy as np
from numpy.random import normal
import matplotlib.pyplot as plt
from scipy.stats import norm
# gaussian distributed random numbers with mu =4 and sigma=2
x = normal(4, 2, 10000)
mean = np.mean(x)
sigma = np.std(x)
x -= mean
x_plot = np.linspace(min(x), max(x), 1000)
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.hist(x, bins=50, normed=True, label="data")
ax.plot(x_plot, norm.pdf(x_plot, mean, sigma), 'r-', label="pdf")
ax.legend(loc='best')
x_ticks = np.arange(-4*sigma, 4.1*sigma, sigma)
x_labels = [r"${} \sigma$".format(i) for i in range(-4,5)]
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_labels)
plt.show()
output image is this:
And you have too many imports.
You import decimal twice, once even with *.
Also, numpy, pyplot, and scipy are already included in pylab. And why import the whole scipy.stats and then import just norm from it again?
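As for the percentage-style y-axis the question also asks about, one option (assuming matplotlib 2.1 or newer, which provides PercentFormatter) is to combine the weights trick with a percent formatter on the y-axis. A minimal sketch:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from numpy.random import normal
x = normal(4, 2, 10000)
x -= np.mean(x)
fig, ax = plt.subplots()
# weights make the bar heights sum to 1, i.e. a fraction of the data per bin
ax.hist(x, bins=50, weights=np.ones_like(x) / len(x), alpha=0.5)
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))  # display fractions as %
plt.show()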
