I am trying to fit a Gaussian to my data using SciPy's curve_fit. Here is my code:
import csv
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

A = []
T = []
seuil = 1000   # detection threshold
range_gauss = 4
pos_peaks = []
amp_peaks = []
A_gauss = []
T_gauss = []
new_A = []
new_T = []

def gauss(x, a, x0, sigma):
    return a*np.exp(-(x-x0)**2/(2*sigma**2))

with open("classeur_test.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        A.append(float(row[0]))
        T.append(float(row[1]))

npA = np.array(A)
npT = np.array(T)

# stop at len(T)-1 so that A[i+1] stays in range
for i in range(1, len(T)-1):
    # PEAK DETECTION
    if (A[i] > A[i-1] and A[i] > A[i+1]) and A[i] > seuil:
        pos_peaks.append(i)
        amp_peaks.append(A[i])
        # GAUSSIAN RANGE
        for j in range(-range_gauss, range_gauss):
            # watch out for the array limits
            if i+j > 0 and i+j < len(T)-1:
                A_gauss.append(A[i+j])
                T_gauss.append(T[i+j])

npA_gauss = np.array(A_gauss)
npT_gauss = np.array(T_gauss)

for i in range(0, 7):
    new_A.append(npA_gauss[i])
    new_T.append(npT_gauss[i])

new_npA = np.array(new_A)
new_npT = np.array(new_T)

n = 2*range_gauss
mean = sum(new_npT*new_npA)/n
sigma = sum(new_npA*(new_npT-mean)**2)/n
popt, pcov = curve_fit(gauss, new_npT, new_npA, p0=[1, mean, sigma])

plt.plot(T, A, 'b+:', label='data')
plt.plot(new_npT, gauss(new_npT, *popt), 'ro:', label='Fit')
print("new_npA : ", new_npA)
print("new_npT : ", new_npT)
plt.legend()
plt.title('Fit')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
My arrays new_npT and new_npA are NumPy arrays that look like this:
new_npA : [ 264. 478. 733. 1402. 1337. 698. 320.]
new_npT : [229.609344 231.619385 233.62944 235.639496 237.649536 239.659592
241.669647]
This is the result. I don't understand why I can't successfully plot the Gaussian curves. Any explanations?
UPDATE: I can now fit Gaussian curves to my data.
I still can't understand how Jannick found the p0 for the curve fit, but it works.
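For anyone wondering about the p0 choice: a common heuristic (I cannot confirm it is exactly what Jannick did, but it produces similar values) is to seed curve_fit with moment estimates computed from the windowed data itself. Note that my original code divided by n = 2*range_gauss instead of the total weight sum(new_npA), and never took a square root for sigma, which is why its p0 landed nowhere near the data. A minimal sketch:

# moment-based initial guesses, assuming new_npT/new_npA are the window arrays above
a0 = new_npA.max()                                       # peak amplitude
mean0 = np.sum(new_npT*new_npA)/np.sum(new_npA)          # intensity-weighted centre
sigma0 = np.sqrt(np.sum(new_npA*(new_npT - mean0)**2)/np.sum(new_npA))  # weighted width
popt, pcov = curve_fit(gauss, new_npT, new_npA, p0=[a0, mean0, sigma0])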
I created a 3-dimensional array holding the positions and amplitudes of the peaks and used a while loop for the range_gauss window. I then used scipy's curve_fit properly with my 3D array and corrected the amplitudes with a coefficient f.
import csv
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

seuil = 1000   # threshold; should really be computed from the noise level etc.
range_gauss = 4
A = []
T = []
pos_peaks = []
amp_peaks = []
indices_peaks = []
tab_popt = []
l = []
gauss_result = []
tab_w = []

def gauss1(x, a, x0, sigma):
    return a*np.exp(-(x-x0)**2/(2*sigma**2))

def gauss2(x, a, x0, sigma):
    # note: operator precedence makes the prefactor (a/sigma)*sqrt(2*pi),
    # which is why the amplitude correction factor f is needed below
    return (a/sigma*np.sqrt(2*np.pi))*np.exp(-0.5*((x-x0)/sigma)**2)

# READ THE FILE AND FILL THE ARRAYS WITH ALL THE VALUES
with open("classeur_test.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        A.append(float(row[0]))
        T.append(float(row[1]))

# PEAK DETECTION (stop at len(T)-1 so A[i+1] stays in range)
for i in range(1, len(T)-1):
    if (A[i] > A[i-1] and A[i] > A[i+1]) and A[i] > seuil:
        pos_peaks.append(T[i])
        amp_peaks.append(A[i])
        indices_peaks.append(i)

# 3D ARRAY HOLDING THE AMPLITUDES AND TIMES AROUND EACH PEAK
Tableau = np.zeros((len(pos_peaks), 2, 2*range_gauss+1))

# FOR EACH PEAK
m = 0
j = -range_gauss
for i in range(0, len(pos_peaks)):
    while j < range_gauss+1:
        # WINDOW EXTRACTION & BOUNDARY CHECKS
        if (pos_peaks[i]+j >= 0 and pos_peaks[i]+j <= T[len(T)-1]
                and m <= 2*range_gauss+1 and indices_peaks[i]+j >= 0):
            Tableau[i, 0, m] = A[indices_peaks[i]+j]
            Tableau[i, 1, m] = T[indices_peaks[i]+j]
            m = m+1
            j = j+1
        else:
            j = j+1
            print("else")
            print("1 : ", pos_peaks[i]+j, ", m : ", m, " , indices_peaks[i]+j : ", indices_peaks[i]+j)
    m = 0
    j = -range_gauss

    popt, pcov = curve_fit(gauss2, Tableau[i, 1, :], Tableau[i, 0, :], p0=[1400, 240, 10])
    tab_popt.append(popt)
    l.append(np.linspace(T[indices_peaks[i]-range_gauss], T[indices_peaks[i]+range_gauss], 50))
    gauss_result.append(gauss2(l[i], 1, tab_popt[i][1], tab_popt[i][2]))
    # correct the amplitude with the coefficient f
    f = amp_peaks[i]/max(gauss_result[i])
    gauss_result[i] = gauss_result[i]*f
    # FULL WIDTH AT HALF MAXIMUM
    w = 2*np.sqrt(2*np.log(2))*tab_popt[i][2]
    tab_w.append(w)

    # PLOTS
    plt.subplot(2, 1, 1)
    plt.plot(T, A, label='data')
    plt.axis([T[0]-5, T[len(T)-1]-10, 0, max(A)+200])
    #plt.plot(Tableau[i,1,:], gauss2(Tableau[i,1,:], *popt), 'ro:', label='fit')
    plt.subplot(2, 1, 2)
    plt.plot(l[i], gauss_result[i])
    plt.axis([T[0]-5, T[len(T)-1]-10, 0, max(A)+200])

'''INFLECTION POINT TEST
for j in range(0, len(A)-1):
    inflex_points.append((np.diff(np.diff(A[j], n=2), n=2)))
    print(inflex_points[j])
    for k in range(0, len(inflex_points[j])-1):
        if (inflex_points[j][k] < 1 and inflex_points[j][k] > -1):
            print("j : ", j)'''

'''GRADIENT TEST (found online)
plt.plot(np.gradient(gauss_result[0]), '+')
spl = UnivariateSpline(np.arange(len(gauss_result[0])), np.gradient(gauss_result[0]), k=5)
spl.set_smoothing_factor(1000)
plt.plot(spl(np.arange(len(gauss_result[0]))), label='Smooth Fct 1e3')
spl.set_smoothing_factor(10000)
plt.plot(spl(np.arange(len(gauss_result[0]))), label='Smooth Fct 1e4')
plt.legend(loc='lower left')
max_idx = np.argmax(spl(np.arange(len(gauss_result[0]))))
plt.vlines(max_idx, -5, 9, linewidth=5, alpha=0.3)
'''

plt.show()
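One more remark on gauss2: because of Python's operator precedence, (a/sigma*np.sqrt(2*np.pi)) evaluates to (a/sigma)*sqrt(2*pi), so the prefactor multiplies by sqrt(2*pi) instead of dividing by it. That inflates every fitted curve by a factor of 2*pi, which is presumably why the amplitude correction coefficient f is needed at all. A properly normalised version would be:

def gauss2(x, a, x0, sigma):
    # unit-area Gaussian scaled by a; peak height is a/(sigma*sqrt(2*pi))
    return a/(sigma*np.sqrt(2*np.pi))*np.exp(-0.5*((x-x0)/sigma)**2)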
I am trying to fit Gaussians to a given dataset. Here is an example dataset. I would like to find two reasonable Gaussians that fit it, so I wrote the following code using GMM.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from sklearn.mixture import GaussianMixture
from scipy.stats import norm

def main():
    x = np.arange(-28, 28, 2)
    y = np.array([0,1,2,3,4,5,5,5,4,3,1,1,0,0,0,0,0,0,1,2,3,3,3,2,1,0,0,1])
    plt.plot(x, y, 'ro')
    plt.savefig("tmp.png")
    plt.clf()
    data = np.stack((x, y), axis=-1)
    print(data)
    gmm = GaussianMixture(
        n_components=2,
        covariance_type='spherical',
        init_params='random',
    ).fit(data.reshape(-1, 1))
    weights = gmm.weights_
    means = gmm.means_
    cov = gmm.covariances_
    print(weights)
    print(means)
    print(cov)
    gd0 = weights[0] * norm.pdf(x, means[0][0], np.sqrt(cov[0]))
    gd1 = weights[1] * norm.pdf(x, means[1][0], np.sqrt(cov[1]))
    plt.plot(x, gd0)
    plt.plot(x, gd1)
    plt.savefig("tmp2.png")

if __name__ == "__main__":
    main()
Then I get the following result, but it seems strange: I expected the peaks of the Gaussians to appear around -15 and +15. Where did I make a mistake?
weights
[0.52290556 0.47709444]
means
[[-0.78959748]
[ 1.68885233]]
cov
[250.29609772 3.07633333]
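The likely mistake is that GaussianMixture expects raw samples, not an (x, y) curve: data.reshape(-1, 1) flattens the stacked pairs into 56 one-dimensional "observations" that mix the x positions with the y counts, which is exactly why the fitted means land near -0.8 and 1.7. If y is meant as a histogram of counts over x, one workaround (a sketch, assuming integer counts) is to expand the histogram into samples before fitting:

import numpy as np
from sklearn.mixture import GaussianMixture

x = np.arange(-28, 28, 2)
y = np.array([0,1,2,3,4,5,5,5,4,3,1,1,0,0,0,0,0,0,1,2,3,3,3,2,1,0,0,1])

samples = np.repeat(x, y).reshape(-1, 1)  # each x repeated y times
gmm = GaussianMixture(n_components=2, covariance_type='spherical').fit(samples)
print(gmm.means_)  # should land near the two bumps, around -15 and +15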
I'm trying to build a correlation circle. Basically, it shows to what extent each original variable is correlated with the principal components (dimensions) of a dataset. Something like this:

Here is my code:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# load the data
X = pd.read_excel("mortalitePaysUE.xlsx", sheet_name=0, header=0, index_col=0)

# number of observations
n = X.shape[0]

# number of variables
p = X.shape[0]
print(p)

# transformation - centring/scaling
sc = StandardScaler()
Z = sc.fit_transform(X)
print(Z)
print("-------------")

# mean
print("Mean : ")
print(np.mean(X, axis=0))
print("-------------")

# standard deviation
print("Standard deviation : ")
print(np.std(X, axis=1, ddof=0))
print("-------------")

# PCA
acp = PCA(svd_solver='full')
coord = acp.fit_transform(Z)
eigval = (n-1)/n*acp.explained_variance_
print(eigval)

# scree plot
#plt.plot(np.arange(1, p+1), eigval)
#plt.title("Deaths in 1990 by gender")
#plt.xlabel("Factor number")
#plt.ylabel("Eigenvalue")
#plt.show()

# position the individuals in the first factorial plane
fig, axes = plt.subplots(figsize=(12, 12))
axes.set_xlim(-6, 6)  # same limits on the x-axis
axes.set_ylim(-6, 6)  # and on the y-axis

# label the observations
for i in range(n):
    plt.annotate(X.index[i], (coord[i, 0], coord[i, 1]))

# add the axes
plt.plot([-6, 6], [0, 0], color='silver', linestyle='-', linewidth=1)
plt.plot([0, 0], [-6, 6], color='silver', linestyle='-', linewidth=1)

# display
plt.show()

# square root of the eigenvalues
sqrt_eigval = np.sqrt(eigval)

# correlation of the variables with the axes
corvar = np.zeros((p, p))
for k in range(p):
    corvar[:, k] = acp.components_[k, :] * sqrt_eigval[k]

# print the variables x factors correlation matrix
#print(corvar)

# correlation circle
fig, axes = plt.subplots(figsize=(8, 8))
axes.set_xlim(-1, 1)
axes.set_ylim(-1, 1)

# display the variable names
for j in range(p):
    plt.annotate(X.columns[j], (corvar[j, 0], corvar[j, 1]))

# add the axes
plt.plot([-1, 1], [0, 0], color='silver', linestyle='-', linewidth=1)
plt.plot([0, 0], [-1, 1], color='silver', linestyle='-', linewidth=1)

# add a circle
cercle = plt.Circle((0, 0), 1, color='blue', fill=False)
axes.add_artist(cercle)
The problem is that I get an error and can't display the circle, and I can't work out how to resolve it:

corvar[:,k] = acp.components_[k,:] * sqrt_eigval[k]
ValueError: could not broadcast input array from shape (3) into shape (28)

Can anyone help me fix this please? :) Thanks in advance!
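The broadcast error points at the shape bookkeeping: despite the comment, p = X.shape[0] takes the number of rows (28 observations) rather than the number of columns (3 variables), so corvar is allocated as 28x28 while each acp.components_[k,:] row has length 3. A sketch of the likely fix, assuming the dataset really has 3 variable columns:

# number of variables = number of columns, not rows
p = X.shape[1]

sqrt_eigval = np.sqrt(eigval)
corvar = np.zeros((p, p))
for k in range(p):
    corvar[:, k] = acp.components_[k, :] * sqrt_eigval[k]

A plt.show() after adding the circle may also be needed, depending on how the script is run.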
I have this equation that I want to graph:

C(T) = γT + αT^3

I realize that if I factor out T it becomes C(T) = T(γ + αT^2), similar to y = x(b + ax^2). I had written most of this code for another data set and got this far, but the graph I get doesn't seem right. Any suggestions?
import numpy as np
import matplotlib.pyplot as plt

a = np.loadtxt("CvsT.txt", skiprows=1)
T = a[:, 0]
C = a[:, 1]
size = a.shape
rows = size[0]
xsum = 0
x2sum = 0
ysum = 0
y2sum = 0
xysum = 0
for n in range(rows):
    xsum = xsum + T[n]
    x2sum = x2sum + pow(T[n], 2)
    ysum = ysum + C[n]
    y2sum = y2sum + pow(C[n], 2)
    xysum = xysum + T[n]*C[n]
xsum = xsum/rows
print(xsum)
x2sum = x2sum/rows
print(x2sum)
ysum = ysum/rows
print(ysum)
y2sum = y2sum/rows
print(y2sum)
xysum = xysum/rows
print(xysum)
a = (xysum - xsum*ysum)/(x2sum - xsum**2)
b = (x2sum*ysum - xsum*xysum)/(x2sum - xsum**2)
print("\nThe regression parameters are: a = ", a, "and b = ", b)
TModel = np.linspace(0.20, 0.6, 100)
CModel = TModel*(a*TModel + b)
plt.plot(TModel, CModel, color="red", linestyle="dashed")
plt.plot(T, C, markersize=10, color="green", marker="o", linestyle="-")
plt.xlabel("T (K)", fontweight="bold", fontsize="large")
plt.ylabel("C (mJ/mol K)", fontweight="bold", fontsize="large", color="green")
===========================================================================
T [K]          C [mJ/(mol K)]
0.244948974 0.538887743
0.264575131 0.595294045
0.316227766 0.727323862
0.331662479 0.769456951
0.360555128 0.865332306
0.374165739 0.927931032
0.424264069 1.107329219
0.447213595 1.185116028
0.479583152 1.294874511
0.509901951 1.417527425
0.547722558 1.588395417
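Since C(T) = γT + αT^3 implies C/T = γ + αT^2, the model is a straight line in T^2; the code above instead fits a straight line to C versus T and then plots TModel*(a*TModel + b) = a*T^2 + b*T, which matches neither the fitted line nor the physical model. A sketch of the linearised fit, assuming CvsT.txt holds the table above:

import numpy as np
import matplotlib.pyplot as plt

data = np.loadtxt("CvsT.txt", skiprows=1)
T, C = data[:, 0], data[:, 1]

# C/T = gamma + alpha*T**2, so a degree-1 fit in T**2 gives both parameters
alpha, gamma = np.polyfit(T**2, C/T, 1)  # slope = alpha, intercept = gamma
print("gamma =", gamma, "alpha =", alpha)

TModel = np.linspace(0.20, 0.6, 100)
plt.plot(TModel, gamma*TModel + alpha*TModel**3, color="red", linestyle="dashed")
plt.plot(T, C, markersize=10, color="green", marker="o", linestyle="-")
plt.xlabel("T (K)", fontweight="bold", fontsize="large")
plt.ylabel("C (mJ/mol K)", fontweight="bold", fontsize="large", color="green")
plt.show()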
There is an equation for an exponentially truncated power law in the article below:

Gonzalez, M. C., Hidalgo, C. A., & Barabasi, A. L. (2008). Understanding individual human mobility patterns. Nature, 453(7196), 779-782.

It looks like this:

P(rg) = (rg + rg0)**(-beta) * exp(-rg/K)

There are three parameters to be estimated: rg0, beta and K. We have the radius of gyration (rg) of several users, uploaded to GitHub: radius of gyrations.txt
The following code reads the data and calculates P(rg):
import numpy as np

# read radius of gyration from file
rg = []
with open('/path-to-the-data/radius of gyrations.txt', 'r') as f:
    for i in f:
        rg.append(float(i.strip('\n')))

# calculate P(rg)
rg = sorted(rg, reverse=True)
rg = np.array(rg)
prg = np.arange(len(rg)) / float(len(rg) - 1)
or you can use the rg and prg data directly:
rg = np.array([ 20.7863444 , 9.40547933, 8.70934714, 8.62690145,
7.16978087, 7.02575052, 6.45280959, 6.44755478,
5.16630287, 5.16092884, 5.15618737, 5.05610068,
4.87023561, 4.66753197, 4.41807645, 4.2635671 ,
3.54454372, 2.7087178 , 2.39016885, 1.9483156 ,
1.78393238, 1.75432688, 1.12789787, 1.02098332,
0.92653501, 0.32586582, 0.1514813 , 0.09722761,
0. , 0. ])
prg = np.array([ 0. , 0.03448276, 0.06896552, 0.10344828, 0.13793103,
0.17241379, 0.20689655, 0.24137931, 0.27586207, 0.31034483,
0.34482759, 0.37931034, 0.4137931 , 0.44827586, 0.48275862,
0.51724138, 0.55172414, 0.5862069 , 0.62068966, 0.65517241,
0.68965517, 0.72413793, 0.75862069, 0.79310345, 0.82758621,
0.86206897, 0.89655172, 0.93103448, 0.96551724, 1. ])
I can plot P(r_g) against r_g using the following Python script:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(rg, prg, 'bs', alpha = 0.3)
# roughly estimated params:
# rg0=1.8, beta=0.15, K=5
plt.plot(rg, (rg+1.8)**-.15*np.exp(-rg/5))
plt.yscale('log')
plt.xscale('log')
plt.xlabel('$r_g$', fontsize = 20)
plt.ylabel('$P(r_g)$', fontsize = 20)
plt.show()
How can I use these data of rgs to estimate the three parameters above? I hope to solve it using python.
Following @Michael's suggestion, we can solve the problem with scipy.optimize.curve_fit:

from scipy import optimize

def func(rg, rg0, beta, K):
    return (rg + rg0) ** (-beta) * np.exp(-rg / K)

popt, pcov = optimize.curve_fit(func, rg, prg, p0=[1.8, 0.15, 5])
print(popt)
print(pcov)
The results are given below:
[ 1.04303608e+03 3.02058550e-03 4.85784945e+00]
[[ 1.38243336e+18 -6.14278286e+11 -1.14784675e+11]
[ -6.14278286e+11 2.72951900e+05 5.10040746e+04]
[ -1.14784675e+11 5.10040746e+04 9.53072925e+03]]
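The huge diagonal entries in pcov (variances of order 1e18) are a warning that the optimiser wandered into a poorly constrained region rather than refining the initial guess. One thing that often stabilises such fits (a sketch; these bounds are my assumption, not values from the paper) is to constrain the parameters to a plausible box:

popt, pcov = optimize.curve_fit(
    func, rg, prg, p0=[1.8, 0.15, 5],
    bounds=([0, 0, 0], [100, 5, 100]))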
Then we can inspect the results by plotting the fitted curve.
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(rg, prg, 'bs', alpha = 0.3)
plt.plot(rg, (rg+popt[0])**-(popt[1])*np.exp(-rg/popt[2]) )
plt.yscale('log')
plt.xscale('log')
plt.xlabel('$r_g$', fontsize = 20)
plt.ylabel('$P(r_g)$', fontsize = 20)
plt.show()
I am trying to translate the R implementations of the gap statistic and prediction strength (http://edchedch.wordpress.com/2011/03/19/counting-clusters/) into Python scripts to estimate the number of clusters in the iris data, which has 3 clusters. Instead of 3, I get different results on different runs, and the actual number (3) is hardly ever estimated. The graph shows the estimated number to be 10 instead of 3. Am I missing something? Can anyone help me locate the problem?
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def dispersion(data, k):
    if k == 1:
        cluster_mean = np.mean(data, axis=0)
        distances_from_mean = np.sum((data - cluster_mean)**2, axis=1)
        dispersion_val = np.log(sum(distances_from_mean))
    else:
        k_means_model_ = KMeans(n_clusters=k, max_iter=50, n_init=5).fit(data)
        distances_from_mean = [0]*k   # a plain list; range() is not assignable in Python 3
        for i in range(k):
            for idx, label in enumerate(k_means_model_.labels_):
                if i == label:
                    distances_from_mean[i] += sum((data[idx] - k_means_model_.cluster_centers_[i])**2)
        dispersion_val = np.log(sum(distances_from_mean))
    return dispersion_val

def reference_dispersion(data, num_clusters, num_reference_bootstraps):
    dispersions = [dispersion(generate_uniform_points(data), num_clusters) for i in range(num_reference_bootstraps)]
    mean_dispersion = np.mean(dispersions)
    # computed but unused below; the original gap-statistic rule needs it
    stddev_dispersion = float(np.std(dispersions)) / np.sqrt(1. + 1. / num_reference_bootstraps)
    return mean_dispersion

def generate_uniform_points(data):
    mins = np.argmin(data, axis=0)
    maxs = np.argmax(data, axis=0)
    num_dimensions = data.shape[1]
    num_datapoints = data.shape[0]
    reference_data_set = np.zeros((num_datapoints, num_dimensions))
    for i in range(num_datapoints):
        for j in range(num_dimensions):
            reference_data_set[i][j] = random.uniform(data[mins[j]][j], data[maxs[j]][j])
    return reference_data_set

def gap_statistic(data, nthCluster, num_reference_bootstraps):
    actual_dispersion = dispersion(data, nthCluster)
    ref_dispersion = reference_dispersion(data, nthCluster, num_reference_bootstraps)
    return actual_dispersion, ref_dispersion

if __name__ == "__main__":
    data = np.loadtxt('iris.mat', delimiter=',', dtype=float)
    maxClusters = 10
    num_reference_bootstraps = 10
    dispersion_values = np.zeros((maxClusters, 2))
    for cluster in range(1, maxClusters+1):
        dispersion_values_actual, dispersion_values_reference = gap_statistic(data, cluster, num_reference_bootstraps)
        dispersion_values[cluster-1][0] = dispersion_values_actual
        dispersion_values[cluster-1][1] = dispersion_values_reference
    gaps = dispersion_values[:, 1] - dispersion_values[:, 0]
    print(gaps)
    print("The estimated number of clusters is ", np.argmax(gaps) + 1)
    plt.plot(range(len(gaps)), gaps)
    plt.show()
Your graph is actually showing the correct value of 3. Let me explain a bit.

As you increase the number of clusters, the distance metric will certainly keep decreasing, so if you simply chase the extreme of the curve you would conclude the answer is 10, and going beyond 10 would drive it down further. That should not be the decision criterion.

What we need to find instead is the inflection point (marked in red in the answer's figure), i.e. the point where the slope flattens out. You might want to take a look at elbow curves.

Based on those two points, the inflection point here is 3, which is also the correct solution.
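For completeness: the original gap-statistic rule in Tibshirani et al. does not take the argmax of the gap curve. It picks the smallest k such that Gap(k) >= Gap(k+1) - s_{k+1}, where s_k is the spread of the reference dispersions (the stddev_dispersion that the question's code computes but never returns). A minimal sketch, assuming gaps[k-1] holds Gap(k) and sd[k-1] holds s_k:

def choose_k(gaps, sd):
    # smallest 1-based k with Gap(k) >= Gap(k+1) - s_(k+1)
    for k in range(1, len(gaps)):
        if gaps[k-1] >= gaps[k] - sd[k]:
            return k
    return len(gaps)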
Hope this helps
You could take a look at this code and change your output plot format:
# coding: utf-8
# K-means clustering implementation in Python

# load the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets

# load the Iris dataset
iris = datasets.load_iris()

# import the Iris dataset with pandas
x = pd.DataFrame(iris.data)
x.columns = ['Sepal_Length', 'Sepal_width', 'Petal_Length', 'Petal_width']
y = pd.DataFrame(iris.target)
y.columns = ['Targets']

# create a K-Means object grouping into 3 clusters
model = KMeans(n_clusters=3)
# fit the model on the Iris data
model.fit(x)

# visualise the data
plt.scatter(x.Petal_Length, x.Petal_width)
plt.show()

colormap = np.array(['Red', 'green', 'blue'])

# plot the unaltered dataset (flowers coloured by their true labels)
plt.scatter(x.Petal_Length, x.Petal_width, c=colormap[y.Targets], s=40)
plt.title('Actual classification')
plt.show()

# plot the clusters found by K-Means
plt.scatter(x.Petal_Length, x.Petal_width, c=colormap[model.labels_], s=40)
plt.title('K-means classification')
plt.show()