How can I do a histogram with 1D gaussian mixture with sklearn? - python

I would like to plot a histogram overlaid with a 1D Gaussian mixture, as in the picture.
Thanks Meng for the picture.
My histogram is this:
I have a file with a lot of data (4,000,000 of numbers) in a column:
1.727182
1.645300
1.619943
1.709263
1.614427
1.522313
I'm using the following script, with the modifications that Meng and Justice Lord suggested:
from matplotlib import rc
from sklearn import mixture
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import matplotlib.ticker as tkr
import scipy.stats as stats
# Read one number per line; the context manager closes the file as soon as
# the contents have been read.
with open("prueba.dat") as data_file:
    x = data_file.read().splitlines()
# `np.float` was only an alias of the builtin `float` and has been removed
# from NumPy, so the builtin is used directly.
f = np.ravel(x).astype(float)
# Reshape to a single-column 2-D array before fitting.
f = f.reshape(-1, 1)
g = mixture.GaussianMixture(n_components=3, covariance_type='full')
g.fit(f)
weights = g.weights_
means = g.means_
covars = g.covariances_
plt.hist(f, bins=100, histtype='bar', density=True, ec='red', alpha=0.5)
# Only the first fitted component is drawn here, and it is drawn against the
# samples in file order (unsorted) -- this is the plot the question is about.
plt.plot(f, weights[0]*stats.norm.pdf(f, means[0], np.sqrt(covars[0])), c='red')
plt.rcParams['agg.path.chunksize'] = 10000
plt.grid()
plt.show()
When I run the script, I get the following plot:
So, I have no idea how to draw, from start to end, all the Gaussians that should be there. I'm new to Python and I'm confused about how to use the modules. Can you please help me and guide me on how to make this plot?
Thanks a lot

Although this is a reasonably old thread, I would like to provide my take on it. I believe my answer can be more comprehensible to some. Moreover, I include a test to check whether or not the desired number of components makes statistical sense via the BIC criterion.
# import libraries (some are for cosmetics)
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator)
import astropy
from scipy.stats import norm
from sklearn.mixture import GaussianMixture as GMM
import matplotlib as mpl
mpl.rcParams['axes.linewidth'] = 1.5
mpl.rcParams.update({'font.size': 15, 'font.family': 'STIXGeneral', 'mathtext.fontset': 'stix'})
# create the data as in #Meng's answer
x = np.concatenate((np.random.normal(5, 5, 1000), np.random.normal(10, 2, 1000)))
x = x.reshape(-1, 1)
# first of all, let's confirm the optimal number of components:
# fit one model per candidate size (1..10) and record its BIC.
component_range = range(1, 11)
bics = []
for counter in component_range:
    gmm = GMM(n_components=counter, max_iter=1000, random_state=0, covariance_type='full')
    bics.append(gmm.fit(x).bic(x))
# Take the minimum explicitly instead of tracking it with a `0` sentinel,
# which would misbehave if a BIC value were ever exactly zero.
# `list.index` returns the first minimum, matching a strict `<` comparison.
min_bic = min(bics)
opt_bic = component_range[bics.index(min_bic)]
# Left-hand panel: information criterion versus number of components.
fig = plt.figure(figsize=(10, 4))
ax = fig.add_subplot(1, 2, 1)
# Plot 1
component_counts = np.arange(1, 11)
ax.plot(component_counts, bics, 'o-', lw=3, c='black', label='BIC')
ax.legend(frameon=False, fontsize=15)
ax.set_xlabel('Number of components', fontsize=20)
ax.set_ylabel('Information criterion', fontsize=20)
ax.set_xticks(np.arange(0, 11, 2))
ax.set_title('Opt. components = {}'.format(opt_bic), fontsize=20)
# Use whatever number of components the BIC scan selected; nothing below
# assumes that it is exactly two.
n_optimal = opt_bic
# create GMM model object and fit it once (each extra fit() call would
# re-run the whole estimation just to read one attribute)
gmm = GMM(n_components=n_optimal, max_iter=1000, random_state=10, covariance_type='full')
gmm.fit(x)
# find useful parameters
mean = gmm.means_
covs = gmm.covariances_
weights = gmm.weights_
# create necessary things to plot: one weighted Gaussian per component,
# using the single feature's mean and variance (index 0 on the feature axes)
x_axis = np.arange(-20, 30, 0.1)
component_curves = [
    weights[k] * norm.pdf(x_axis, float(mean[k][0]), np.sqrt(float(covs[k][0][0])))
    for k in range(n_optimal)
]
# The mixture density is the sum of the weighted component densities.
mixture_curve = np.sum(component_curves, axis=0)
ax = fig.add_subplot(1, 2, 2)
# Plot 2
plt.hist(x, density=True, color='black', bins=np.arange(-100, 100, 1))
for k, curve in enumerate(component_curves):
    # 'C0', 'C1', ... follow the colour cycle; wrap explicitly at 10.
    plt.plot(x_axis, curve, lw=3, c='C{}'.format(k % 10))
plt.plot(x_axis, mixture_curve, lw=3, c='C{}'.format(n_optimal % 10), ls='dashed')
plt.xlim(-10, 20)
#plt.ylim(0.0, 2.0)
plt.xlabel(r"X", fontsize=20)
plt.ylabel(r"Density", fontsize=20)
plt.subplots_adjust(wspace=0.3)
plt.show()
plt.close('all')

It's all about reshape.
First, you need to reshape f.
For pdf, reshape before using stats.norm.pdf. Similarly, sort and reshape before plotting.
from matplotlib import rc
from sklearn import mixture
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import matplotlib.ticker as tkr
import scipy.stats as stats
# x = open("prueba.dat").read().splitlines()
# create the data
x = np.concatenate((np.random.normal(5, 5, 1000), np.random.normal(10, 2, 1000)))
# `np.float` has been removed from NumPy; the builtin `float` is equivalent.
f = np.ravel(x).astype(float)
f = f.reshape(-1, 1)
g = mixture.GaussianMixture(n_components=3, covariance_type='full')
g.fit(f)
weights = g.weights_
means = g.means_
covars = g.covariances_
plt.hist(f, bins=100, histtype='bar', density=True, ec='red', alpha=0.5)
# Sorted 1-D copy of the samples, so each curve is drawn left to right.
f_axis = np.sort(f.ravel())
# Draw every fitted component, not just the first one.
for k in range(g.n_components):
    component = weights[k] * stats.norm.pdf(f_axis, means[k, 0], np.sqrt(covars[k, 0, 0]))
    plt.plot(f_axis, component, c='red')
plt.rcParams['agg.path.chunksize'] = 10000
plt.grid()
plt.show()

Related

Plotting a gaussian fit to a histgram with displot or histplot

I've decided to give seaborn version 0.11.0 a go! Playing around with the displot function, which will replace distplot, as I understand it. I'm just trying to figure out how to plot a gaussian fit on to a histogram. Here's some example code.
import seaborn as sns
import numpy as np
# 500 standard-normal draws scaled by 0.1; used as sample data below.
x = np.random.normal(size=500) * 0.1
With distplot I could do:
from scipy.stats import norm  # `fit=norm` below needs this name in scope
sns.distplot(x, kde=False, fit=norm)
But how to go about it in displot or histplot?
So far the closest I've come to is:
# Histogram with stat="probability" plus a KDE overlay; the bandwidth
# adjustment is passed to the KDE through `kde_kws`.
sns.histplot(x,stat="probability", bins=30, kde=True, kde_kws={"bw_adjust":3})
But I think this just increases the smoothening of the plotted kde, which isn't exactly what I'm going for.
I really miss the fit parameter too. It doesn't appear they replaced that functionality when they deprecated the distplot function. Until they plug that hole, I created a short function to add the normal distribution overlay to my histplot. I just paste the function at the top of a file along with the imports, and then I just have to add one line to add the overlay when I want it.
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
def normal(mean, std, color="black"):
    """Draw a normal density curve with pyplot's ``plt.plot``.

    The density is evaluated at 200 evenly spaced points covering
    ``mean - 4*std`` to ``mean + 4*std``.

    Parameters
    ----------
    mean : float
        Location of the normal distribution.
    std : float
        Scale (standard deviation) of the normal distribution.
    color : str
        Passed positionally to ``plt.plot`` as its third argument.

    Returns
    -------
    The value returned by ``plt.plot``, so the caller can restyle or
    label the curve afterwards.
    """
    x = np.linspace(mean - 4*std, mean + 4*std, 200)
    p = stats.norm.pdf(x, mean, std)
    return plt.plot(x, p, color, linewidth=2)


data = np.random.normal(size=500) * 0.1
ax = sns.histplot(x=data, stat="density")
normal(data.mean(), data.std())
If you would rather use stat="probability" instead of stat="density", you can normalize the fit curve with something like this:
def normal(mean, std, histmax=False, color="black"):
    """Draw a normal density curve with ``plt.plot``, optionally rescaled.

    The density is evaluated at 200 evenly spaced points covering
    ``mean - 4*std`` to ``mean + 4*std``.

    Parameters
    ----------
    mean : float
        Location of the normal distribution.
    std : float
        Scale (standard deviation) of the normal distribution.
    histmax : float or False
        When truthy, the curve is rescaled so that its peak equals this
        value; when falsy, the unscaled density is drawn.
    color : str
        Passed positionally to ``plt.plot`` as its third argument.

    Returns
    -------
    The value returned by ``plt.plot``.
    """
    x = np.linspace(mean - 4*std, mean + 4*std, 200)
    p = stats.norm.pdf(x, mean, std)
    if histmax:
        # Rescale so the highest point of the curve sits at `histmax`.
        p = p * histmax / p.max()
    return plt.plot(x, p, color, linewidth=2)


data = np.random.normal(size=500) * 0.1
ax = sns.histplot(x=data, stat="probability")
# NOTE(review): get_ylim()[1] is the top of the y-axis, which presumably lies
# slightly above the tallest bar because of the default axis margin --
# confirm whether the tallest bar's height is what should be passed instead.
normal(data.mean(), data.std(), histmax=ax.get_ylim()[1])
Sorry I am late to the party. Just check if this will meet your requirement.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
# Sample data and the normal parameters estimated from it.
samples = np.random.normal(size=500) * 0.1
loc, scale = norm.fit(samples)
# Density-normalised histogram of the samples.
plt.hist(samples, bins=25, density=True, alpha=0.6, color='g')
# Evaluate the fitted pdf across the x-range the histogram ended up with.
left, right = plt.xlim()
grid = np.linspace(left, right, 100)
plt.plot(grid, norm.pdf(grid, loc, scale), 'k', linewidth=2)
plt.show()

plot data as linear model in 3D with matplotlib

i want to plot some data like in the link below.
What can I do when I have more predictors than "TV" and "Radio" in the OLS formula, but only want to plot these two against "Sales"? If I do it as in the code below (from the link), I get an error saying that the other variables are not defined (all except TV and Radio).
thanks for help!
https://stackoverflow.com/a/26434204/14208684
Here is the code of the link:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
from matplotlib import cm
# Load the advertising data and fit Sales as a linear function of TV and Radio.
csv = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
model = sm.ols(formula='Sales ~ TV + Radio', data=csv)
fit = model.fit()
fit.summary()
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x_surf = np.arange(0, 350, 20)  # generate a mesh
y_surf = np.arange(0, 60, 4)
x_surf, y_surf = np.meshgrid(x_surf, y_surf)
# One row per mesh point, with column names matching the formula's terms.
exog = pd.DataFrame({'TV': x_surf.ravel(), 'Radio': y_surf.ravel()})
out = fit.predict(exog=exog)
# The prediction can come back as a pandas Series, which has no `.reshape`;
# convert to a plain ndarray before restoring the mesh shape.
ax.plot_surface(x_surf, y_surf,
                np.asarray(out).reshape(x_surf.shape),
                rstride=1,
                cstride=1,
                color='None',
                alpha=0.4)
ax.scatter(csv['TV'], csv['Radio'], csv['Sales'],
           c='blue',
           marker='o',
           alpha=1)
ax.set_xlabel('TV')
ax.set_ylabel('Radio')
ax.set_zlabel('Sales')
plt.show()

Trying to interpolate the output of a histogram function in Python

What I am trying to do is to play around with some random distribution. I don't want it to be normal. But for the time being normal is easier.
import matplotlib.pyplot as plt
from scipy.stats import norm
import numpy as np  # used below but missing from the imports of this snippet

ws = norm.rvs(4.0, 1.5, size=100)
# `normed` was a deprecated duplicate of `density` and is no longer accepted
# by np.histogram; `density=True` alone gives the normalised histogram.
density, bins = np.histogram(ws, 50, density=True)
# Rescale so the bar heights sum to one (per-bin probability mass).
unity_density = density / density.sum()
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(12, 6))
# Positive bin widths; with align='edge' each bar starts at its bin's left
# edge and therefore covers exactly its own bin.
widths = np.diff(bins)
ax1.bar(bins[:-1], unity_density, width=widths, align='edge')
ax2.bar(bins[:-1], unity_density.cumsum(), width=widths, align='edge')
fig.tight_layout()
Then what I can do it visualize CDF in terms of points.
density1 = unity_density.cumsum()
# cumsum()[i] is the mass accumulated up to the RIGHT edge of bin i, so the
# matching x positions are bins[1:], not the left edges.
x = bins[1:]
y = density1
plt.plot(x, y, 'o')
So what I have been trying to do is to use the np.interp function on the output of np.histogram in order to obtain a smooth curve representing the CDF and extracting the percent points to plot them. Ideally, I need to try to do it all both manually and using ppf function from scipy.
I have always struggled with statistics as an undergraduate. I am in grad school now and try to put me through as many exercises like this as possible in order to get a deeper understanding of what is happening. I've reached a point of desperation with this task.
Thank you!
One possibility to get smoother results is to use more samples, by using 10^5 samples and 100 bins I get the following images:
ws = norm.rvs(loc=4.0, scale=1.5, size=100000)
# `normed` is no longer accepted by np.histogram; `density=True` alone
# returns a histogram whose integral over the bins is one.
density, bins = np.histogram(ws, bins=100, density=True)
In general you could use scipys interpolation module to smooth your CDF.
For 100 samples and a smoothing factor of s=0.01 I get:
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import splev, splrep
density1 = unity_density.cumsum()
# cumsum()[i] is the mass accumulated up to the RIGHT edge of bin i, so the
# empirical CDF points belong at bins[1:].
x = bins[1:]
y = density1
# Interpolation: smoothing spline through the CDF points, evaluated on a
# finer grid spanning the same x-range.
spl = splrep(x, y, s=0.01, per=False)
x2 = np.linspace(x[0], x[-1], 200)
y2 = splev(x2, spl)
# Plotting
fig, ax = plt.subplots()
plt.plot(x, density1, 'o')
plt.plot(x2, y2, 'r-')
The third possibility is to calculate the CDF analytically. If you generate the noise yourself with a numpy / scipy function most of the time there is already an implementation of the CDF available, otherwise you should find it on Wikipedia. If your samples come from measurements that is of course a different story.
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
# Closed-form CDF of a normal with loc=4.0 and scale=1.5 on [-2, 10].
fig, ax = plt.subplots()
grid = np.linspace(-2, 10)
cdf_values = norm.cdf(grid, loc=4.0, scale=1.5)
ax.plot(grid, cdf_values, 'bo-')

PLS-DA Loading Plot in Python

How can I make a Loading plot with Matplotlib of a PLS-DA plot, like the loading plot like that of PCA?
This answer explains how it can be done with PCA:
Plot PCA loadings and loading in biplot in sklearn (like R's autoplot)
However there are some significant differences between the two methods which makes the implementation different as well. (Some of the relevant differences are explained here https://learnche.org/pid/latent-variable-modelling/projection-to-latent-structures/interpreting-pls-scores-and-loadings )
To make the PLS-DA plot I use the following code:
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
import numpy as np
import pandas as pd
# Fit a two-component PLS regression on standardised features against a 0/1
# class indicator, then scatter-plot the first two score columns.
# NOTE(review): `df`, `sample_description`, `title` and `get_default_fig_ax`
# are not defined in this snippet -- they must come from elsewhere.
targets = [0, 1]
x_vals = StandardScaler().fit_transform(df.values)
# y is 1 where the sample's label equals targets[0], otherwise 0.
y = [g == targets[0] for g in sample_description]
y = np.array(y, dtype=int)
plsr = PLSRegression(n_components=2, scale=False)
plsr.fit(x_vals, y)
# One plot colour per class label.
colormap = {
targets[0]: '#ff0000', # Red
targets[1]: '#0000ff', # Blue
}
colorlist = [colormap[c] for c in sample_description]
scores = pd.DataFrame(plsr.x_scores_)
# NOTE(review): `x` is not defined in this snippet -- presumably `df.index`
# was intended; confirm.
scores.index = x.index
x_loadings = plsr.x_loadings_
y_loadings = plsr.y_loadings_
fig1, ax = get_default_fig_ax('Scores on LV 1', 'Scores on LV 2', title)
ax = scores.plot(x=0, y=1, kind='scatter', s=50, alpha=0.7,
c=colorlist, ax=ax)
I took your code and enhanced it. The biplot is obtained via simply overlaying the score and the loading plot.
Other, more rigorous plots could be made with truly shared axes according to https://blogs.sas.com/content/iml/2019/11/06/what-are-biplots.html#:~:text=A%20biplot%20is%20an%20overlay,them%20on%20a%20single%20plot.
The code below generates this image for a dataset with ~200 features (therefore there are ~200 red arrows shown):
from sklearn.cross_decomposition import PLSRegression
# Fit a two-component PLS model and keep its loadings.
pls2 = PLSRegression(n_components=2)
pls2.fit(X_train, Y_train)
x_loadings = pls2.x_loadings_
y_loadings = pls2.y_loadings_
# Score plot: first two score columns, coloured by the first Y column.
fig, ax = plt.subplots(constrained_layout=True)
scores = pd.DataFrame(pls2.x_scores_)
scores.plot(x=0, y=1, kind='scatter', s=50, alpha=0.7,
            c=Y_train.values[:, 0], ax=ax)
# Loading plot: a frameless axes placed at the same position as the score
# axes, with one arrow from the origin per feature.
newax = fig.add_axes(ax.get_position(), frameon=False)
print(x_loadings.shape)
for loading_row in x_loadings:
    # Columns 0 and 1 hold the loadings on components 1 and 2.
    newax.arrow(0, 0, loading_row[0], loading_row[1], color='r', alpha=0.5)
newax.get_xaxis().set_visible(False)
newax.get_yaxis().set_visible(False)
plt.show()

Python - Matplotlib: normalize axis when plotting a Probability Density Function

I'm using Python and some of its extensions to compute and plot a probability density function. While I manage to plot its shape, at least, I have not succeeded in scaling the axes.
import decimal
import numpy as np
import scipy.stats as stats
import pylab as pl
import matplotlib.pyplot as plt
from decimal import *
from scipy.stats import norm
# Plot a normal pdf parameterised by the sample mean/std of `lines`, together
# with a histogram of the same values.
lines=[]
fig, ax = plt.subplots(1, 1)
# Moments of the standard normal; none of these four names is used again in
# this snippet.
mean, var, skew, kurt = norm.stats(moments='mvsk')
#Here I delete some lines aimed to fill the list with values
Long = len(lines)
Maxim = max(lines) #MaxValue
Minim = min(lines) #MinValue
av = np.mean(lines) #Average
StDev = np.std(lines) #Standard Dev.
# One x position per data point, spanning the observed range.
x = np.linspace(Minim, Maxim, Long)
ax.plot(x, norm.pdf(x, av, StDev),'r-', lw=3, alpha=0.9, label='norm pdf')
# Equal weight 1/len(lines) for every sample.
weights = np.ones_like(lines)/len(lines)
# NOTE(review): `normed` was removed from Axes.hist in newer Matplotlib
# releases in favour of `density` -- confirm against the installed version.
ax.hist(lines, weights = weights, normed=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
The result is
While I would like to have it expressed
- In the x-axis centered in 0 and related to the standard deviation
- In the y-axis, related to the histogram and the %s (normalized to 1)
For the x-axis as the image below
And like this last image for the y-axis
I've managed to scale the y-axis of a histogram when plotting it on its own, by passing weights = weights, but I can't do it here. I include it in the code, but it actually does nothing in this case.
Any help would be appreciated
The y-axis is normalized in such a way that the area under the curve is one.
Adding equal weights for every data point makes no sense if you normalize anyway with normed=True.
First you need to shift your data to 0:
# `lines` is a plain list in the question, so convert it to an array before
# subtracting; the mean comes from NumPy (a bare `mean` name is not defined).
lines = np.asarray(lines, dtype=float) - np.mean(lines)
then plot it.
This should be a working minimal example:
import numpy as np
from numpy.random import normal
import matplotlib.pyplot as plt
from scipy.stats import norm
# gaussian distributed random numbers with mu =4 and sigma=2
x = normal(4, 2, 10000)
mean = np.mean(x)
sigma = np.std(x)
# Centre the sample on zero so the x-axis can be labelled in units of sigma.
x -= mean
x_plot = np.linspace(x.min(), x.max(), 1000)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# `density=True` replaces the removed `normed=True`: bar areas sum to one.
ax.hist(x, bins=50, density=True, label="data")
# The data were shifted by `mean` above, so the matching pdf is centred at 0
# (using `mean` here would draw the curve offset from the histogram).
ax.plot(x_plot, norm.pdf(x_plot, 0, sigma), 'r-', label="pdf")
ax.legend(loc='best')
# Ticks at -4 sigma ... +4 sigma; integer multiples give exactly nine ticks,
# matching the nine labels below.
x_ticks = sigma * np.arange(-4, 5)
x_labels = [r"${} \sigma$".format(i) for i in range(-4, 5)]
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_labels)
plt.show()
output image is this:
Also, you have too many imports.
You import decimal twice, once even with *,
and numpy, pyplot and scipy are already included in pylab. Also, why import the whole of scipy.stats and then import just norm from it again?

Categories