I have generated some data points as a linear mixture of three 1D bell-shaped Gaussian distributions with different parameters (mean and variance), using the following code:
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
sns.set_style("darkgrid")
%matplotlib inline
from sklearn.mixture import GaussianMixture
x = np.linspace(start=-40,stop=40, num=1000)
y1 = stats.norm.pdf(x, loc=1,scale=1.5) # First Gaussian distribution
y2 = stats.norm.pdf(x, loc=5,scale=2.5) # Second Gaussian distribution
y3 = stats.norm.pdf(x, loc=-15,scale= 10) # Third Gaussian distribution
Y = y1+y2+y3
fig = plt.figure(figsize=(7, 5),dpi=300)
plt.plot(x,y1,lw=2,label='First component')
plt.plot(x,y2,lw=2,label='Second component')
plt.plot(x,y3,lw=2,label='Third component')
plt.plot(x,Y,lw=3,label= 'Linear Mixture')
plt.legend(loc='best',facecolor="white")
plt.show()
I tried to decompose these 3 peaks in a reverse process using sklearn.mixture.GaussianMixture, but it does not return the expected mean and variance of each Gaussian component.
model = GaussianMixture(n_components=3).fit(Y.reshape(-1,1))
print(model.means_)
print(model.covariances_)
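For reference, GaussianMixture is meant to be fitted on samples drawn from the mixture, not on the values of the summed PDF curve; below is a minimal sketch of that usage, assuming equal component weights and the same parameters as above (the sample counts and seed are illustrative):
rng = np.random.default_rng(0)
samples = np.concatenate([
    rng.normal(1, 1.5, 1000),     # first component
    rng.normal(5, 2.5, 1000),     # second component
    rng.normal(-15, 10, 1000),    # third component
]).reshape(-1, 1)
model = GaussianMixture(n_components=3, random_state=0).fit(samples)
print(model.means_)        # roughly 1, 5, -15
print(model.covariances_)  # roughly 1.5**2, 2.5**2, 10**2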
Is it possible to find the peak (vertex?) values (x,y) of a polynomial regression line that was computed using Matplotlib?
I've included my basic setup below (of course with fuller data sets), as well as a screenshot of the actual regression line in question.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
degree=6
setX={'Fixation Duration': {0:1,1:2,2:3}}
setY={'Fixation Occurrences': {0:1,1:2,2:3}}
X_gall=pd.DataFrame.from_dict(setX)
Y_gall=pd.DataFrame.from_dict(setY)
X_seqGall = np.linspace(X_gall.min(),X_gall.max(),300).reshape(-1,1)
polyregGall=make_pipeline(PolynomialFeatures(degree),LinearRegression())
polyregGall.fit(X_gall,Y_gall)
plt.scatter(X_gall,Y_gall, c="#1E4174", s=100.0, alpha=0.4)
plt.plot(X_seqGall,polyregGall.predict(X_seqGall),color="#1E4174", linewidth=4)
plt.show()
I would like to find the x, y values along the red arrows.
You can find the maximum from the underlying plot data.
First, let's change your plotting commands to explicitly define the axes:
fig, ax = plt.subplots(figsize=(6,4))
_ = ax.scatter(X_gall,Y_gall, c="#1E4174", s=100.0, alpha=0.4)
poly = ax.plot(X_seqGall,polyregGall.predict(X_seqGall),color="#1E4174", linewidth=4)
plt.show()
Now you can access the line data:
lines = poly[0].axes.lines
for line in lines:
    max_y = np.max(line.get_ydata())
    print(f"Maximum y is: {max_y}")
    x_of_max_y = line.get_xdata()[np.argmax(line.get_ydata())]
    print(f"x value of maximum y is: {x_of_max_y}")
Output:
Maximum y is: 3.1515605364361114
x value of maximum y is: 2.8127090301003346
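Note that the same numbers can also be read off the model directly, without going back through the axes; a small sketch using the X_seqGall grid from the question:
y_pred = polyregGall.predict(X_seqGall).ravel()
print(f"Maximum y is: {y_pred.max()}")
print(f"x value of maximum y is: {X_seqGall.ravel()[np.argmax(y_pred)]}")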
I want to get a list of random numbers from a GMM.
I've plotted a distribution like this.
I was wondering whether there is some way to get values from the distribution curve, such as 0.94, 0.96, 0.95, and so on. How can I get them?
I am not sure whether the following is useful for you or not; I got the code from a minimal reproduction:
from matplotlib import rc
from sklearn import mixture
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import matplotlib.ticker as tkr
import scipy.stats as stats
# x = open("prueba.dat").read().splitlines()
# create the data
x = np.concatenate((np.random.normal(5, 5, 1000),np.random.normal(10, 2, 1000)))
f = np.ravel(x).astype(float)   # flatten to 1-D floats
f = f.reshape(-1, 1)            # GaussianMixture expects a 2-D array
g = mixture.GaussianMixture(n_components=3,covariance_type='full')
g.fit(f)
weights = g.weights_
means = g.means_
covars = g.covariances_
plt.hist(f, bins=100, histtype='bar', density=True, ec='red', alpha=0.5)
# f_axis = f.copy().ravel()
# f_axis.sort()
# plt.plot(f_axis,weights[0]*stats.norm.pdf(f_axis,means[0],np.sqrt(covars[0])).ravel(), c='red')
plt.rcParams['agg.path.chunksize'] = 10000
plt.grid()
plt.show()
Thanks!
According to the documentation, you can simply call g.sample(n_samples=64) on the fitted model to generate 64 samples.
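For example, on the fitted model g from the snippet above, sample returns both the drawn values and their component labels (a short sketch):
samples, labels = g.sample(n_samples=64)   # samples has shape (64, 1)
print(samples.ravel()[:5])                 # a few random numbers drawn from the GMM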
I'm trying to make a small program that will plot a graph with a best-fit line and that will predict the COST value based on an inputted SIZE value.
I always get these warnings, and I do not know what they mean:
DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
No handles with labels found to put in legend.
This is the graph that I get (red), and I think the curve should look like the green curve that I have drawn.
And finally, the program makes the prediction only when I close the graph window.
What am I doing wrong?
This is the code:
import numpy as np
from sklearn.svm import SVR
import matplotlib.pyplot as plt
size=[[1],[2],[3],[4],[5],[7],[9],[10],[11],[13]]
cost=[[10],[22],[35],[48],[60],[80],[92],[111],[118],[133]]
def predict(size,cost,x):
    dates=np.reshape(size,(len(size),1))
    svr_poly=SVR(kernel="poly",C=1e3, degree=2)
    svr_poly.fit(size,cost)
    plt.scatter(size,cost, color="blue")
    plt.plot(cost, svr_poly.predict(cost), color="red")
    plt.xlabel("Size")
    plt.ylabel("Cost")
    plt.title("prediction")
    plt.legend()
    plt.show()
predictedcost=predict(size,cost,7)
print(predictedcost)
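As an aside on the first warning: it comes from passing cost as a column of one-element lists; the shape sklearn asks for is a 1-D target, e.g.:
svr_poly.fit(size, np.ravel(cost))   # a 1-D y avoids the DataConversionWarning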
I found the answer to this problem; if you are interested, here it is:
import numpy as np
import matplotlib.pyplot as plt

X = np.array([1,2,3,5,6,7,4,7,8,9,5,10,11,7,6,6,10,11,11,12,13,13,14])
Y = np.array([2,3,5,8,11,14,9,19,15,19,15,16,14,7,13,13,14,13,23,25,26,27,33])

# fit a quadratic polynomial to the data
koeficienti_polinom = np.polyfit(X, Y, 2)
a = koeficienti_polinom[0]
b = koeficienti_polinom[1]
c = koeficienti_polinom[2]

# evaluate the fitted curve on a dense grid
xval = np.linspace(np.min(X), np.max(X))
regression = a * xval**2 + b*xval + c

# predict the value for a user-supplied input
predX = float(input("Enter: "))
predY = a * predX**2 + b*predX + c

plt.scatter(X, Y, s=20, color="blue")
plt.scatter(predX, predY, color="red")
plt.plot(xval, regression, color="black", linewidth=1)
print("Quadratic prediction: ", round(predY, 2))
plt.show()
My question pertains to Bayesian inference and how to numerically calculate the model evidence given some data, a prior, and a posterior distribution.
Given conjugate priors, the Wikipedia article specifies the model evidence as the likelihood integrated against the prior over the parameters, roughly p(Y | m) = ∫∫ p(Y | beta, sigma, m) p(beta, sigma | m) dbeta dsigma, where sigma and beta are the parameters, m is the model, Y is the data and X is the prior.
Given the setup below, how do I calculate the model evidence? I need something that returns a single scalar number.
Below I have a minimal working example of generating some data (draws from a normal), assuming a prior (a normal) and a likelihood function (a Gaussian). Notice how both the PDF of the data and the prior integrate to (approximately) one, while the likelihood function can take values above 1.
I am mainly confused as to how to "integrate out" the parameters from the model, and thus take model complexity into consideration. I can see how this could be done analytically if you can write down the log-likelihood function, but I can't really see how this results in a single scalar number.
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import scipy.integrate
import seaborn as sns
sns.set(style="white", palette="muted", color_codes=True)
%matplotlib inline
mu = 0
variance = 1
sigma = np.sqrt(variance)
data = np.random.normal(mu,sigma,100)   # np.random.normal takes the standard deviation, not the variance
x = np.linspace(-5,5,100)
density = scipy.stats.gaussian_kde(data)   # kernel density estimate of the data
data_pdf = density(x)
prior_pdf = scipy.stats.norm.pdf(x, mu, sigma)
likelihood = np.exp(-np.power(x - mu, 2.) / (2 * np.power(sigma, 2.)))
I1=scipy.integrate.trapz(data_pdf,x)
I2=scipy.integrate.trapz(prior_pdf,x)
I3=scipy.integrate.trapz(likelihood,x)
fig1 = plt.figure(figsize=(7.5,5))
ax1 = fig1.add_subplot(3,1,1)
sns.despine(right=True)
ax1.plot(x,data_pdf,'k')
ax1.legend([r'$PDF(Data)$'],loc='upper left')
ax2 = fig1.add_subplot(3,1,2)
sns.despine(right=True)
ax2.plot(x,prior_pdf,'b')
ax2.legend([r'$Prior$'],loc='upper left')
ax3 = fig1.add_subplot(3,1,3)
sns.despine(right=True)
ax3.plot(x,likelihood,'r')
ax3.legend([r'$Likelihood$'],loc='upper left')
plt.tight_layout()
print(I1,I2,I3)
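For what it's worth, here is a minimal numerical sketch of integrating the parameters out in this one-dimensional setting, assuming the variance is known and fixed so that the mean is the only unknown (the grid and names below are illustrative): evaluate the likelihood of the whole data set on a grid of candidate means, weight it by the prior over the mean, and integrate with the trapezoidal rule to get a single scalar.
mu_grid = np.linspace(-5, 5, 1000)                       # grid of candidate means
prior_mu = scipy.stats.norm.pdf(mu_grid, 0, 1)           # prior over the mean
# log-likelihood of the full data set for each candidate mean
log_lik = np.array([np.sum(scipy.stats.norm.logpdf(data, m, sigma)) for m in mu_grid])
lik = np.exp(log_lik - log_lik.max())                    # rescale to avoid underflow
evidence = np.exp(log_lik.max()) * scipy.integrate.trapz(lik * prior_mu, mu_grid)
print(evidence)                                          # model evidence as one scalar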
I'm using seaborn's distplot(data, fit=stats.gamma).
How do I get the fit parameters returned?
Here is an example:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
df = pd.read_csv('RequestSize.csv')
reqs = df['12 web pages']
reqs = reqs.dropna()
reqs = reqs[np.logical_and(reqs > np.percentile(reqs, 0), reqs < np.percentile(reqs, 95))]
dist = sns.distplot(reqs, fit=stats.gamma)
Use the object you passed to distplot:
stats.gamma.fit(reqs)
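For the gamma fit this returns a tuple of (shape, loc, scale), e.g.:
shape, loc, scale = stats.gamma.fit(reqs)   # the parameters behind distplot's fitted curve
print(shape, loc, scale)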
I can confirm the above is true - the sns.distplot fit method is equivalent to the fit method in scipy.stats, so you can get the parameters from there, e.g.:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# e_t_hat is the data being plotted (the cointegrating spread)
ax = sns.distplot(e_t_hat, bins=20, kde=False, fit=stats.norm)
plt.title('Distribution of Cointegrating Spread for Brent and Gasoil')
# Get the fitted parameters used by sns
(mu, sigma) = stats.norm.fit(e_t_hat)
print("mu={0}, sigma={1}".format(mu, sigma))
# Legend and labels
plt.legend([r"normal dist. fit ($\mu=${0:.2g}, $\sigma=${1:.2f})".format(mu, sigma)])
plt.ylabel('Frequency')
# Cross-check this is indeed the case - should be overlaid over the black curve
x_dummy = np.linspace(stats.norm.ppf(0.01), stats.norm.ppf(0.99), 100)
ax.plot(x_dummy, stats.norm.pdf(x_dummy, mu, sigma))
plt.legend([r"normal dist. fit ($\mu=${0:.2g}, $\sigma=${1:.2f})".format(mu, sigma),
            "cross-check"])