Gaussian mixture model using sklearn to fit on data

Gaussian mixture model using sklearn to fit on data - python

I am trying to fit gaussians on a given dataset.
Here is an example dataset.
I would like to find two reasonable gaussian to fit them.
Thus, I wrote the following code to use GMM.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from sklearn.mixture import GaussianMixture
from scipy.stats import norm
def main():
    """Plot the sample points, fit a two-component GaussianMixture and plot
    the weighted component densities (question code, kept as posted)."""
    x = np.arange(-28,28,2)
    y = np.array([0,1,2,3,4,5,5,5,4,3,1,1,0,0,0,0,0,0,1,2,3,3,3,2,1,0,0,1])
    plt.plot(x,y,'ro')
    plt.savefig("tmp.png")
    plt.clf()
    # Pair every x with its y: one (x, y) row per sample point.
    data = np.stack( (x,y), axis=-1)
    print(data)
    # reshape(-1,1) flattens the (x, y) rows into a single column, so the
    # mixture is fitted to the x positions and the y values pooled together
    # as samples of one variable.
    # NOTE(review): this pooling looks unintended and would explain fitted
    # means near 0 instead of around -15/+15. If y holds counts per x, the
    # model presumably should see each x repeated y times
    # (e.g. np.repeat(x, y).reshape(-1, 1)) -- confirm.
    gmm = GaussianMixture(
        n_components=2,
        covariance_type='spherical',
        init_params='random',
    ).fit(data.reshape(-1,1))
    weights = gmm.weights_
    means = gmm.means_
    cov = gmm.covariances_
    print(weights)
    print(means)
    print(cov)
    # Each curve is one component's normal pdf scaled by its mixture weight;
    # the covariance goes through sqrt because norm.pdf's third argument is
    # the scale (standard deviation).
    gd0 = weights[0] * norm.pdf(x, means[0][0], np.sqrt(cov[0]))
    gd1 = weights[1] * norm.pdf(x, means[1][0], np.sqrt(cov[1]))
    plt.plot(x, gd0)
    plt.plot(x, gd1)
    plt.savefig("tmp2.png")


if __name__ == "__main__":
    main()
Then, I get the result.
But it seems strange. I expected the peaks of the Gaussians to appear around -15 and +15.
Where did I make a mistake?
weights
[0.52290556 0.47709444]
means
[[-0.78959748]
[ 1.68885233]]
cov
[250.29609772 3.07633333]

Related

How to generate a 3D contour plot using data for torsion angles as numpy arrays

I am trying to generate a 3D contour plot using data stored as lists for two angles, phi2 and theta, in degrees. I have 88 data points in total. I am trying to generate the joint multivariate normal PDF using scipy.stats.multivariate_normal and then plot the graph. But the attached code does not work: it gives me errors saying that z is 1D and has to be 2D.
Could anybody be so kind as to direct me on how to get a decent density surface and/or contour with the data I have, and how to fix this code? Thank you in advance.
This is my code:
phi2 = [68.74428813, 73.81435267, 66.13791645, 178.54309657, 179.52273055, 161.05564169,
157.29079181, 191.92405566, 91.49774385, 96.19566795, 70.59561195, 119.9603657,
120.22305924, 98.52577754, 102.37894512, 100.12088791, 150.21004667, 139.18249739,
139.09246089, 89.51031839, 88.39689092, 136.47397506, 286.26056406, 283.74464006,
290.17913953, 286.74459786, 284.86706369, 328.13937238, 275.44219073, 303.47499211,
260.52134486, 259.35788745, 306.90146741, 11.20622691, 10.78220574, 19.15446087,
12.15462016, 13.58160662, 3.83673279, 0.12494051, 17.73139875, 8.53784067, 16.50118845,
2.53838974, 233.88019465, 234.93195189, 229.57996459, 233.07447083, 233.59862002,
231.18392245, 207.88397566, 237.31741345, 183.95293031, 179.42872881, 213.32271268,
140.7533708, 150.16895446, 130.61256041, 130.89734197, 128.63260154, 12.06830893,
200.28087782, 189.90378416, 62.39275508, 58.30936802, 205.64840358, 277.30394295,
287.76441089, 284.93518941, 265.89041707, 265.04884345, 343.86712163, 9.14315768,
341.43239609, 259.68283323, 260.00152679, 319.65245694, 341.08153124, 328.45596486,
336.02665804, 334.51276135, 334.8480636, 14.23480894, 12.53212715, 326.89899848,
42.62591188, 45.9396189, 335.39967741]
theta = [162.30883091334002, 162.38681345640427, 159.9615814653753, 174.16782637794842,
174.2151437560705, 176.40150466571572, 172.99139904772483, 175.92043493594562,
170.54952038009057, 172.72436078059172, 157.8929621077973, 168.98842698365024,
171.98480108104968, 157.1025039563731, 158.00939405227624, 157.85195861050553,
171.7970456599733, 173.88542939027778, 174.13060483554227, 157.06302225640127,
156.68490146086768, 174.10583004433656, 12.057892850177469, 22.707446760473047,
10.351988334104147, 10.029845365897357, 9.685493520484972, 7.903767103756965,
2.4881977395826027, 5.95349444674959, 30.507921155263, 30.63344201861564,
12.408566633469452, 3.9720259901877712, 4.65662142520097, 4.638183341072918,
4.106777084823232, 4.080743212101051, 4.747614837690929, 5.50356343278645,
3.5832926179292923, 3.495358074328152, 2.980060059242138, 5.785575733164003,
172.46901133841854, 172.2062576963548, 173.0410300278859, 174.06303865166896,
174.21162725364357, 170.0470319897294, 174.10752252682713, 171.23903792872886,
172.86412623832285, 174.4850965754363, 172.82274147050111, 176.9008741019669,
177.0080169547876, 171.90883294152246, 173.22247813491, 173.4304905772758,
89.63634206258786, 175.70086864635368, 175.71009499829492, 162.5980851129683,
162.16583875715634, 175.35616287818408, 4.416907543506939, 4.249480386717373,
5.265265803392446, 21.091392446454336, 21.573883985068303, 7.135649687137961,
5.332884425609576, 1.4184699545284118, 24.487533963462965, 25.63021267148377,
5.005913657707176, 7.562769691801299, 7.52664594699765, 7.898159135060811,
7.167861631741688, 7.018092266267269, 5.939275995893341, 5.975608665369072,
7.138904478798905, 9.93153808410636, 9.415946863231648, 7.154298332687937]
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from numpy import loadtxt
import matplotlib
from matplotlib.mlab import bivariate_normal
import math
from scipy.stats import multivariate_normal
from astropy.stats import circcorrcoef
from astropy import units as u
from scipy.stats import circvar
from scipy.stats import circmean
# Question code, reproduced as posted.
# NOTE(review): `phi2_vacuum` / `theta_vacuum` are not defined in this snippet;
# the lists above are named `phi2` and `theta` -- presumably the same data.
phi2_vacuum = np.array(phi2_vacuum)
theta_vacuum = np.array(theta_vacuum)
# Convert both angle arrays to radians
angle1 = np.radians(phi2_vacuum)
angle2 = np.radians(theta_vacuum)
# Obtain the circular variance
var_angle1 = circvar(angle1)
var_angle2 = circvar(angle2)
# Obtain circular mean from scipy
mean_angle1 = circmean(angle1)
mean_angle2 = circmean(angle2)
# Obtain the circular correlation between both angles and scale it into a
# covariance (the inputs are in radians at this point)
corr = circcorrcoef(angle1, angle2)
covar = corr * np.sqrt(var_angle1*var_angle2)
# Create the covar matrix
covar_matrix = np.array([[var_angle1, covar], [covar, var_angle2]])
# Obtain circular prob
# (delta, S and exp feed only the commented-out closed-form `prob` line below)
delta = covar / (var_angle1 * var_angle2)
S = ((angle1-mean_angle1)/var_angle1) + ((angle2-mean_angle2)/var_angle2) - ((2*delta*
(angle1-mean_angle1)*(angle2-mean_angle2))/(var_angle1*var_angle2))
# Obtain exponential of PDF
exp = -1 * S / (2 * (1 - delta**2))
# Calculate the PDF
#prob = (1/(2*np.pi*var_angle1*var_angle2*np.sqrt(1-(delta**2)))) * np.e**exp
# `prob` is a frozen multivariate_normal distribution, not an array of densities
prob = multivariate_normal([mean_angle1, mean_angle2], covar_matrix)
# Create the stacking: pairs the two 1-D angle arrays sample by sample
# (no grid is built)
pos = np.dstack((angle1, angle2))
fig2 = plt.figure()
ax2 = fig2.add_subplot(111)
# NOTE(review): `pdf` is not defined in this snippet -- presumably `prob` was meant.
# NOTE(review): the density is evaluated at the sample points only; contourf
# presumably needs it on a 2-D grid (e.g. built with np.meshgrid), which would
# match the reported "z is 1D" error -- confirm.
ax2.contourf(angle1, angle2, pdf.pdf(pos))

Can you identify what wrong with this programme about normal equation implementation for linear regression

1. The output I get for theta contains very large numbers, which is unusable.
2. Can you determine what the problem is?
import pandas as pd
import matplotlib.pyplot as plt
# Question code, reproduced as posted.
# NOTE(review): `np` is used below, but this snippet only imports pandas and
# matplotlib.
data=pd.read_csv("headbrain.csv")
data.head()
x=np.array(data["Head Size(cm^3)"].values)
y=np.array(data["Brain Weight(grams)"].values)
# NOTE(review): the closing parenthesis is missing on the next line as posted.
print(x.shape
x1=np.ones(len(y))
# Stacks the feature row and the ones row: X has one row per term and one
# column per sample.
X=np.array([x,x1])
X.shape
# normal equation, built as (x.transpose*x)*(x.transpose*y)
first=np.matmul(X,X.transpose()) # first part of the normal equation (x.transpose*x)
second=np.matmul(X,y) # second part of the normal equation (x.transpose*y)
# NOTE(review): `first` is multiplied in directly; the normal equation is
# theta = inverse(first) applied to `second`, which presumably explains the
# huge values -- confirm.
theta=np.matmul(first,second) # normal equation for theta
print(theta)
# theta comes out as very large numbers (printed in e-notation)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np  # used throughout; the snippet's own imports omit it

# Fit brain weight against head size with the normal equation.
data = pd.read_csv("headbrain.csv")
data.head()
x = np.array(data["Head Size(cm^3)"].values)
y = np.array(data["Brain Weight(grams)"].values)
print(x.shape)
x1 = np.ones(len(y))
# Design matrix with one ROW per sample: a bias column of ones (x0 = 1)
# followed by the feature column. Sized from the data, not a fixed row count.
X_b = np.c_[x1, x]
# Normal equation: theta = (X^T X)^-1 X^T y -> [intercept, slope]
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
# Predict for two new head-size values, adding the same bias column.
X_new = np.array([[0], [2]])
X_new_b = np.c_[np.ones((2, 1)), X_new]  # add x0 = 1 to each instance
y_predict = X_new_b.dot(theta_best)
y_predict

how to replicate scipy.stats.fit using optimization function?

I am trying to fit a distribution to some values. This is my code
from __future__ import print_function
import pandas as pd
import numpy as np
import scipy as sp
import scipy.optimize as opt
import scipy.stats
import matplotlib.pyplot as plt
# Question code, reproduced as posted.
values = np.random.pareto(1.5, 10000)
loc = values.min()
scale = 1
def cost_function(alpha):
    # Sums the negated pdf values of `values` under a Pareto with shape
    # `alpha` and the module-level loc/scale.
    # NOTE(review): a likelihood-based fit would sum logpdf rather than pdf
    # -- confirm.
    cost = -sp.stats.pareto(alpha, loc=loc, scale=scale).pdf(values)
    return cost.sum()
# Minimise the cost over alpha, starting from 1.5, and compare with scipy's
# own fit using the same fixed loc and scale.
opt_res = opt.fmin(cost_function, 1.5)
alpha_fit_v = sp.stats.pareto.fit(values, floc=loc, fscale=scale)
print('opt_res = ', opt_res,
      ' alpha_fit_v = ', alpha_fit_v)
I was expecting alpha_fit_v to be equivalent to opt_res but it is not. What's wrong?.

What's wrong?.
The cost function is wrong.
np.random.pareto has a different distribution than sp.stats.pareto
1. The cost function is wrong
It does not make sense to sum inverse probabilities. You need to use the logarithm:
def cost_function(alpha):
    """Return the negative log-likelihood of ``values`` for shape ``alpha``.

    Reads the module-level ``values``, ``loc`` and ``scale``.
    """
    frozen = sp.stats.pareto(alpha, loc=loc, scale=scale)
    negative_log_density = -frozen.logpdf(values)
    return negative_log_density.sum()
2. np.random.pareto has a different distribution than sp.stats.pareto
This one is tricky, but you may have noticed that not even sp.stats.pareto.fit returns the correct result. This is because scipy's Pareto distribution cannot fit the data generated by numpy.
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import scipy.stats  # makes sp.stats available

# Top panel: samples drawn by numpy; bottom panel: samples drawn by scipy.
plt.subplot(2, 1, 1)
plt.hist(np.random.pareto(1.5, 10000), 1000) # This is a Lomax or Pareto II distribution
plt.xlim(0, 10)
plt.subplot(2, 1, 2)
plt.hist(sp.stats.pareto.rvs(1.5, size=1000), 1000) # This is a Pareto distribution
plt.xlim(0, 10)
That said, this will work as expected:
# Draw the sample from scipy's own Pareto so data and fitted model agree.
values = sp.stats.pareto.rvs(1.5, size=1000)
loc, scale = 0, 1


def cost_function(alpha):
    """Return the negative log-likelihood of ``values`` for shape ``alpha``."""
    log_density = sp.stats.pareto(alpha, loc=loc, scale=scale).logpdf(values)
    return -log_density.sum()


# Minimise the cost by hand and compare with scipy's fit (loc and scale fixed).
opt_res = opt.fmin(cost_function, 1.5)
alpha_fit_v = sp.stats.pareto.fit(values, floc=loc, fscale=scale)
print('opt_res = ', opt_res, ' alpha_fit_v = ', alpha_fit_v)
# example output:
# opt_res = [ 1.49611816] alpha_fit_v = (1.4960937500000013, 0, 1)
According to the documentation numpy.random.pareto does not quite draw from the Pareto distribution:
Draw samples from a Pareto II or Lomax distribution with specified shape.
The Lomax or Pareto II distribution is a shifted Pareto distribution. The classical Pareto distribution can be obtained from the Lomax distribution by adding 1 and multiplying by the scale parameter m (see Notes).
So you have two alternatives if using numpy to generate the data:
You can set loc=-1 for the scipy distribution.
You can do values = np.random.pareto(1.5, 10000) + 1 and set loc=0.

How do I get the components for LDA in scikit-learn?

When using PCA in sklearn, it's easy to get out the components:
from sklearn import decomposition
# Question code, reproduced as posted; `n_components` and `input_data` are
# not defined in this snippet.
pca = decomposition.PCA(n_components=n_components)
# NOTE(review): `fit` presumably returns the fitted estimator rather than
# transformed data, so `pca_data` would be the PCA object itself -- confirm.
pca_data = pca.fit(input_data)
pca_components = pca.components_
But I can't for the life of me figure out how to get the components out of LDA, as there is no components_ attribute. Is there a similar attribute in sklearn lda?

In the case of PCA, the documentation is clear. The pca.components_ are the eigenvectors.
In the case of LDA, we need the lda.scalings_ attribute.
Visual example using iris data and sklearn:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
iris = datasets.load_iris()
X, y = iris.data, iris.target
# In general it is a good idea to scale the data
scaler = StandardScaler()
X = scaler.fit(X).transform(X)
# Fit LDA on the scaled features, then project the same samples
lda = LinearDiscriminantAnalysis()
x_new = lda.fit(X, y).transform(X)
Verify that the lda.scalings_ are the eigenvectors:
print(lda.scalings_)
print(lda.transform(np.identity(4)))
[[-0.67614337 0.0271192 ]
[-0.66890811 0.93115101]
[ 3.84228173 -1.63586613]
[ 2.17067434 2.13428251]]
[[-0.67614337 0.0271192 ]
[-0.66890811 0.93115101]
[ 3.84228173 -1.63586613]
[ 2.17067434 2.13428251]]
Additionally here is a useful function to plot the biplot and verify visually:
def myplot(score, coeff, labels=None):
    """Draw a biplot: scattered scores plus one arrow per row of ``coeff``.

    score:  2-D array; columns 0 and 1 are plotted as x and y (unscaled).
    coeff:  2-D array; row i gives the arrow tip (coeff[i, 0], coeff[i, 1]).
    labels: optional arrow captions; defaults to "Var1", "Var2", ...

    Point colours come from the module-level ``y``.
    """
    plt.scatter(score[:, 0], score[:, 1], c=y)  # without scaling
    for idx in range(coeff.shape[0]):
        tip_x, tip_y = coeff[idx, 0], coeff[idx, 1]
        plt.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.5)
        caption = "Var" + str(idx + 1) if labels is None else labels[idx]
        # Place the caption slightly beyond the arrow tip.
        plt.text(tip_x * 1.15, tip_y * 1.15, caption,
                 color='g', ha='center', va='center')
    plt.xlabel("LD{}".format(1))
    plt.ylabel("LD{}".format(2))
    plt.grid()
#Call the function.
myplot(x_new[:,0:2], lda.scalings_)
plt.show()
Results

My reading of the code is that the coef_ attribute is used to weight each of the components when scoring a sample's features against the different classes. scaling is the eigenvector and xbar_ is the mean. In the spirit of UTSL, here's the source for the decision function:
https://github.com/scikit-learn/scikit-learn/blob/6f32544c51b43d122dfbed8feff5cd2887bcac80/sklearn/discriminant_analysis.py#L166

In PCA, the transform operation uses self.components_.T (see the code):
X_transformed = np.dot(X, self.components_.T)
In LDA, the transform operation uses self.scalings_ (see the code):
X_new = np.dot(X, self.scalings_)
Note the .T which transposes the array in the PCA, and not in LDA:
PCA: components_ : array, shape (n_components, n_features)
LDA: scalings_ : array, shape (n_features, n_classes - 1)

Principal Component Analysis (PCA) in Python

I have a (26424 x 144) array and I want to perform PCA over it using Python. However, there is no particular place on the web that explains about how to achieve this task (There are some sites which just do PCA according to their own - there is no generalized way of doing so that I can find). Anybody with any sort of help will do great.

I posted my answer even though another answer has already been accepted; the accepted answer relies on a deprecated function; additionally, this deprecated function is based on Singular Value Decomposition (SVD), which (although perfectly valid) is the much more memory- and processor-intensive of the two general techniques for calculating PCA. This is particularly relevant here because of the size of the data array in the OP. Using covariance-based PCA, the array used in the computation flow is just 144 x 144, rather than 26424 x 144 (the dimensions of the original data array).
Here's a simple working implementation of PCA using the linalg module from SciPy. Because this implementation first calculates the covariance matrix, and then performs all subsequent calculations on this array, it uses far less memory than SVD-based PCA.
(the linalg module in NumPy can also be used with no change in the code below aside from the import statement, which would be from numpy import linalg as LA.)
The two key steps in this PCA implementation are:
calculating the covariance matrix; and
taking the eigenvectors & eigenvalues of this cov matrix
In the function below, the parameter dims_rescaled_data refers to the desired number of dimensions in the rescaled data matrix; this parameter has a default value of just two dimensions, but the code below isn't limited to two but it could be any value less than the column number of the original data array.
def PCA(data, dims_rescaled_data=2):
    """
    Project data onto its leading principal components (covariance method).

    pass in: data as 2D NumPy array (rows = observations, columns = variables);
             integer arrays are accepted and the caller's array is not modified
             dims_rescaled_data = number of principal components to keep
    returns: (projected, evals, evecs) where
             projected = mean-centered data expressed in the kept components,
             evals     = ALL eigenvalues of the covariance matrix, descending,
             evecs     = the kept eigenvectors, one per column
    """
    import numpy as NP
    from scipy import linalg as LA
    # mean center a float copy of the data (an in-place `-=` would overwrite
    # the caller's array and fails outright for integer dtypes)
    centered = NP.asarray(data, dtype=float)
    centered = centered - centered.mean(axis=0)
    # calculate the covariance matrix (columns are the variables)
    R = NP.cov(centered, rowvar=False)
    # calculate eigenvectors & eigenvalues of the covariance matrix
    # use 'eigh' rather than 'eig' since R is symmetric,
    # the performance gain is substantial
    evals, evecs = LA.eigh(R)
    # sort eigenvalues in decreasing order and reorder the eigenvectors
    # with the same index
    idx = NP.argsort(evals)[::-1]
    evecs = evecs[:, idx]
    evals = evals[idx]
    # keep the first dims_rescaled_data eigenvectors
    evecs = evecs[:, :dims_rescaled_data]
    # carry out the transformation on the data using eigenvectors
    # and return the re-scaled data, eigenvalues, and eigenvectors
    return NP.dot(centered, evecs), evals, evecs
def test_PCA(data, dims_rescaled_data=2):
    '''
    test by rebuilding the data from the kept eigenvectors of its
    covariance matrix & comparing that 'recovered' array with the
    mean-centered original

    The reconstruction is exact only when every component is kept, so the
    check is on the variance lost: it must equal the sum of the discarded
    eigenvalues (zero when nothing is discarded).

    raises AssertionError if the check fails
    '''
    import numpy as NP
    # private float copy, so the result does not depend on whether PCA
    # modifies its argument
    original = NP.array(data, dtype=float)
    n_samples = original.shape[0]
    centered = original - original.mean(axis=0)
    projected, evals, eigenvectors = PCA(
        original.copy(), dims_rescaled_data=dims_rescaled_data)
    # map the projection back into the original feature space
    data_recovered = NP.dot(projected, eigenvectors.T)
    residual = centered - data_recovered
    # NP.cov normalizes by N - 1, so use the same normalization here
    variance_lost = (residual ** 2).sum() / (n_samples - 1)
    assert NP.allclose(variance_lost, evals[dims_rescaled_data:].sum())
def plot_pca(data):
    """Scatter-plot data projected onto its first two principal components."""
    from matplotlib import pyplot as MPL
    clr1 = '#2026B2'
    fig = MPL.figure()
    ax1 = fig.add_subplot(111)
    # PCA returns (projected data, eigenvalues, eigenvectors); only the
    # projection is plotted
    data_resc, _, _ = PCA(data)
    ax1.plot(data_resc[:, 0], data_resc[:, 1], '.', mfc=clr1, mec=clr1)
    MPL.show()
>>> # iris, probably the most widely used reference data set in ML
>>> df = "~/iris.csv"
>>> data = NP.loadtxt(df, delimiter=',')
>>> # remove class labels
>>> data = data[:,:-1]
>>> plot_pca(data)
The plot below is a visual representation of this PCA function on the iris data. As you can see, a 2D transformation cleanly separates class I from class II and class III (but not class II from class III, which in fact requires another dimension).

You can find a PCA function in the matplotlib module:
import numpy as np
from matplotlib.mlab import PCA
data = np.array(np.random.randint(10,size=(10,3)))
results = PCA(data)
results will store the various parameters of the PCA.
It is from the mlab part of matplotlib, which is the compatibility layer with the MATLAB syntax
EDIT:
on the blog nextgenetics I found a wonderful demonstration of how to perform and display a PCA with the matplotlib mlab module, have fun and check that blog!

Another Python PCA using numpy. The same idea as #doug but that one didn't run.
from numpy import array, dot, mean, std, empty, argsort
from numpy.linalg import eigh, solve
from numpy.random import randn
from matplotlib.pyplot import subplots, show
def cov(X):
    """
    Covariance matrix of the columns of X
    note: valid only for mean-centered data (no mean is subtracted here)
    note: normalizes by N, whereas numpy's `cov` uses N-1
    """
    # Same result as averaging X[:, j] * X[:, k] over the rows for every
    # column pair (j, k), done as a single matrix product.
    n_rows = X.shape[0]
    return dot(X.T, X) / n_rows
def pca(data, pc_count=None):
    """
    Principal component analysis via eigen-decomposition
    note: mean-centers and auto-scales `data` IN PLACE
    returns: (scores, eigenvalues, eigenvectors), restricted to the
             pc_count leading components (all of them when pc_count is None)
    """
    data -= mean(data, 0)
    data /= std(data, 0)
    eigenvalues, eigenvectors = eigh(cov(data))
    # eigh yields ascending eigenvalues: reverse, then keep the leading ones
    order = argsort(eigenvalues)[::-1][:pc_count]
    eigenvalues = eigenvalues[order]
    eigenvectors = eigenvectors[:, order]
    scores = dot(data, eigenvectors)  # used to be dot(V.T, data.T).T
    return scores, eigenvalues, eigenvectors
""" test data """
# 150 samples x 8 features; the first 50 rows and the remaining 100 rows are
# shifted in different column ranges to form two groups.
data = array([randn(8) for _ in range(150)])
data[:50, 2:4] += 5
data[50:, 2:5] += 5

# --- visualize ---
# Left: first two columns of the data (standardized in place by pca);
# right: first two principal-component scores. Red = first 50 rows.
trans = pca(data, 3)[0]
fig, (ax1, ax2) = subplots(1, 2)
for axis, points in ((ax1, data), (ax2, trans)):
    axis.scatter(points[:50, 0], points[:50, 1], c='r')
    axis.scatter(points[50:, 0], points[50:, 1], c='b')
show()
Which yields the same thing as the much shorter
from sklearn.decomposition import PCA
def pca2(data, pc_count=None):
    """Return data projected onto its leading principal components.

    pc_count is the number of components to keep; it is passed to
    scikit-learn as ``n_components`` (None leaves the choice to
    scikit-learn's default).
    """
    return PCA(n_components=pc_count).fit_transform(data)
As I understand it, using eigenvalues (first way) is better for high-dimensional data and fewer samples, whereas using Singular value decomposition is better if you have more samples than dimensions.

This is a job for numpy.
And here's a tutorial demonstrating how principal component analysis can be done using numpy's built-in functions such as mean, cov, double, cumsum, dot, linalg, array and rank.
http://glowingpython.blogspot.sg/2011/07/principal-component-analysis-with-numpy.html
Notice that scipy also has a long explanation here
- https://github.com/scikit-learn/scikit-learn/blob/babe4a5d0637ca172d47e1dfdd2f6f3c3ecb28db/scikits/learn/utils/extmath.py#L105
with the scikit-learn library having more code examples -
https://github.com/scikit-learn/scikit-learn/blob/babe4a5d0637ca172d47e1dfdd2f6f3c3ecb28db/scikits/learn/utils/extmath.py#L105

Here are scikit-learn options. With both methods, StandardScaler is used because PCA is affected by scale.
Method 1: Have scikit-learn choose the minimum number of principal components such that at least x% (90% in example below) of the variance is retained.
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
iris = load_iris()
# mean-centers and auto-scales the data
scaler = StandardScaler()
standardizedData = scaler.fit_transform(iris.data)
# 0.90 as the component count: keep enough components for 90% of the variance
pca = PCA(n_components=.90)
principalComponents = pca.fit_transform(standardizedData)
# To get how many principal components were chosen
print(pca.n_components_)
Method 2: Choose the number of principal components (in this case, 2 was chosen)
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
iris = load_iris()
# mean-centers and auto-scales the data
scaler = StandardScaler()
standardizedData = scaler.fit_transform(iris.data)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(standardizedData)
# to get how much variance was retained
retained_variance = pca.explained_variance_ratio_.sum()
print(retained_variance)
Source: https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

UPDATE: matplotlib.mlab.PCA is since release 2.2 (2018-03-06) indeed deprecated.
When this answer was first written, the library matplotlib.mlab.PCA (used in this answer) was not yet deprecated. So for all the folks arriving here via Google, I'll post a complete working example tested with Python 2.7.
Use the following code with care as it uses a now deprecated library!
from matplotlib.mlab import PCA
import numpy
data = numpy.array( [[3,2,5], [-2,1,6], [-1,0,4], [4,3,4], [10,-5,-6]] )
pca = PCA(data)
Now in `pca.Y' is the original data matrix in terms of the principal components basis vectors. More details about the PCA object can be found here.
>>> pca.Y
array([[ 0.67629162, -0.49384752, 0.14489202],
[ 1.26314784, 0.60164795, 0.02858026],
[ 0.64937611, 0.69057287, -0.06833576],
[ 0.60697227, -0.90088738, -0.11194732],
[-3.19578784, 0.10251408, 0.00681079]])
You can use matplotlib.pyplot to draw this data, just to convince yourself that the PCA yields "good" results. The names list is just used to annotate our five vectors.
import matplotlib.pyplot
names = [ "A", "B", "C", "D", "E" ]
matplotlib.pyplot.scatter(pca.Y[:,0], pca.Y[:,1])
for label, x, y in zip(names, pca.Y[:,0], pca.Y[:,1]):
matplotlib.pyplot.annotate( label, xy=(x, y), xytext=(-2, 2), textcoords='offset points', ha='right', va='bottom' )
matplotlib.pyplot.show()
Looking at our original vectors we'll see that data[0] ("A") and data[3] ("D") are rather similar as are data[1] ("B") and data[2] ("C"). This is reflected in the 2D plot of our PCA transformed data.

In addition to all the other answers, here is some code to plot the biplot using sklearn and matplotlib.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
iris = datasets.load_iris()
X, y = iris.data, iris.target
# In general a good idea is to scale the data
scaler = StandardScaler()
X = scaler.fit(X).transform(X)
# Keep every component; x_new holds the scores of the scaled samples
pca = PCA()
x_new = pca.fit_transform(X)
def myplot(score, coeff, labels=None):
    """Draw a PCA biplot: range-scaled scores plus one arrow per row of ``coeff``.

    score:  2-D array; columns 0 and 1 are the plotted scores.
    coeff:  2-D array; row i gives the arrow tip (coeff[i, 0], coeff[i, 1]).
    labels: optional arrow captions; defaults to "Var1", "Var2", ...

    Point colours come from the module-level ``y``.
    """
    first, second = score[:, 0], score[:, 1]
    # Divide each axis by its own range so the plotted scores span a width of 1.
    x_factor = 1.0 / (first.max() - first.min())
    y_factor = 1.0 / (second.max() - second.min())
    plt.scatter(first * x_factor, second * y_factor, c=y)
    for idx in range(coeff.shape[0]):
        tip_x, tip_y = coeff[idx, 0], coeff[idx, 1]
        plt.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.5)
        caption = "Var" + str(idx + 1) if labels is None else labels[idx]
        # Place the caption slightly beyond the arrow tip.
        plt.text(tip_x * 1.15, tip_y * 1.15, caption,
                 color='g', ha='center', va='center')
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()
#Call the function. Use only the 2 PCs.
myplot(x_new[:,0:2],np.transpose(pca.components_[0:2, :]))
plt.show()

I've made a little script for comparing the different PCAs appeared as an answer here:
# Benchmark script: for each PCA/SVD implementation, time `repeat` runs on one
# random matrix and print the mean time per run plus the fbpca diffsnorm error.
import numpy as np
from scipy.linalg import svd
shape = (26424, 144)
repeat = 20
pca_components = 2
data = np.array(np.random.randint(255, size=shape)).astype('float64')
# data normalization (disabled)
# data.dot(data.T)
# (U, s, Va) = svd(data, full_matrices=False)
# data = data / s[0]
from fbpca import diffsnorm
from timeit import default_timer as timer
from scipy.linalg import svd
# --- full SVD: the whole decomposition is repeated on every iteration ---
start = timer()
for i in range(repeat):
    (U, s, Va) = svd(data, full_matrices=False)
time = timer() - start
err = diffsnorm(data, U, s, Va)
print('svd time: %.3fms, error: %E' % (time*1000/repeat, err))
# --- matplotlib.mlab PCA ---
# The PCA object is built once inside the timed region and only project() is
# repeated, so this figure is not directly comparable with the loops that redo
# the whole decomposition on each iteration.
from matplotlib.mlab import PCA
start = timer()
_pca = PCA(data)
for i in range(repeat):
    U = _pca.project(data)
time = timer() - start
# NOTE(review): `fracs` is passed where the other calls pass singular values
# `s`; the reported error may therefore not be comparable -- confirm.
err = diffsnorm(data, U, _pca.fracs, _pca.Wt)
print('matplotlib PCA time: %.3fms, error: %E' % (time*1000/repeat, err))
# --- fbpca: a rank-`pca_components` decomposition on every iteration ---
from fbpca import pca
start = timer()
for i in range(repeat):
    (U, s, Va) = pca(data, pca_components, True)
time = timer() - start
err = diffsnorm(data, U, s, Va)
print('facebook pca time: %.3fms, error: %E' % (time*1000/repeat, err))
# --- scikit-learn PCA ---
# As with matplotlib above: fit() runs once inside the timed region and only
# transform() is repeated.
from sklearn.decomposition import PCA
start = timer()
_pca = PCA(n_components = pca_components)
_pca.fit(data)
for i in range(repeat):
    U = _pca.transform(data)
time = timer() - start
# NOTE(review): `explained_variance_` is passed where the other calls pass
# singular values `s`; the reported error may not be comparable -- confirm.
err = diffsnorm(data, U, _pca.explained_variance_, _pca.components_)
print('sklearn PCA time: %.3fms, error: %E' % (time*1000/repeat, err))
# --- pca_mark / pca_doug are not defined or imported in this script ---
start = timer()
for i in range(repeat):
    (U, s, Va) = pca_mark(data, pca_components)
time = timer() - start
err = diffsnorm(data, U, s, Va.T)
print('pca by Mark time: %.3fms, error: %E' % (time*1000/repeat, err))
start = timer()
for i in range(repeat):
    (U, s, Va) = pca_doug(data, pca_components)
time = timer() - start
# Only the first `pca_components` entries of `s` are passed here.
err = diffsnorm(data, U, s[:pca_components], Va.T)
print('pca by doug time: %.3fms, error: %E' % (time*1000/repeat, err))
pca_mark is the pca in Mark's answer.
pca_doug is the pca in doug's answer.
Here is an example output (but the result depends very much on the data size and pca_components, so I'd recommend to run your own test with your own data. Also, facebook's pca is optimized for normalized data, so it will be faster and more accurate in that case):
svd time: 3212.228ms, error: 1.907320E-10
matplotlib PCA time: 879.210ms, error: 2.478853E+05
facebook pca time: 485.483ms, error: 1.260335E+04
sklearn PCA time: 169.832ms, error: 7.469847E+07
pca by Mark time: 293.758ms, error: 1.713129E+02
pca by doug time: 300.326ms, error: 1.707492E+02
EDIT:
The diffsnorm function from fbpca estimates the spectral norm of the difference A - U diag(s) Va, i.e. the spectral-norm error of an SVD-style approximation of the matrix A.

This may be the simplest answer one can find for PCA, with easily understandable steps. Let's say we want to retain the 2 principal dimensions, out of the 144, that provide the maximum information.
Firstly, convert your 2-D array to a dataframe:
import pandas as pd
# Here X is your array of size (26424 x 144)
data = pd.DataFrame(X)
Then, there are two methods one can go with:
Method 1: Manual calculation
Step 1: Apply column standardization on X
from sklearn import preprocessing
scalar = preprocessing.StandardScaler()
standardized_data = scalar.fit_transform(data)
Step 2: Find Co-variance matrix S of original matrix X
import numpy as np

sample_data = standardized_data
# Rows of sample_data are observations and columns are the 144 features, so
# rowvar=False is needed to get the 144 x 144 feature covariance matrix
# (by default np.cov treats every ROW as a variable).
covar_matrix = np.cov(sample_data, rowvar=False)
Step 3: Find eigen values and eigen vectors of S (here 2D, so 2 of each)
from scipy.linalg import eigh
# eigh() provides the eigenvalues (in ascending order) and eigenvectors of a
# symmetric matrix.
# subset_by_index=[low, high] selects eigenvalues by their index in that
# ascending order, so [142, 143] picks the two largest of the 144. It replaces
# the older `eigvals=(low, high)` keyword, which SciPy has deprecated.
values, vectors = eigh(covar_matrix, subset_by_index=[142, 143])
# Converting the eigenvectors into (2, d) shape for ease of further computations
vectors = vectors.T
Step 4: Transform the data
# Projecting the original data sample on the plane formed by two principal eigen vectors by vector-vector multiplication.
new_coordinates = np.matmul(vectors, sample_data.T)
print(new_coordinates.T)
This new_coordinates.T will be of size (26424 x 2) with 2 principal components.
Method 2: Using Scikit-Learn
Step 1: Apply column standardization on X
from sklearn import preprocessing
scalar = preprocessing.StandardScaler()
standardized_data = scalar.fit_transform(data)
Step 2: Initializing the pca
from sklearn import decomposition
# n_components = numbers of dimenstions you want to retain
pca = decomposition.PCA(n_components=2)
Step 3: Using pca to fit the data
# This line takes care of calculating the covariance matrix, eigenvalues and
# eigenvectors, and of multiplying the top 2 eigenvectors with the data matrix.
# It uses the `standardized_data` produced in Step 1 of this method
# (`sample_data` exists only if Method 1 was run first).
pca_data = pca.fit_transform(standardized_data)
This pca_data will be of size (26424 x 2) with 2 principal components.

For def plot_pca(data): to work, it is necessary to replace the lines
data_resc, data_orig = PCA(data)
ax1.plot(data_resc[:, 0], data_resc[:, 1], '.', mfc=clr1, mec=clr1)
with lines
newData, data_resc, data_orig = PCA(data)
ax1.plot(newData[:, 0], newData[:, 1], '.', mfc=clr1, mec=clr1)

this sample code loads the Japanese yield curve, and creates PCA components.
It then estimates a given date's move using the PCA and compares it against the actual move.
%matplotlib inline
import numpy as np
import scipy as sc
from scipy import stats
from IPython.display import display, HTML
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
from datetime import timedelta
import quandl as ql
start = "2016-10-04"
end = "2019-10-04"
ql_data = ql.get("MOFJ/INTEREST_RATE_JAPAN", start_date = start, end_date = end).sort_index(ascending= False)
# Row-to-row changes of the latest 300 data rows, scaled by 100 (normalize to
# bp), then the covariance of those changes across the curve's columns.
cov_ = ((ql_data[:300]).diff(-1)*100).cov()
# The covariance matrix is symmetric, so use eigh. It returns the eigenvalues
# in ascending order, hence the explicit re-sort: columns 0, 1, 2 of eigVec_
# are then the three leading components (eig gives no ordering guarantee).
eigVal_, eigVec_ = np.linalg.eigh(cov_.values)
order_ = np.argsort(eigVal_)[::-1]
eigVal_, eigVec_ = eigVal_[order_], eigVec_[:, order_]
print('number of PCA are', len(eigVal_))
loc_ = 10  # row of the difference table whose move is estimated below
plt.plot(eigVec_[:,0], label = 'PCA1')
plt.plot(eigVec_[:,1], label = 'PCA2')
plt.plot(eigVec_[:,2], label = 'PCA3')
plt.xticks(range(len(eigVec_[:,0])), ql_data.columns)
plt.legend()
plt.show()
x = ql_data.diff(-1).iloc[loc_].values * 100 # set the differences
x_ = x[:,np.newaxis]
# Linear regression without intercept of the move on each component in turn.
# rcond=None selects NumPy's current default cutoff explicitly.
a1, _, _, _ = np.linalg.lstsq(eigVec_[:,0][:, np.newaxis], x_, rcond=None)
a2, _, _, _ = np.linalg.lstsq(eigVec_[:,1][:, np.newaxis], x_, rcond=None)
a3, _, _, _ = np.linalg.lstsq(eigVec_[:,2][:, np.newaxis], x_, rcond=None)
# Estimated move = weighted sum of the three leading components.
# (Two leftover lines that combined undefined m*/c*/b* coefficients were
# dropped: they raised NameError and their results were never used.)
pca_MV = a1[0][0] * eigVec_[:,0] + a2[0][0] * eigVec_[:,1] + a3[0][0] * eigVec_[:,2]
display(pd.DataFrame([eigVec_[:,0], eigVec_[:,1], eigVec_[:,2], x, pca_MV]))
print('PCA1 regression is', a1, a2, a3)
# Label both curves in a legend; two successive plt.title calls would leave
# only the second caption visible.
plt.plot(pca_MV, label = 'this is with regression and no intercept')
plt.plot(x, label = 'this is with actual moves')
plt.legend()
plt.show()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Gaussian mixture model using sklearn to fit on data - python

Related

How to generate a 3D contour plot using data for torsion angles as numpy arrays

Can you identify what wrong with this programme about normal equation implementation for linear regression

how to replicate scipy.stats.fit using optimization function?

How do I get the components for LDA in scikit-learn?

Principal Component Analysis (PCA) in Python

Categories

Resources