PCA shows low explained variance score - python

I used PCA to visualize 100-dimensional data in two dimensions:
x = df.loc[:, features].values # shape of: (8000, 100)
y = df.loc[:,['target']].values
x = StandardScaler().fit_transform(x)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
bb = pca.explained_variance_.sum()
print(bb) gives: 7.952215165291998
np.cumsum(pca.explained_variance_) gives: [4.87586249 7.95221329]
pca.explained_variance_ratio_ gives: [0.04875253 0.03075967]
Plotting the cumulative sum with:
plt.plot(np.cumsum(pca.explained_variance_))
I find it hard to believe that two components account for only ~8% of the explained variance; these results look odd. I thought PCA was supposed to retain as much variance as possible from the data. Or have I been calculating it wrong?
x[0:5]:
[[-1.74060e-01 -9.57560e-02 5.95150e-02 3.96730e-02 -3.75934e-01
-1.15415e-01 9.07250e-02 1.73422e-01 2.92520e-01 1.90375e-01
9.40910e-02 -1.97482e-01 -1.35202e-01 7.55210e-02 1.10771e-01
4.79090e-02 -3.91934e-01 7.35480e-02 1.03868e-01 -4.59240e-02
-9.53400e-03 5.56590e-02 -3.08000e-04 2.15941e-01 8.44760e-02
6.15730e-02 1.28139e-01 1.84247e-01 -1.00091e-01 -1.26661e-01
-5.72800e-03 -3.82720e-02 1.80597e-01 -1.55310e-01 5.62320e-02
-5.92500e-03 -8.53810e-02 -5.69210e-02 -4.55200e-02 2.65116e-01
9.02210e-02 -2.09879e-01 2.05381e-01 2.36790e-02 -9.29390e-02
7.27670e-02 -1.05107e-01 1.11120e-02 -1.60518e-01 4.26270e-02
1.51230e-01 -1.62708e-01 -8.34790e-02 -1.46657e-01 9.13320e-02
1.09579e-01 -1.01678e-01 9.11980e-02 5.51200e-03 4.73180e-02
7.81080e-02 2.03824e-01 -1.00126e-01 2.94703e-01 -1.58841e-01
2.93330e-02 7.82650e-02 1.85240e-02 1.17082e-01 2.12755e-01
-1.71555e-01 2.94210e-02 1.49264e-01 4.65990e-02 -1.84111e-01
2.94123e-01 -1.01497e-01 -3.01230e-02 -9.82600e-03 7.83500e-03
-1.06508e-01 -1.66202e-01 -2.47480e-02 -9.08560e-02 5.69770e-02
4.76440e-02 1.86180e-02 -3.43760e-02 8.70130e-02 -2.78817e-01
2.44482e-01 1.59740e-02 1.29030e-02 1.37528e-01 1.38140e-01
5.47400e-03 7.07190e-02 -1.64084e-01 -1.79274e-01 1.84899e-01]
[-4.68470e-02 6.17350e-02 -9.39700e-03 -3.21487e-01 1.35940e-02
1.49780e-02 -7.18690e-02 -1.17625e-01 -1.41148e-01 1.39589e-01
-9.45150e-02 8.05100e-03 1.79180e-02 7.12750e-02 7.12070e-02
2.07655e-01 -1.67857e-01 -1.36745e-01 -8.67250e-02 -8.45730e-02
7.17540e-02 2.13712e-01 -1.42905e-01 1.72995e-01 1.47124e-01
-1.66269e-01 -1.69885e-01 1.12125e-01 -1.93325e-01 -7.24470e-02
-1.71490e-02 4.64500e-03 8.49620e-02 -4.38140e-02 4.29490e-02
-1.68999e-01 1.60550e-02 -5.94520e-02 6.43960e-02 1.77443e-01
1.69347e-01 1.82960e-02 -3.44550e-02 -2.13336e-01 2.18972e-01
-1.19635e-01 -5.86050e-02 4.53050e-02 3.17350e-02 -6.87390e-02
6.33390e-02 -1.52046e-01 -3.90520e-02 7.61900e-03 -2.72430e-02
-4.35770e-02 1.11760e-02 1.13349e-01 3.61400e-03 -1.33787e-01
5.33930e-02 2.43900e-01 -1.73750e-02 3.30470e-02 -1.26435e-01
2.20994e-01 1.36751e-01 7.55210e-02 3.70490e-02 1.03420e-01
-1.86472e-01 -7.57850e-02 5.51050e-02 -2.03208e-01 -2.43398e-01
1.16064e-01 -7.72510e-02 4.86940e-02 -2.92860e-02 2.80506e-01
1.91691e-01 -3.82540e-02 4.93140e-02 -4.55500e-02 -2.66730e-02
1.02430e-01 -2.89260e-02 1.02755e-01 2.31370e-02 -4.89750e-02
7.47940e-02 -4.98280e-02 -7.70930e-02 -5.11840e-02 -1.39240e-01
-2.53120e-02 -5.70250e-02 2.42180e-02 -2.99657e-01 2.67578e-01]
[-8.39500e-02 -1.01824e-01 1.81731e-01 1.40670e-02 -1.87580e-01
3.42900e-02 -5.89300e-03 5.36770e-02 8.36230e-02 7.62810e-02
-3.10600e-02 -1.75981e-01 -6.96800e-02 5.37630e-02 3.73400e-03
-2.00760e-02 -1.36140e-01 -3.45200e-02 1.49700e-02 -1.17830e-02
1.06793e-01 -2.83580e-02 -2.97240e-02 1.36660e-02 3.17110e-02
1.12940e-02 6.95800e-03 8.58400e-03 -5.91890e-02 3.51190e-02
-6.20280e-02 -1.41275e-01 1.63360e-02 -7.87150e-02 2.49130e-02
7.65590e-02 3.56440e-02 1.00088e-01 -1.68320e-02 1.70951e-01
-5.06570e-02 4.01900e-03 1.20666e-01 -3.50180e-02 1.05963e-01
1.57038e-01 -1.31850e-02 -1.61995e-01 -1.45312e-01 -8.77100e-03
6.61630e-02 -1.39293e-01 8.08510e-02 -1.01238e-01 1.00171e-01
-2.15740e-02 5.21600e-03 8.72330e-02 5.83300e-03 -8.70730e-02
4.70680e-02 1.46379e-01 -5.07890e-02 3.28496e-01 -4.51590e-02
-2.28540e-02 4.57660e-02 1.04651e-01 4.37010e-02 4.18220e-02
1.01754e-01 -3.13950e-02 7.72320e-02 7.03310e-02 -1.18943e-01
4.41870e-02 1.52218e-01 8.59600e-02 2.19597e-01 -9.52670e-02
3.16670e-02 -1.12022e-01 7.76510e-02 -1.72850e-02 -3.52630e-02
-5.84850e-02 1.52377e-01 1.38989e-01 8.47150e-02 -2.19000e-02
1.45324e-01 -2.16640e-01 4.20000e-04 1.62572e-01 -6.87500e-03
7.32830e-02 7.23580e-02 6.25200e-03 -1.50705e-01 1.90830e-02]
[-4.92830e-02 -1.13529e-01 4.63170e-02 7.55490e-02 -2.53431e-01
-1.28370e-02 5.40900e-02 6.77460e-02 2.01792e-01 1.31060e-01
-5.10200e-02 -1.03544e-01 -5.71250e-02 5.35760e-02 -5.31680e-02
1.08214e-01 -1.02481e-01 -2.12630e-02 3.54400e-03 6.67380e-02
-2.23950e-02 3.77230e-02 -2.59860e-02 1.21146e-01 9.54170e-02
8.16600e-03 1.74307e-01 6.37850e-02 -8.07500e-02 -1.11577e-01
4.97500e-03 -6.23630e-02 1.84070e-02 -6.68690e-02 6.85410e-02
-2.38730e-02 6.21030e-02 -3.99490e-02 9.33350e-02 1.46926e-01
-2.35010e-02 -1.13768e-01 2.67340e-02 5.61810e-02 -4.09180e-02
1.29777e-01 -7.41130e-02 -8.26530e-02 -5.31730e-02 -2.83570e-02
1.57710e-01 -1.42213e-01 -6.54910e-02 -8.45030e-02 2.09025e-01
-1.64135e-01 -6.46500e-03 1.72252e-01 3.20320e-02 -1.55267e-01
-9.08500e-03 2.01181e-01 2.16680e-02 1.64786e-01 -1.31938e-01
7.84050e-02 5.32340e-02 -2.56440e-02 2.46850e-02 2.69000e-03
-4.96370e-02 8.45840e-02 8.18290e-02 1.01170e-02 -6.31440e-02
1.85050e-01 1.31864e-01 3.59370e-02 8.24870e-02 -1.30290e-02
-1.23513e-01 -4.67360e-02 -2.54870e-02 -8.89020e-02 1.54840e-02
3.94370e-02 -7.99840e-02 8.79100e-02 -3.17460e-02 -9.01510e-02
1.22541e-01 -1.10243e-01 4.65500e-02 1.20022e-01 2.12911e-01
7.04800e-03 1.27260e-02 1.50930e-02 -8.74320e-02 -1.56960e-02]
[-8.14410e-02 1.03415e-01 1.54480e-01 -1.13821e-01 -2.59800e-02
-9.11840e-02 1.98150e-02 6.13800e-03 4.90270e-02 3.57080e-02
-7.22750e-02 -8.20920e-02 2.53740e-02 -3.39230e-02 1.08040e-02
2.57740e-02 -9.17830e-02 -1.15800e-03 -3.99410e-02 -8.08910e-02
-5.23800e-03 8.22700e-03 -5.80400e-02 3.08770e-02 3.06770e-02
-2.71200e-02 5.21440e-02 6.13250e-02 1.60370e-02 5.94100e-02
-7.47300e-03 -1.66967e-01 -1.03830e-02 -5.26470e-02 5.42000e-02
1.37011e-01 -4.50600e-02 5.02110e-02 -3.77980e-02 3.98710e-02
5.67540e-02 8.18000e-04 2.33870e-02 5.97900e-03 2.37760e-02
8.12210e-02 9.60000e-03 -7.91730e-02 -3.07628e-01 -3.24800e-03
-1.21500e-01 -1.31835e-01 -3.74470e-02 -2.80460e-02 1.58474e-01
3.60790e-02 -6.30950e-02 1.06043e-01 -1.70946e-01 2.19370e-02
-4.61270e-02 2.44280e-02 -2.40631e-01 1.39449e-01 -1.03286e-01
6.95440e-02 9.92640e-02 3.34970e-02 -6.07200e-03 7.61150e-02
-1.13122e-01 -4.28310e-02 1.04957e-01 -7.01560e-02 -1.10580e-01
6.40850e-02 7.95360e-02 7.02870e-02 -6.11520e-02 -4.34710e-02
1.92430e-02 -1.34928e-01 9.18830e-02 1.16445e-01 7.33020e-02
-7.95920e-02 4.32490e-02 1.45400e-03 1.32220e-02 -1.55543e-01
8.48930e-02 -1.19340e-01 5.97000e-02 -3.51640e-02 -1.36560e-02
-7.47650e-02 -1.87107e-01 -1.25462e-01 -1.57231e-01 5.17430e-02]]

You need to look at pca.explained_variance_ratio_, which gives the explained variance as a fraction of the total (1.0 corresponds to 100%).
pca.explained_variance_ itself contains the eigenvalues of the covariance matrix, so its sum is not a percentage.
To visualize the data points in 2D, scatter-plot the first and second principal components rather than the explained variance:
# from matplotlib import pyplot as plt
plt.scatter(principalComponents[:, 0], principalComponents[:, 1], marker='x')
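If you want to see how much variance the two components actually keep, sum the ratio rather than the raw eigenvalues, and to decide how many components you need, plot the cumulative ratio. A rough sketch, assuming the standardized x matrix from the question:
# Sketch: fit with all components just to inspect the variance profile
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
pca_full = PCA().fit(x)  # keeps all min(n_samples, n_features) = 100 components
cum_ratio = np.cumsum(pca_full.explained_variance_ratio_)
print(cum_ratio[1])  # fraction of variance kept by the first two components (~0.08 here)
plt.plot(cum_ratio)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance ratio')
plt.show()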

Related

Why does sklearn and numpy disagree about multiplying component of PCA?

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
SEED = 123
X, y = make_blobs(n_samples=1000, n_features=5000, cluster_std=90., random_state=SEED)
pca = PCA(2)
pca.fit(X)
pca1, pca2 = pca.components_
pcaX = pca.transform(X)
pcaXnp = np.array([X @ pca1, X @ pca2]).T
If you print out pcaX and pcaXnp you'll see that they're similar, but they don't agree with each other. Why do these differ? It seems like .components_ should return what sklearn is going to multiply the matrix by; is there a reason why it's only an approximation of that multiplication?
PCA from sklearn.decomposition uses singular value decomposition (SVD) to obtain the principal components. This only works when the columns have first been centered by their means. If you check the source code, the centering is done before the SVD:
def _fit_full(self, X, n_components):
    [...]
    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_
So to get the PCA scores, you need to center your matrix first:
pcaX = pca.transform(X)
Xc = X - X.mean(axis=0)
pcaXnp = np.array([Xc @ pca1, Xc @ pca2]).T
pcaX[:3]
array([[-101.45177987, 212.45583745],
[ 520.84541298, 87.32254399],
[-273.26407231, -318.78493994]])
pcaXnp[:3]
array([[-101.45177987, 212.45583745],
[ 520.84541298, 87.32254399],
[-273.26407231, -318.78493994]])
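Equivalently, the fitted PCA object already stores the column means it subtracted, so you can reuse pca.mean_ instead of recomputing them. A quick check, assuming the pca, X, pca1 and pca2 objects from above:
import numpy as np
Xc = X - pca.mean_                              # the same centering pca.transform applies
pcaXnp = np.array([Xc @ pca1, Xc @ pca2]).T
print(np.allclose(pcaXnp, pca.transform(X)))    # expected: True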

How to find regression curve equation for a fitted PolynomialFeatures model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
data=pd.DataFrame(
{"input":
[0.001,0.015,0.066,0.151,0.266,0.402,0.45,0.499,0.598,0.646,0.738,0.782,0.86,0.894,0.924,0.95],
"output":[0.5263157894736842,0.5789473684210524,0.6315789473684206,0.6842105263157897,
0.6315789473684206, 0.7894736842105263, 0.8421052631578945, 0.7894736842105263, 0.736842105263158,
0.6842105263157897, 0.736842105263158, 0.736842105263158,0.6842105263157897, 0.6842105263157897,
0.6315789473684206,0.5789473684210524]})
I have the above data of inputs and outputs, and I want to fit a curve that properly describes it. A plot of the input and output values is shown first:
I have written this code:
X=data.iloc[:,0].to_numpy()
X=X.reshape(-1,1)
y=data.iloc[:,1].to_numpy()
y=y.reshape(-1,1)
poly=PolynomialFeatures(degree=2)
poly.fit(X,y)
X_poly=poly.transform(X)
reg=LinearRegression().fit(X_poly,y)
plt.scatter(X,y,color="blue")
plt.plot(X,reg.predict(X_poly),color="orange",label="Polynomial Linear Regression")
plt.xlabel("Temperature")
plt.ylabel("Pressure")
plt.legend(loc="upper left")
The plot is:
But I can't find the equation of the above (orange) curve. How can I find it?
Your plot actually corresponds to your code run with
poly=PolynomialFeatures(degree=7)
and not to degree=2. Indeed, running your code with the above change, we get:
Now, your polynomial features are:
poly.get_feature_names()
# ['1', 'x0', 'x0^2', 'x0^3', 'x0^4', 'x0^5', 'x0^6', 'x0^7']
and the respective coefficients of your linear regression are:
reg.coef_
# array([[ 0. , 5.43894411, -68.14277256, 364.28508827,
# -941.70924401, 1254.89358662, -831.27091422, 216.43304954]])
plus the intercept:
reg.intercept_
# array([0.51228593])
Given the above, and setting
coef = reg.coef_[0]
since here we have a single feature in the initial data, your regression equation is:
y = reg.intercept_ + coef[0] + coef[1]*x + coef[2]*x**2 + coef[3]*x**3 + coef[4]*x**4 + coef[5]*x**5 + coef[6]*x**6 + coef[7]*x**7
For visual verification, we can plot the above function with some x data in [0, 1]
x = np.linspace(0, 1, 15)
Running the above expression for y and
plt.plot(x, y)
gives:
Using some randomly generated data x, we can verify that the results of the equation y_eq are indeed equal to the results produced by the regression model y_reg within the limits of numerical precision:
x = np.random.rand(1,10)
y_eq = reg.intercept_ + coef[0] + coef[1]*x + coef[2]*x**2 + coef[3]*x**3 + coef[4]*x**4 + coef[5]*x**5 + coef[6]*x**6 + coef[7]*x**7
y_reg = np.concatenate(reg.predict(poly.transform(x.reshape(-1,1))))
y_eq
# array([[0.72452703, 0.64106819, 0.67394222, 0.71756648, 0.71102853,
# 0.63582055, 0.54243177, 0.71104983, 0.71287962, 0.6311952 ]])
y_reg
# array([0.72452703, 0.64106819, 0.67394222, 0.71756648, 0.71102853,
# 0.63582055, 0.54243177, 0.71104983, 0.71287962, 0.6311952 ])
np.allclose(y_reg, y_eq)
# True
Irrelevant to the question, but I guess you already know that fitting such high-order polynomials to so few data points is not a good idea, and you should probably stick to a low degree of 2 or 3...
Not sure how you produced the plot shown in the question. When I ran your code I got the following (degree=2) polynomial fitted to the data, as expected:
Now that you have fitted the data you can see the coefficients of the model thus:
print(reg.coef_)
print(reg.intercept_)
# [[ 0. 0.85962436 -0.83796885]]
# [0.5523586]
Note that the data that was used to fit this model is equivalent to the following:
X_poly = np.concatenate([np.ones((16,1)), X, X**2], axis=1)
Therefore a single data point is a vector created as follows:
temp = 0.5
x = np.array([1, temp, temp**2]).reshape((1,3))
Your polynomial model is simply a linear model of the polynomial features:
y = A.x + B
or
y = reg.coef_.dot(x.T) + reg.intercept_
print(y) # [[0.77267856]]
Verification:
print(reg.predict(x)) # array([[0.77267856]])
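As a compact way of writing out the degree=2 equation, you could also wrap the coefficients in np.poly1d. A small sketch, assuming the reg fitted on [1, x, x^2] above (the leading zero in reg.coef_ belongs to the constant column, whose contribution is carried by the intercept):
import numpy as np
b = reg.coef_[0]                                 # [0., b1, b2]
p = np.poly1d([b[2], b[1], reg.intercept_[0]])   # b2*x**2 + b1*x + b0
print(p)                                         # prints the fitted equation
print(p(0.5))                                    # ~0.7727, matching reg.predict above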

numpy vectorized approach to regression -multiple dependent columns (x) on single independent columns (y)

consider the below (3, 13) np.array
import numpy as np
import pandas as pd
from scipy.stats import linregress
a = [-0.00845,-0.00568,-0.01286,-0.01302,-0.02212,-0.01501,-0.02132,-0.00783,-0.00942,0.00158,-0.00016,0.01422,0.01241]
b = [0.00115,0.00623,0.00160,0.00660,0.00951,0.01258,0.00787,0.01854,0.01462,0.01479,0.00980,0.00607,-0.00106]
c = [-0.00233,-0.00467,0.00000,0.00000,-0.00952,-0.00949,-0.00958,-0.01696,-0.02212,-0.01006,-0.00270,0.00763,0.01005]
array = np.array([a,b,c])
yvalues = pd.to_datetime(['2019-12-15','2019-12-16','2019-12-17','2019-12-18','2019-12-19','2019-12-22','2019-12-23','2019-12-24',\
'2019-12-25','2019-12-26','2019-12-29','2019-12-30','2019-12-31'], errors='coerce')
I can run the OLS regression on one column at a time successfully, as below:
out = linregress(array[0], y=yvalues.to_julian_date())
print(out)
LinregressResult(slope=329.141087037396, intercept=2458842.411731361, rvalue=0.684426534581417, pvalue=0.009863937200252878, stderr=105.71465449878443)
However, what I wish to accomplish is to run the regression on the whole matrix, with the 'y' variable (yvalues) held constant for all columns, in one go (a loop is a possible solution but tiresome). I tried to extend 'yvalues' to match the array shape with np.tile, but that does not seem to be the right approach. Thank you all for your help.
IIUC you are looking for something like the following list comprehension in a vectorized way:
out = [linregress(array[i], y=yvalues.to_julian_date()) for i in range(array.shape[0])]
out
[LinregressResult(slope=329.141087037396, intercept=2458842.411731361, rvalue=0.684426534581417, pvalue=0.009863937200252876, stderr=105.71465449878443),
LinregressResult(slope=178.44888292241782, intercept=2458838.7056912296, rvalue=0.1911788042719021, pvalue=0.5315353013148307, stderr=276.24376878908953),
LinregressResult(slope=106.86168938856262, intercept=2458840.7656617565, rvalue=0.17721031419860186, pvalue=0.5624701260912525, stderr=178.940293876864)]
To be honest I've never seen what you are looking for implemented using scipy or statsmodels functionalities.
Therefore we can implement it ourselves exploiting numpy broadcasting:
x = array
y = np.array(yvalues.to_julian_date())
# means of the inputs and of the output
x_mean = np.mean(x, axis=1)
y_mean = np.mean(y)
# least-squares formulas for the slope and intercept, broadcast across rows
num = np.sum((x - x_mean[:, np.newaxis]) * (y - y_mean)[np.newaxis, :], axis=1)
den = np.sum((x - x_mean[:, np.newaxis])**2, axis=1)
slopes = num / den
intercepts = y_mean - slopes * x_mean
slopes
array([329.14108704, 178.44888292, 106.86168939])
intercepts
array([2458842.41173136, 2458838.70569123, 2458840.76566176])
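As a sanity check, the broadcasted slopes and intercepts can be compared with the per-row linregress results from the list comprehension above (a quick sketch, assuming array, yvalues, slopes and intercepts are still in scope):
out = [linregress(array[i], y=yvalues.to_julian_date()) for i in range(array.shape[0])]
print(np.allclose(slopes, [r.slope for r in out]))          # expected: True
print(np.allclose(intercepts, [r.intercept for r in out]))  # expected: True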

power-law curve fitting scipy, numpy not working

I ran into a problem when fitting a power-law curve to my data. I have two data sets: bins1 and bins2.
bins1 behaves fine in the curve fit using numpy.linalg.lstsq (I then use np.exp(coefs[0])*x**coefs[1] to get the power-law equation).
bins2, on the other hand, acts weird and shows a bad R-squared.
Both data sets also give different equations from what Excel shows me (and worse R-squared values).
Here is the code (and data):
import numpy as np
import matplotlib.pyplot as plt
bins1 = np.array([[6.769318871738219667e-03,
1.306418618130891773e-02,
1.912138120913448383e-02,
2.545189874466026111e-02,
3.214689891729670401e-02,
4.101898933375244805e-02,
5.129862592803200588e-02,
6.636505322669797313e-02,
8.409809827572585494e-02,
1.058164348650862258e-01,
1.375849753230810046e-01,
1.830664031837437311e-01,
2.682454535427478137e-01,
3.912508246490400410e-01,
5.893271848997768680e-01,
8.480213305038615257e-01,
2.408136266017391058e+00,
3.629192766488219313e+00,
4.639246557509275171e+00,
9.901792214343277720e+00],
[8.501658465758301112e-04,
1.562697718429977012e-03,
1.902062808421856087e-04,
4.411817741488644959e-03,
3.409236963162485048e-03,
1.686099657013027898e-03,
3.643231240239608402e-03,
2.544120616413291154e-04,
2.549036204611017029e-02,
3.527340723977697573e-02,
5.038482027310990652e-02,
5.617932487522721979e-02,
1.620407270423956103e-01,
1.906538999080910068e-01,
3.180688368126549093e-01,
2.364903188268162038e-01,
3.267322385964683273e-01,
9.384571074801122403e-01,
4.419747716107813029e-01,
9.254710022316929852e+00]]).T
bins2 = np.array([[6.522512685133712192e-03,
1.300415548684437199e-02,
1.888928895701269539e-02,
2.509905819337970856e-02,
3.239654633369139919e-02,
4.130706234846069635e-02,
5.123820846515786398e-02,
6.444380072984744190e-02,
8.235238352205621892e-02,
1.070907072127811749e-01,
1.403438221033725120e-01,
1.863115065963684147e-01,
2.670209758710758163e-01,
4.003337413814173074e-01,
6.549054078382223754e-01,
1.116611087124244062e+00,
2.438604844718367914e+00,
3.480674117919704269e+00,
4.410201659398489404e+00,
6.401903059926267403e+00],
[1.793454543936148608e-03,
2.441092334386309615e-03,
2.754373929745804715e-03,
1.182752729942167062e-03,
1.357797177773524414e-03,
6.711673916715021199e-03,
1.392761674092503343e-02,
1.127957613093066511e-02,
7.928803089359596004e-03,
2.524609593305639915e-02,
5.698702885370290905e-02,
8.607729156137132465e-02,
2.453761830112021203e-01,
9.734443815196883176e-02,
1.487480479168299119e-01,
9.918002699934079791e-01,
1.121298151253063535e+00,
1.389239135742518227e+00,
4.254082922056571237e-01,
2.643453492951096440e+00]]).T
bins = bins1 #change to bins2 to see results for bins2
def fit(x, a, m):  # power-law fit (based on previous studies)
    return a*(x**m)
coefs= np.linalg.lstsq(np.vstack([np.ones(len(bins[:,0])), np.log(bins[:,0]), bins[:,0]]).T, np.log(bins[:,1]))[0] # calculating fitting coefficients (a,m)
y_predict = fit(bins[:,0],np.exp(coefs[0]),coefs[1]) # prediction based of fitted model
model_plot = plt.loglog(bins[:,0],bins[:,1],'o',label="error")
fit_line = plt.plot(bins[:,0],y_predict,'r', label="fit")
plt.ylabel('Y (bins[:,1])')
plt.xlabel('X (bins[:,0])')
plt.title('model')
plt.legend(loc='best')
plt.show(model_plot,fit_line)
def R_sqr(y, y_predict):  # calculating R squared value to measure fitting accuracy
    rsdl = y - y_predict
    ss_res = np.sum(rsdl**2)
    ss_tot = np.sum((y - np.mean(y))**2)
    R2 = 1 - (ss_res/ss_tot)
    R2 = np.around(R2, decimals=4)
    return R2
R2= R_sqr(bins[:,1],y_predict)
print ('(R^2 = %s)' % (R2))
The fit formula for bins1[[x],[y]]: python: y = 0.337*(x)^1.223 (R^2 = 0.7773), excel: y = 0.289*(x)^1.174 (R^2 = 0.8548)
The fit formula for bins2[[x],[y]]: python: y = 0.509*(x)^1.332 (R^2 = -1.753), excel: y = 0.311*(x)^1.174 (R^2 = 0.9116)
These are two sample data sets out of 30; I randomly see this fitting problem in my data, and some sets have an R-squared around -150!
I tried scipy's curve_fit but I didn't get better results; in fact, they were worse!
Does anyone know how to get an Excel-like fit in Python?
You are trying to calculate an R-squared using Y's that have not been converted to log-space. The following change gives reasonable R-squared values:
R2 = R_sqr(np.log(bins[:,1]), np.log(y_predict))
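Applied to the code in the question, this is just a change to the final R-squared call, since the least-squares fit itself was done on the log-transformed values. A minimal sketch, assuming bins, y_predict and R_sqr from the question are still in scope:
R2_linear = R_sqr(bins[:, 1], y_predict)                # what the question computes
R2_log = R_sqr(np.log(bins[:, 1]), np.log(y_predict))   # R^2 in the log-log space of the fit
print('linear-space R^2: %s, log-space R^2: %s' % (R2_linear, R2_log))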

python scikit-learn clustering with missing data

I want to cluster data with missing columns. Doing it manually, I would simply compute the distance without a column whenever its value is missing.
With scikit-learn, missing data is not possible, and there is no way to specify a user-defined distance function.
Is there any chance to cluster with missing data?
Example data:
import numpy as np
from sklearn.datasets import make_swiss_roll

n_samples = 1500
noise = 0.05
X, _ = make_swiss_roll(n_samples, noise=noise)
rnd = np.random.rand(X.shape[0], X.shape[1])
X[rnd < 0.1] = np.nan
I think you can use an iterative EM-type algorithm:
- Initialize missing values to their column means
- Repeat until convergence:
  - Perform K-means clustering on the filled-in data
  - Set the missing values to the centroid coordinates of the clusters to which they were assigned
Implementation
import numpy as np
from sklearn.cluster import KMeans
def kmeans_missing(X, n_clusters, max_iter=10):
    """Perform K-Means clustering on data with missing values.

    Args:
        X: An [n_samples, n_features] array of data to cluster.
        n_clusters: Number of clusters to form.
        max_iter: Maximum number of EM iterations to perform.

    Returns:
        labels: An [n_samples] vector of integer labels.
        centroids: An [n_clusters, n_features] array of cluster centroids.
        X_hat: Copy of X with the missing values filled in.
    """
    # Initialize missing values to their column means
    missing = ~np.isfinite(X)
    mu = np.nanmean(X, axis=0, keepdims=True)
    X_hat = np.where(missing, mu, X)

    for i in range(max_iter):
        if i > 0:
            # Initialize KMeans with the previous set of centroids. This is much
            # faster and makes it easier to check convergence (since labels
            # won't be permuted on every iteration), but might be more prone to
            # getting stuck in local minima.
            cls = KMeans(n_clusters, init=prev_centroids)
        else:
            # do multiple random initializations
            cls = KMeans(n_clusters)

        # perform clustering on the filled-in data
        labels = cls.fit_predict(X_hat)
        centroids = cls.cluster_centers_

        # fill in the missing values based on their cluster centroids
        X_hat[missing] = centroids[labels][missing]

        # when the labels have stopped changing we have converged
        if i > 0 and np.all(labels == prev_labels):
            break

        prev_labels = labels
        prev_centroids = cls.cluster_centers_

    return labels, centroids, X_hat
Example with fake data
from sklearn.datasets import make_blobs
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

def make_fake_data(fraction_missing, n_clusters=5, n_samples=1500,
                   n_features=3, seed=None):
    # complete data
    gen = np.random.RandomState(seed)
    X, true_labels = make_blobs(n_samples=n_samples, n_features=n_features,
                                centers=n_clusters, random_state=gen)
    # with missing values
    missing = gen.rand(*X.shape) < fraction_missing
    Xm = np.where(missing, np.nan, X)
    return X, true_labels, Xm

X, true_labels, Xm = make_fake_data(fraction_missing=0.3, n_clusters=5, seed=0)
labels, centroids, X_hat = kmeans_missing(Xm, n_clusters=5)

# plot the inferred points, color-coded according to the true cluster labels
fig, ax = plt.subplots(1, 2, subplot_kw={'projection': '3d', 'aspect': 'equal'})
ax[0].scatter3D(X[:, 0], X[:, 1], X[:, 2], c=true_labels, cmap='gist_rainbow')
ax[1].scatter3D(X_hat[:, 0], X_hat[:, 1], X_hat[:, 2], c=true_labels,
                cmap='gist_rainbow')
ax[0].set_title('Original data')
ax[1].set_title('Imputed (30% missing values)')
fig.tight_layout()
Benchmark
To assess the algorithm's performance, we can use the adjusted mutual information between the true and inferred cluster labels. A score of 1 is perfect performance and 0 represents chance:
from sklearn.metrics import adjusted_mutual_info_score

fraction = np.arange(0.0, 1.0, 0.05)
n_repeat = 10
scores = np.empty((2, fraction.shape[0], n_repeat))
for i, frac in enumerate(fraction):
    for j in range(n_repeat):
        X, true_labels, Xm = make_fake_data(fraction_missing=frac, n_clusters=5)
        labels, centroids, X_hat = kmeans_missing(Xm, n_clusters=5)
        any_missing = np.any(~np.isfinite(Xm), 1)
        scores[0, i, j] = adjusted_mutual_info_score(labels, true_labels)
        scores[1, i, j] = adjusted_mutual_info_score(labels[any_missing],
                                                     true_labels[any_missing])

fig, ax = plt.subplots(1, 1)
scores_all, scores_missing = scores
ax.errorbar(fraction * 100, scores_all.mean(-1),
            yerr=scores_all.std(-1), label='All labels')
ax.errorbar(fraction * 100, scores_missing.mean(-1),
            yerr=scores_missing.std(-1),
            label='Labels with missing values')
ax.set_xlabel('% missing values')
ax.set_ylabel('Adjusted mutual information')
ax.legend(loc='best', frameon=False)
ax.set_ylim(0, 1)
ax.set_xlim(-5, 100)
Update:
In fact, after a quick Google search it seems that what I've come up with above is pretty much the same as the k-POD algorithm for K-means clustering of missing data (Chi, Chi & Baraniuk, 2016).
Here is a different algorithm that I use. Instead of replacing the missing values, they are ignored, and in order to capture the differences between missing and non-missing entries I implement missing-value dummies.
Compared to Ali's algorithm, it seems easier for observations with missing values to jump from class to class, since I do not fill in the missing values.
I unfortunately did not have the time to compare it using Ali's beautiful code, but feel free to do it (I might do it when I get the time) and contribute to the discussion about the best method.
import numpy as np

class kmeans_missing(object):
    def __init__(self, potential_centroids, n_clusters):
        # initialize with potential centroids
        self.n_clusters = n_clusters
        self.potential_centroids = potential_centroids

    def fit(self, data, max_iter=10, number_of_runs=1):
        n_clusters = self.n_clusters
        potential_centroids = self.potential_centroids

        dist_mat = np.zeros((data.shape[0], n_clusters))
        all_centroids = np.zeros((n_clusters, data.shape[1], number_of_runs))
        costs = np.zeros((number_of_runs,))
        for k in range(number_of_runs):
            idx = np.random.choice(range(potential_centroids.shape[0]), size=(n_clusters), replace=False)
            centroids = potential_centroids[idx]
            clusters = np.zeros(data.shape[0])
            old_clusters = np.zeros(data.shape[0])
            for i in range(max_iter):
                # calculate distances to the centroids, ignoring missing values
                for j in range(n_clusters):
                    dist_mat[:, j] = np.nansum((data - centroids[j])**2, axis=1)
                # assign to clusters
                clusters = np.argmin(dist_mat, axis=1)
                # update the centroids
                for j in range(n_clusters):
                    centroids[j] = np.nanmean(data[clusters == j], axis=0)
                if all(np.equal(clusters, old_clusters)):
                    break  # break when there is no change in the clusters
                if i == max_iter - 1:
                    print('no convergence before maximal iterations are reached')
                else:
                    clusters, old_clusters = old_clusters, clusters

            all_centroids[:, :, k] = centroids
            costs[k] = np.mean(np.min(dist_mat, axis=1))

        self.costs = costs
        self.cost = np.min(costs)
        self.best_model = np.argmin(costs)
        self.centroids = all_centroids[:, :, self.best_model]
        self.all_centroids = all_centroids

    def predict(self, data):
        dist_mat = np.zeros((data.shape[0], self.n_clusters))
        for j in range(self.n_clusters):
            dist_mat[:, j] = np.nansum((data - self.centroids[j])**2, axis=1)
        prediction = np.argmin(dist_mat, axis=1)
        cost = np.min(dist_mat, axis=1)
        return prediction, cost
Here is an example of how it might be useful.
from sklearn.datasets import make_blobs
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from kmeans_missing import *

def make_fake_data(fraction_missing, n_clusters=5, n_samples=1500,
                   n_features=2, seed=None):
    # complete data
    gen = np.random.RandomState(seed)
    X, true_labels = make_blobs(n_samples=n_samples, n_features=n_features,
                                centers=n_clusters, random_state=gen)
    # with missing values
    missing = gen.rand(*X.shape) < fraction_missing
    Xm = np.where(missing, np.nan, X)
    return X, true_labels, Xm

X, true_labels, X_hat = make_fake_data(fraction_missing=0.3, n_clusters=3, seed=0)
X_missing_dummies = np.isnan(X_hat)
n_clusters = 3
X_hat = np.concatenate((X_hat, X_missing_dummies), axis=1)
kmeans_m = kmeans_missing(X_hat, n_clusters)
kmeans_m.fit(X_hat, max_iter=100, number_of_runs=10)
print(kmeans_m.costs)
prediction, cost = kmeans_m.predict(X_hat)

for i in range(n_clusters):
    print([np.mean((prediction == i)*(true_labels == j)) for j in range(3)], np.mean((prediction == i)))
--EDIT--
In this example the occurrences of missing values are completely random, and when that is the case, not adding the missing-value dummies performs better, since the dummies are then just noise. Not including them would also be the correct thing to do in order to compare with Ali's algorithm.
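If you want to try the dummy-free variant mentioned above, it is only a matter of skipping the concatenation step. A small sketch, assuming make_fake_data and the kmeans_missing class from this answer:
X, true_labels, X_hat = make_fake_data(fraction_missing=0.3, n_clusters=3, seed=0)
kmeans_m = kmeans_missing(X_hat, n_clusters=3)   # no missing-value dummies appended
kmeans_m.fit(X_hat, max_iter=100, number_of_runs=10)
prediction, cost = kmeans_m.predict(X_hat)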
