I want to have the legend of the plot shown with the value in a list. But what I get is the element index but not the value itself. I dont know how to fix it. I'm referring to the plt.plot line. Thanks for the help.
import matplotlib.pyplot as plt
import numpy as np
x = np.random.random(1000)
y = np.random.random(1000)
n = len(x)
d_ij = []
for i in range(n):
for j in range(i+1,n):
a = np.sqrt((x[i]-x[j])**2+(y[i]-y[j])**2)
d_ij.append(a)
epsilon = np.linspace(0.01,1,num=10)
sigma = np.linspace(0.01,1,num=10)
def lj_pot(epsi,sig,d):
result = []
for i in range(len(d)):
a = 4*epsi*((sig/d[i])**12-(sig/d[i])**6)
result.append(a)
return result
for i in range(len(epsilon)):
for j in range(len(sigma)):
a = epsilon[i]
b = sigma[j]
plt.cla()
plt.ylim([-1.5, 1.5])
plt.xlim([0, 2])
plt.plot(sorted(d_ij),lj_pot(epsilon[i],sigma[j],sorted(d_ij)),label = 'epsilon = %d, sigma =%d' %(a,b))
plt.legend()
plt.savefig("epsilon_%d_sigma_%d.png" % (i,j))
plt.show()
Your code is a bit unpythonic, so I tried to clean it up to the best of my knowledge. numpy.random.random and numpy.random.uniform(0, 1) are basically the same, however, the latter also allows you to pass the shape of the return array that you would like to have, in this case an array with 1000 rows and two columns (1000, 2). I then use some magic to assign the two colums of the return array to x and y in the same line, respectively.
numpy.hypot does as the name suggests and calculates the hypothenuse of x and y. It can also do that for each entry of arrays with the same size, saving you the for loops, which you should try to aviod in Python since they are pretty slow.
You used plt for all your plotting, which is fine as long as you only have one figure, but I would recommend to be as explicit as possible, according to one of Python's key notions:
explicit is better than implicit.
I recommend you read through this guide, in particular the section called 'Stateful Versus Stateless Approaches'. I changed your commands accordingly.
It is also very unpythonic to loop over items of a list using the index of the item in the list like you did (for i in range(len(list)): item = list[i]). You can just reference the item directly (for item in list:).
Lastly I changed your formatted strings to the more convenient f-strings. Have a read here.
import matplotlib.pyplot as plt
import numpy as np
def pot(epsi, sig, d):
result = 4*epsi*((sig/d)**12 - (sig/d)**6)
return result
# I am not sure why you would create the independent variable this way,
# maybe you are simulating something. In that case, the code below is
# simpler than your version and should achieve the same.
# x, y = zip(*np.random.uniform(0, 1, (1000, 2)))
# d = np.array(sorted(np.hypot(x, y)))
# If you only want to plot your pot function then creating the value range
# like this is just fine.
d = np.linspace(0.001, 1, 1000)
epsilons = sigmas = np.linspace(0.01, 1, num=10)
fig, ax = plt.subplots()
ax.set_xlim([0, 2])
ax.set_ylim([-1.5, 1.5])
line = None
for epsilon in epsilons:
for sigma in sigmas:
if line is None:
line = ax.plot(
d, pot(epsilon, sigma, d),
label=f'epsilon = {epsilon}, sigma = {sigma}'
)[0]
fig.legend()
else:
line.set_data(d, pot(epsilon, sigma, d))
# plt.savefig(f"epsilon_{epsilon}_sigma_{sigma}.png")
fig.show()
I am currently stuck on a problem on which I am required to generate a curve of best fit which I am required to use a more precise x array from 250 to 100 in steps of 10. Here is my code below so far..
import numpy as np
from numpy import polyfit, polyval
import matplotlib.pyplot as plt
x = [250,300,350,400,450,500,550,600,700,750,800,900,1000]
x = np.array(x)
y = [0.791, 0.846, 0.895, 0.939, 0.978, 1.014, 1.046, 1.075, 1.102, 1.148, 1.169, 1.204, 1.234]
y= np.array(y)
r = polyfit(x,y,3)
fit = polyval(r, x)
plt.plot(x, fit, 'b')
plt.plot(x,y, color = 'r', marker = 'x')
plt.show()
If I understand correctly, you are trying to create an array of numbers from a to b by steps of c.
With pure python you can use:
list(range(a, b, c)) #in your case list(range(250, 1000, 10))
Or, since you are using numpy you can directly make the numpy array:
np.arange(a, b, c)
To create an array in steps you can use numpy.arange([start,] stop[, step]):
import numpy as np
x = np.arange(250,1000,10)
To generate values from 250-1000, use range(start, stop, step):
x = range(250,1001,10)
x = np.array(x)
I am reading from a dataset which looks like the following when plotted in matplotlib and then taken the best fit curve using linear regression.
The sample of data looks like following:
# ID X Y px py pz M R
1.04826492772e-05 1.04828050287e-05 1.048233088e-05 0.000107002791008 0.000106552433081 0.000108704469007 387.02 4.81947797625e+13
1.87380963036e-05 1.87370588085e-05 1.87372620448e-05 0.000121616280029 0.000151924707761 0.00012371156585 428.77 6.54636174067e+13
3.95579877816e-05 3.95603773653e-05 3.95610756809e-05 0.000163470663023 0.000265203868883 0.000228031803626 470.74 8.66961875758e+13
My code looks the following:
# Regression Function
def regress(x, y):
#Return a tuple of predicted y values and parameters for linear regression.
p = sp.stats.linregress(x, y)
b1, b0, r, p_val, stderr = p
y_pred = sp.polyval([b1, b0], x)
return y_pred, p
# plotting z
xz, yz = M, Y_z # data, non-transformed
y_pred, _ = regress(xz, np.log(yz)) # change here # transformed input
plt.semilogy(xz, yz, marker='o',color ='b', markersize=4,linestyle='None', label="l.o.s within R500")
plt.semilogy(xz, np.exp(y_pred), "b", label = 'best fit') # transformed output
However I can see a lot upward scatter in the data and the best fit curve is affected by those. So first I want to isolate the data points which are 2 and 3 sigma away from my mean data, and mark them with circle around them.
Then take the best fit curve considering only the points which fall within 1 sigma of my mean data
Is there a good function in python which can do that for me?
Also in addition to that may I also isolate the data from my actual dataset, like if the third row in the sample input represents 2 sigma deviation may I have that row as an output too to save later and investigate more?
Your help is most appreciated.
Here's some code that goes through the data in a given number of windows, calculates statistics in said windows, and separates data in well- and misbehaved lists.
Hope this helps.
from scipy import stats
from scipy import polyval
import numpy as np
import matplotlib.pyplot as plt
num_data = 10000
fake_data_x = np.sort(12.8+np.random.random(num_data))
fake_data_y = np.exp(fake_data_x) + np.random.normal(0,scale=50000,size=num_data)
# Regression Function
def regress(x, y):
#Return a tuple of predicted y values and parameters for linear regression.
p = stats.linregress(x, y)
b1, b0, r, p_val, stderr = p
y_pred = polyval([b1, b0], x)
return y_pred, p
# plotting z
xz, yz = fake_data_x, fake_data_y # data, non-transformed
y_pred, _ = regress(xz, np.log(yz)) # change here # transformed input
plt.figure()
plt.semilogy(xz, yz, marker='o',color ='b', markersize=4,linestyle='None', label="l.o.s within R500")
plt.semilogy(xz, np.exp(y_pred), "b", label = 'best fit') # transformed output
plt.show()
num_bin_intervals = 10 # approx number of averaging windows
window_boundaries = np.linspace(min(fake_data_x),max(fake_data_x),int(len(fake_data_x)/num_bin_intervals)) # window boundaries
y_good = [] # list to collect the "well-behaved" y-axis data
x_good = [] # list to collect the "well-behaved" x-axis data
y_outlier = []
x_outlier = []
for i in range(len(window_boundaries)-1):
# create a boolean mask to select the data within the averaging window
window_indices = (fake_data_x<=window_boundaries[i+1]) & (fake_data_x>window_boundaries[i])
# separate the pieces of data in the window
fake_data_x_slice = fake_data_x[window_indices]
fake_data_y_slice = fake_data_y[window_indices]
# calculate the mean y_value in the window
y_mean = np.mean(fake_data_y_slice)
y_std = np.std(fake_data_y_slice)
# choose and select the outliers
y_outliers = fake_data_y_slice[np.abs(fake_data_y_slice-y_mean)>=2*y_std]
x_outliers = fake_data_x_slice[np.abs(fake_data_y_slice-y_mean)>=2*y_std]
# choose and select the good ones
y_goodies = fake_data_y_slice[np.abs(fake_data_y_slice-y_mean)<2*y_std]
x_goodies = fake_data_x_slice[np.abs(fake_data_y_slice-y_mean)<2*y_std]
# extend the lists with all the good and the bad
y_good.extend(list(y_goodies))
y_outlier.extend(list(y_outliers))
x_good.extend(list(x_goodies))
x_outlier.extend(list(x_outliers))
plt.figure()
plt.semilogy(x_good,y_good,'o')
plt.semilogy(x_outlier,y_outlier,'r*')
plt.show()
I have a netcdf file containing global sea-surface temperatures. Using matplotlib and Basemap, I've managed to make a map of this data, with the following code:
from netCDF4 import Dataset
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
filename = '/Users/Nick/Desktop/SST/SST.nc'
fh = Dataset(filename, mode='r')
lons = fh.variables['LON'][:]
lats = fh.variables['LAT'][:]
sst = fh.variables['SST'][:].squeeze()
fig = plt.figure()
m = Basemap(projection='merc', llcrnrlon=80.,llcrnrlat=-25.,urcrnrlon=150.,urcrnrlat=25.,lon_0=115., lat_0=0., resolution='l')
lon, lat = np.meshgrid(lons, lats)
xi, yi = m(lon, lat)
cs = m.pcolormesh(xi,yi,sst, vmin=18, vmax=32)
m.drawmapboundary(fill_color='0.3')
m.fillcontinents(color='0.3', lake_color='0.3')
cbar = m.colorbar(cs, location='bottom', pad="10%", ticks=[18., 20., 22., 24., 26., 28., 30., 32.])
cbar.set_label('January SST (' + u'\u00b0' + 'C)')
plt.savefig('SST.png', dpi=300)
The problem is that the data is very high resolution (9km grid) which makes the resulting image quite noisy. I would like to put the data onto a lower resolution grid (e.g. 1 degree), but I'm struggling to work out how this could be done. I followed a worked solution to try and use the matplotlib griddata function by inserting the code below into my above example, but it resulted in 'ValueError: condition must be a 1-d array'.
xi, yi = np.meshgrid(lons, lats)
X = np.arange(min(x), max(x), 1)
Y = np.arange(min(y), max(y), 1)
Xi, Yi = np.meshgrid(X, Y)
Z = griddata(xi, yi, z, Xi, Yi)
I'm a relative beginner to Python and matplotlib, so I'm not sure what I'm doing wrong (or what a better approach might be). Any advice appreciated!
If you regrid your data to a coarser lat/lon grid using e.g. bilinear interpolation, this will result in a smoother field.
The NCAR ClimateData guide has a nice introduction to regridding (general, not Python-specific).
The most powerful implementation of regridding routines available for Python is, to my knowledge, the Earth System Modeling Framework (ESMF) Python interface (ESMPy). If this is a bit too involved for your application, you should look into
EarthPy tutorials on regridding (e.g. using Pyresample, cKDTree, or Basemap).
Turning your data into an Iris cube and using Iris' regridding functions.
Perhaps start by looking at the EarthPy regridding tutorial using Basemap, since you are using it already.
The way to do this in your example would be
from mpl_toolkits import basemap
from netCDF4 import Dataset
filename = '/Users/Nick/Desktop/SST/SST.nc'
with Dataset(filename, mode='r') as fh:
lons = fh.variables['LON'][:]
lats = fh.variables['LAT'][:]
sst = fh.variables['SST'][:].squeeze()
lons_sub, lats_sub = np.meshgrid(lons[::4], lats[::4])
sst_coarse = basemap.interp(sst, lons, lats, lons_sub, lats_sub, order=1)
This performs bilinear interpolation (order=1) on your SST data onto a sub-sampled grid (every fourth point). Your plot will look more coarse-grained afterwards. If you do not like that, interpolate back onto the original grid with e.g.
sst_smooth = basemap.interp(sst_coarse, lons_sub[0,:], lats_sub[:,0], *np.meshgrid(lons, lats), order=1)
I usually run my data through a Laplace filter for smoothing. Perhaps you could try the function below and see if it helps with your data. The function can be called with or without a mask (e.g land/ocean mask for ocean data points). Hope this helps. T
# Laplace filter for 2D field with/without mask
# M = 1 on - cells used
# M = 0 off - grid cells not used
# Default is without masking
import numpy as np
def laplace_X(F,M):
jmax, imax = F.shape
# Add strips of land
F2 = np.zeros((jmax, imax+2), dtype=F.dtype)
F2[:, 1:-1] = F
M2 = np.zeros((jmax, imax+2), dtype=M.dtype)
M2[:, 1:-1] = M
MS = M2[:, 2:] + M2[:, :-2]
FS = F2[:, 2:]*M2[:, 2:] + F2[:, :-2]*M2[:, :-2]
return np.where(M > 0.5, (1-0.25*MS)*F + 0.25*FS, F)
def laplace_Y(F,M):
jmax, imax = F.shape
# Add strips of land
F2 = np.zeros((jmax+2, imax), dtype=F.dtype)
F2[1:-1, :] = F
M2 = np.zeros((jmax+2, imax), dtype=M.dtype)
M2[1:-1, :] = M
MS = M2[2:, :] + M2[:-2, :]
FS = F2[2:, :]*M2[2:, :] + F2[:-2, :]*M2[:-2, :]
return np.where(M > 0.5, (1-0.25*MS)*F + 0.25*FS, F)
# The mask may cause laplace_X and laplace_Y to not commute
# Take average of both directions
def laplace_filter(F, M=None):
if M == None:
M = np.ones_like(F)
return 0.5*(laplace_X(laplace_Y(F, M), M) +
laplace_Y(laplace_X(F, M), M))
To answer your original question regarding scipy.interpolate.griddata, too:
Have a close look at the parameter specs for that function (e.g. in the SciPy documentation) and make sure that your input arrays have the right shapes. You might need to do something like
import numpy as np
points = np.vstack([a.flat for a in np.meshgrid(lons,lats)]).T # (n,D)
values = sst.ravel() # (n)
etc.
If you are working on Linux, you can achieve this using nctoolkit (https://nctoolkit.readthedocs.io/en/latest/).
You have not stated the latlon extent of your data, so I will assume it is a global dataset. Regridding to 1 degree resolution would require the following:
import nctoolkit as nc
filename = '/Users/Nick/Desktop/SST/SST.nc'
data = nc.open_data(filename)
data.to_latlon(lon = [-179.5, 179.5], lat = [-89.5, 89.5], res = [1,1])
# visualize the data
data.plot()
Look at this example with xarray...
use the ds.interp method and specify the new latitude and longitude values.
http://xarray.pydata.org/en/stable/interpolation.html#example
I have a (26424 x 144) array and I want to perform PCA over it using Python. However, there is no particular place on the web that explains about how to achieve this task (There are some sites which just do PCA according to their own - there is no generalized way of doing so that I can find). Anybody with any sort of help will do great.
I posted my answer even though another answer has already been accepted; the accepted answer relies on a deprecated function; additionally, this deprecated function is based on Singular Value Decomposition (SVD), which (although perfectly valid) is the much more memory- and processor-intensive of the two general techniques for calculating PCA. This is particularly relevant here because of the size of the data array in the OP. Using covariance-based PCA, the array used in the computation flow is just 144 x 144, rather than 26424 x 144 (the dimensions of the original data array).
Here's a simple working implementation of PCA using the linalg module from SciPy. Because this implementation first calculates the covariance matrix, and then performs all subsequent calculations on this array, it uses far less memory than SVD-based PCA.
(the linalg module in NumPy can also be used with no change in the code below aside from the import statement, which would be from numpy import linalg as LA.)
The two key steps in this PCA implementation are:
calculating the covariance matrix; and
taking the eivenvectors & eigenvalues of this cov matrix
In the function below, the parameter dims_rescaled_data refers to the desired number of dimensions in the rescaled data matrix; this parameter has a default value of just two dimensions, but the code below isn't limited to two but it could be any value less than the column number of the original data array.
def PCA(data, dims_rescaled_data=2):
"""
returns: data transformed in 2 dims/columns + regenerated original data
pass in: data as 2D NumPy array
"""
import numpy as NP
from scipy import linalg as LA
m, n = data.shape
# mean center the data
data -= data.mean(axis=0)
# calculate the covariance matrix
R = NP.cov(data, rowvar=False)
# calculate eigenvectors & eigenvalues of the covariance matrix
# use 'eigh' rather than 'eig' since R is symmetric,
# the performance gain is substantial
evals, evecs = LA.eigh(R)
# sort eigenvalue in decreasing order
idx = NP.argsort(evals)[::-1]
evecs = evecs[:,idx]
# sort eigenvectors according to same index
evals = evals[idx]
# select the first n eigenvectors (n is desired dimension
# of rescaled data array, or dims_rescaled_data)
evecs = evecs[:, :dims_rescaled_data]
# carry out the transformation on the data using eigenvectors
# and return the re-scaled data, eigenvalues, and eigenvectors
return NP.dot(evecs.T, data.T).T, evals, evecs
def test_PCA(data, dims_rescaled_data=2):
'''
test by attempting to recover original data array from
the eigenvectors of its covariance matrix & comparing that
'recovered' array with the original data
'''
_ , _ , eigenvectors = PCA(data, dim_rescaled_data=2)
data_recovered = NP.dot(eigenvectors, m).T
data_recovered += data_recovered.mean(axis=0)
assert NP.allclose(data, data_recovered)
def plot_pca(data):
from matplotlib import pyplot as MPL
clr1 = '#2026B2'
fig = MPL.figure()
ax1 = fig.add_subplot(111)
data_resc, data_orig = PCA(data)
ax1.plot(data_resc[:, 0], data_resc[:, 1], '.', mfc=clr1, mec=clr1)
MPL.show()
>>> # iris, probably the most widely used reference data set in ML
>>> df = "~/iris.csv"
>>> data = NP.loadtxt(df, delimiter=',')
>>> # remove class labels
>>> data = data[:,:-1]
>>> plot_pca(data)
The plot below is a visual representation of this PCA function on the iris data. As you can see, a 2D transformation cleanly separates class I from class II and class III (but not class II from class III, which in fact requires another dimension).
You can find a PCA function in the matplotlib module:
import numpy as np
from matplotlib.mlab import PCA
data = np.array(np.random.randint(10,size=(10,3)))
results = PCA(data)
results will store the various parameters of the PCA.
It is from the mlab part of matplotlib, which is the compatibility layer with the MATLAB syntax
EDIT:
on the blog nextgenetics I found a wonderful demonstration of how to perform and display a PCA with the matplotlib mlab module, have fun and check that blog!
Another Python PCA using numpy. The same idea as #doug but that one didn't run.
from numpy import array, dot, mean, std, empty, argsort
from numpy.linalg import eigh, solve
from numpy.random import randn
from matplotlib.pyplot import subplots, show
def cov(X):
"""
Covariance matrix
note: specifically for mean-centered data
note: numpy's `cov` uses N-1 as normalization
"""
return dot(X.T, X) / X.shape[0]
# N = data.shape[1]
# C = empty((N, N))
# for j in range(N):
# C[j, j] = mean(data[:, j] * data[:, j])
# for k in range(j + 1, N):
# C[j, k] = C[k, j] = mean(data[:, j] * data[:, k])
# return C
def pca(data, pc_count = None):
"""
Principal component analysis using eigenvalues
note: this mean-centers and auto-scales the data (in-place)
"""
data -= mean(data, 0)
data /= std(data, 0)
C = cov(data)
E, V = eigh(C)
key = argsort(E)[::-1][:pc_count]
E, V = E[key], V[:, key]
U = dot(data, V) # used to be dot(V.T, data.T).T
return U, E, V
""" test data """
data = array([randn(8) for k in range(150)])
data[:50, 2:4] += 5
data[50:, 2:5] += 5
""" visualize """
trans = pca(data, 3)[0]
fig, (ax1, ax2) = subplots(1, 2)
ax1.scatter(data[:50, 0], data[:50, 1], c = 'r')
ax1.scatter(data[50:, 0], data[50:, 1], c = 'b')
ax2.scatter(trans[:50, 0], trans[:50, 1], c = 'r')
ax2.scatter(trans[50:, 0], trans[50:, 1], c = 'b')
show()
Which yields the same thing as the much shorter
from sklearn.decomposition import PCA
def pca2(data, pc_count = None):
return PCA(n_components = 4).fit_transform(data)
As I understand it, using eigenvalues (first way) is better for high-dimensional data and fewer samples, whereas using Singular value decomposition is better if you have more samples than dimensions.
This is a job for numpy.
And here's a tutorial demonstrating how pincipal component analysis can be done using numpy's built-in modules like mean,cov,double,cumsum,dot,linalg,array,rank.
http://glowingpython.blogspot.sg/2011/07/principal-component-analysis-with-numpy.html
Notice that scipy also has a long explanation here
- https://github.com/scikit-learn/scikit-learn/blob/babe4a5d0637ca172d47e1dfdd2f6f3c3ecb28db/scikits/learn/utils/extmath.py#L105
with the scikit-learn library having more code examples -
https://github.com/scikit-learn/scikit-learn/blob/babe4a5d0637ca172d47e1dfdd2f6f3c3ecb28db/scikits/learn/utils/extmath.py#L105
Here are scikit-learn options. With both methods, StandardScaler was used because PCA is effected by scale
Method 1: Have scikit-learn choose the minimum number of principal components such that at least x% (90% in example below) of the variance is retained.
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
iris = load_iris()
# mean-centers and auto-scales the data
standardizedData = StandardScaler().fit_transform(iris.data)
pca = PCA(.90)
principalComponents = pca.fit_transform(X = standardizedData)
# To get how many principal components was chosen
print(pca.n_components_)
Method 2: Choose the number of principal components (in this case, 2 was chosen)
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
iris = load_iris()
standardizedData = StandardScaler().fit_transform(iris.data)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X = standardizedData)
# to get how much variance was retained
print(pca.explained_variance_ratio_.sum())
Source: https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
UPDATE: matplotlib.mlab.PCA is since release 2.2 (2018-03-06) indeed deprecated.
The library matplotlib.mlab.PCA (used in this answer) is not deprecated. So for all the folks arriving here via Google, I'll post a complete working example tested with Python 2.7.
Use the following code with care as it uses a now deprecated library!
from matplotlib.mlab import PCA
import numpy
data = numpy.array( [[3,2,5], [-2,1,6], [-1,0,4], [4,3,4], [10,-5,-6]] )
pca = PCA(data)
Now in `pca.Y' is the original data matrix in terms of the principal components basis vectors. More details about the PCA object can be found here.
>>> pca.Y
array([[ 0.67629162, -0.49384752, 0.14489202],
[ 1.26314784, 0.60164795, 0.02858026],
[ 0.64937611, 0.69057287, -0.06833576],
[ 0.60697227, -0.90088738, -0.11194732],
[-3.19578784, 0.10251408, 0.00681079]])
You can use matplotlib.pyplot to draw this data, just to convince yourself that the PCA yields "good" results. The names list is just used to annotate our five vectors.
import matplotlib.pyplot
names = [ "A", "B", "C", "D", "E" ]
matplotlib.pyplot.scatter(pca.Y[:,0], pca.Y[:,1])
for label, x, y in zip(names, pca.Y[:,0], pca.Y[:,1]):
matplotlib.pyplot.annotate( label, xy=(x, y), xytext=(-2, 2), textcoords='offset points', ha='right', va='bottom' )
matplotlib.pyplot.show()
Looking at our original vectors we'll see that data[0] ("A") and data[3] ("D") are rather similar as are data[1] ("B") and data[2] ("C"). This is reflected in the 2D plot of our PCA transformed data.
In addition to all the other answers, here is some code to plot the biplot using sklearn and matplotlib.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
iris = datasets.load_iris()
X = iris.data
y = iris.target
#In general a good idea is to scale the data
scaler = StandardScaler()
scaler.fit(X)
X=scaler.transform(X)
pca = PCA()
x_new = pca.fit_transform(X)
def myplot(score,coeff,labels=None):
xs = score[:,0]
ys = score[:,1]
n = coeff.shape[0]
scalex = 1.0/(xs.max() - xs.min())
scaley = 1.0/(ys.max() - ys.min())
plt.scatter(xs * scalex,ys * scaley, c = y)
for i in range(n):
plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = 'r',alpha = 0.5)
if labels is None:
plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, "Var"+str(i+1), color = 'g', ha = 'center', va = 'center')
else:
plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, labels[i], color = 'g', ha = 'center', va = 'center')
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel("PC{}".format(1))
plt.ylabel("PC{}".format(2))
plt.grid()
#Call the function. Use only the 2 PCs.
myplot(x_new[:,0:2],np.transpose(pca.components_[0:2, :]))
plt.show()
I've made a little script for comparing the different PCAs appeared as an answer here:
import numpy as np
from scipy.linalg import svd
shape = (26424, 144)
repeat = 20
pca_components = 2
data = np.array(np.random.randint(255, size=shape)).astype('float64')
# data normalization
# data.dot(data.T)
# (U, s, Va) = svd(data, full_matrices=False)
# data = data / s[0]
from fbpca import diffsnorm
from timeit import default_timer as timer
from scipy.linalg import svd
start = timer()
for i in range(repeat):
(U, s, Va) = svd(data, full_matrices=False)
time = timer() - start
err = diffsnorm(data, U, s, Va)
print('svd time: %.3fms, error: %E' % (time*1000/repeat, err))
from matplotlib.mlab import PCA
start = timer()
_pca = PCA(data)
for i in range(repeat):
U = _pca.project(data)
time = timer() - start
err = diffsnorm(data, U, _pca.fracs, _pca.Wt)
print('matplotlib PCA time: %.3fms, error: %E' % (time*1000/repeat, err))
from fbpca import pca
start = timer()
for i in range(repeat):
(U, s, Va) = pca(data, pca_components, True)
time = timer() - start
err = diffsnorm(data, U, s, Va)
print('facebook pca time: %.3fms, error: %E' % (time*1000/repeat, err))
from sklearn.decomposition import PCA
start = timer()
_pca = PCA(n_components = pca_components)
_pca.fit(data)
for i in range(repeat):
U = _pca.transform(data)
time = timer() - start
err = diffsnorm(data, U, _pca.explained_variance_, _pca.components_)
print('sklearn PCA time: %.3fms, error: %E' % (time*1000/repeat, err))
start = timer()
for i in range(repeat):
(U, s, Va) = pca_mark(data, pca_components)
time = timer() - start
err = diffsnorm(data, U, s, Va.T)
print('pca by Mark time: %.3fms, error: %E' % (time*1000/repeat, err))
start = timer()
for i in range(repeat):
(U, s, Va) = pca_doug(data, pca_components)
time = timer() - start
err = diffsnorm(data, U, s[:pca_components], Va.T)
print('pca by doug time: %.3fms, error: %E' % (time*1000/repeat, err))
pca_mark is the pca in Mark's answer.
pca_doug is the pca in doug's answer.
Here is an example output (but the result depends very much on the data size and pca_components, so I'd recommend to run your own test with your own data. Also, facebook's pca is optimized for normalized data, so it will be faster and more accurate in that case):
svd time: 3212.228ms, error: 1.907320E-10
matplotlib PCA time: 879.210ms, error: 2.478853E+05
facebook pca time: 485.483ms, error: 1.260335E+04
sklearn PCA time: 169.832ms, error: 7.469847E+07
pca by Mark time: 293.758ms, error: 1.713129E+02
pca by doug time: 300.326ms, error: 1.707492E+02
EDIT:
The diffsnorm function from fbpca calculates the spectral-norm error of a Schur decomposition.
This will may be the simplest answer one can find for the PCA including easily understandable steps. Let say we want to retain 2 principal dimensions from the 144 which provides maximum information.
Firstly, convert your 2-D array to a dataframe:
import pandas as pd
# Here X is your array of size (26424 x 144)
data = pd.DataFrame(X)
Then, there are two methods one can go with:
Method 1: Manual calculation
Step 1: Apply column standardization on X
from sklearn import preprocessing
scalar = preprocessing.StandardScaler()
standardized_data = scalar.fit_transform(data)
Step 2: Find Co-variance matrix S of original matrix X
sample_data = standardized_data
covar_matrix = np.cov(sample_data)
Step 3: Find eigen values and eigen vectors of S (here 2D, so 2 of each)
from scipy.linalg import eigh
# eigh() function will provide eigen-values and eigen-vectors for a given matrix.
# eigvals=(low value, high value) takes eigen value numbers in ascending order
values, vectors = eigh(covar_matrix, eigvals=(142,143))
# Converting the eigen vectors into (2,d) shape for easyness of further computations
vectors = vectors.T
Step 4: Transform the data
# Projecting the original data sample on the plane formed by two principal eigen vectors by vector-vector multiplication.
new_coordinates = np.matmul(vectors, sample_data.T)
print(new_coordinates.T)
This new_coordinates.T will be of size (26424 x 2) with 2 principal components.
Method 2: Using Scikit-Learn
Step 1: Apply column standardization on X
from sklearn import preprocessing
scalar = preprocessing.StandardScaler()
standardized_data = scalar.fit_transform(data)
Step 2: Initializing the pca
from sklearn import decomposition
# n_components = numbers of dimenstions you want to retain
pca = decomposition.PCA(n_components=2)
Step 3: Using pca to fit the data
# This line takes care of calculating co-variance matrix, eigen values, eigen vectors and multiplying top 2 eigen vectors with data-matrix X.
pca_data = pca.fit_transform(sample_data)
This pca_data will be of size (26424 x 2) with 2 principal components.
For the sake def plot_pca(data): will work, it is necessary to replace the lines
data_resc, data_orig = PCA(data)
ax1.plot(data_resc[:, 0], data_resc[:, 1], '.', mfc=clr1, mec=clr1)
with lines
newData, data_resc, data_orig = PCA(data)
ax1.plot(newData[:, 0], newData[:, 1], '.', mfc=clr1, mec=clr1)
this sample code loads the Japanese yield curve, and creates PCA components.
It then estimates a given date's move using the PCA and compares it against the actual move.
%matplotlib inline
import numpy as np
import scipy as sc
from scipy import stats
from IPython.display import display, HTML
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
from datetime import timedelta
import quandl as ql
start = "2016-10-04"
end = "2019-10-04"
ql_data = ql.get("MOFJ/INTEREST_RATE_JAPAN", start_date = start, end_date = end).sort_index(ascending= False)
eigVal_, eigVec_ = np.linalg.eig(((ql_data[:300]).diff(-1)*100).cov()) # take latest 300 data-rows and normalize to bp
print('number of PCA are', len(eigVal_))
loc_ = 10
plt.plot(eigVec_[:,0], label = 'PCA1')
plt.plot(eigVec_[:,1], label = 'PCA2')
plt.plot(eigVec_[:,2], label = 'PCA3')
plt.xticks(range(len(eigVec_[:,0])), ql_data.columns)
plt.legend()
plt.show()
x = ql_data.diff(-1).iloc[loc_].values * 100 # set the differences
x_ = x[:,np.newaxis]
a1, _, _, _ = np.linalg.lstsq(eigVec_[:,0][:, np.newaxis], x_) # linear regression without intercept
a2, _, _, _ = np.linalg.lstsq(eigVec_[:,1][:, np.newaxis], x_)
a3, _, _, _ = np.linalg.lstsq(eigVec_[:,2][:, np.newaxis], x_)
pca_mv = m1 * eigVec_[:,0] + m2 * eigVec_[:,1] + m3 * eigVec_[:,2] + c1 + c2 + c3
pca_MV = a1[0][0] * eigVec_[:,0] + a2[0][0] * eigVec_[:,1] + a3[0][0] * eigVec_[:,2]
pca_mV = b1 * eigVec_[:,0] + b2 * eigVec_[:,1] + b3 * eigVec_[:,2]
display(pd.DataFrame([eigVec_[:,0], eigVec_[:,1], eigVec_[:,2], x, pca_MV]))
print('PCA1 regression is', a1, a2, a3)
plt.plot(pca_MV)
plt.title('this is with regression and no intercept')
plt.plot(ql_data.diff(-1).iloc[loc_].values * 100, )
plt.title('this is with actual moves')
plt.show()