In SelectKBest, what does length of get_support() represent? - python

When reproducing this cross-validation example, I get for a 2x4 train matrix (xtrain) a len(b.get_support()) of 1 000 000. Does this mean 1 000 000 features have been created in the model? Or only 2, as the number of features that have an impact is 2. Thanks!
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
### create data
def hidden_model(x):
    """Return the noisy target ``x[:, 5] + x[:, 10]`` for a sample matrix.

    Args:
        x: 2-D float array with at least 11 columns, one row per observation.

    Returns:
        1-D array with one value per row of ``x``.
    """
    # y is a linear combination of columns 5 and 10...
    result = x[:, 5] + x[:, 10]
    # ... with a little noise
    result += np.random.normal(0, .005, result.shape)
    return result
def make_x(nobs):
    """Return an ``(nobs, 10**6)`` matrix of uniform random samples from [0, 3)."""
    return np.random.uniform(0, 3, (nobs, 10 ** 6))
x = make_x(20)
y = hidden_model(x)
scores = []
clf = LinearRegression()
for train, test in KFold(len(y), n_folds=5):
    xtrain, xtest, ytrain, ytest = x[train], x[test], y[train], y[test]
    # Feature selection is fitted on the training fold only.
    b = SelectKBest(f_regression, k=2)
    b.fit(xtrain, ytrain)
    # get_support: get mask or integer index of selected features.
    # The mask has one entry per ORIGINAL column; only k of them are True.
    mask = b.get_support()
    xtrain = xtrain[:, mask]
    xtest = xtest[:, mask]
    print(len(mask))
    clf.fit(xtrain, ytrain)
    scores.append(clf.score(xtest, ytest))
    yp = clf.predict(xtest)
    plt.plot(yp, ytest, 'o')
    plt.plot(ytest, ytest, 'r-')
plt.xlabel('Predicted')
plt.ylabel('Observed')
print("CV Score (R_square) is", np.mean(scores))

It represents the mask that can be applied to your x to get the features that have been selected using the SelectKBest routine.
print(x.shape)
print(b.get_support().shape)
# bincount over the boolean mask: [number of False, number of True]
print(np.bincount(b.get_support()))
Outputs:
(20, 1000000)
(1000000,)
[999998 2]
Which shows you have 20 examples of 1000000 dimensional data, a boolean array of length 1000000 of which only two are ones.
Hope that helps!

Related

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

The program draws the graph, but it does not print the results to the console because an error is raised at line 103: "IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices". How do I fix this?
This code performs the classification of heart disease by separating the predicted values in two sets, namely 0 for absence and 1 for presence where all the predicted values between 1 and 4 are replaced to 1 to check the model performance
from numpy import genfromtxt
import numpy as np
import matplotlib
matplotlib.use('TKAgg')
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
import pylab as pl
from itertools import cycle
from sklearn import cross_validation
from sklearn.svm import SVC
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.backends.backend_qt5agg import NavigationToolbar2QT as NavigationToolbar
import matplotlib.pyplot as plt
#Loading and pruning the data
dataset = genfromtxt('cleveland_data.csv',dtype = float, delimiter=',')
#print dataset
X = dataset[:,0:12] #Feature Set
y = dataset[:,13] #Label Set
# Replacing 1-4 by 1 label: every non-zero entry of y becomes 1, in place
y[y != 0.0] = 1
print(y)
target_names = ['0', '1']
#Method to plot the graph for reduced Dimesions
def plot_2D(data, target, target_names):
    """Scatter the first two columns of ``data``, one colour per class, and save the figure.

    Args:
        data: 2-D array; column 0 is plotted on the x axis, column 1 on the y axis.
        target: array of class ids aligned with the rows of ``data``; rows are
            selected with ``target == i`` for ``i`` in ``range(len(target_names))``.
        target_names: legend label for each class id, in id order.

    The figure is written to ``'Problem 2 Graph'`` via ``plt.savefig``.
    """
    colors = cycle('rgbcmykw')
    target_ids = range(len(target_names))
    plt.figure()
    for i, c, label in zip(target_ids, colors, target_names):
        plt.scatter(data[target == i, 0], data[target == i, 1],
                    c=c, label=label)
    plt.legend()
    plt.savefig('Problem 2 Graph')
# Classifying the data using a Linear SVM and predicting the probability of disease belonging to a particular class
modelSVM = LinearSVC(C=0.001)
pca = PCA(n_components=5, whiten=True).fit(X)
X_new = pca.transform(X)
# calling plot_2D
plot_2D(X_new, y, target_names)
#Applying cross validation on the training and test set for validating our Linear SVM Model
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_new, y, test_size=0.2, train_size=0.8, random_state=0)
modelSVM = modelSVM.fit(X_train, y_train)
print("Linear SVC values with split")
print(modelSVM.score(X_test, y_test))
modelSVMRaw = LinearSVC(C=0.001)
modelSVMRaw = modelSVMRaw.fit(X_new, y)
# Intended to count how many predictions on X_new agree with the labels in y.
# NOTE(review): `i` is a predicted label, not a position. y is loaded with
# dtype=float, so the prediction is presumably a float as well, and `y[i]` is
# then what raises the reported IndexError — confirm.
cnt = 0
for i in modelSVMRaw.predict(X_new):
if i == y[i]:
cnt = cnt+1
print("Linear SVC score without split")
# NOTE(review): `//` is floor division, so this prints a whole number, not a fraction.
print(int(cnt)//303)
#Applying the Principal Component Analysis on the data features
modelSVM2 = SVC(C=0.001,kernel='rbf')
#Applying cross validation on the training and test set for validating our Linear SVM Model
X_train1, X_test1, y_train1, y_test1 = cross_validation.train_test_split(X_new, y, test_size=0.2, train_size=0.8, random_state=0)
modelSVM2 = modelSVM2.fit(X_train1, y_train1)
print("RBF score with split")
print(modelSVM2.score(X_test1, y_test1))
modelSVM2Raw = SVC(C=0.001,kernel='rbf')
modelSVM2Raw = modelSVM2Raw.fit(X_new, y)
cnt1 = 0
for i in modelSVM2Raw.predict(X_new):
if i == y[i]:
cnt1 = cnt1+1
print("RBF score without split")
print(float(cnt1)//303)
#Using Stratified K Fold
skf = cross_validation.StratifiedKFold(y, n_folds=5)
for train_index, test_index in skf:
# print("TRAIN:", train_index, "TEST:", test_index)
X_train3, X_test3 = X[train_index], X[test_index]
y_train3, y_test3 = y[train_index], y[test_index]
modelSVM3 = SVC(C=0.001,kernel='rbf')
modelSVM3 = modelSVM3.fit(X_train3, y_train3)
print("Stratified K fold score")
print(modelSVM3.score(X_test3, y_test3))
modelSVM3Raw = SVC(C=0.001,kernel='rbf')
modelSVM3Raw = modelSVM3Raw.fit(X_new, y)
cnt2 = 0
for i in modelSVM3Raw.predict(X_new):
if i == y[i]:
cnt2 = cnt2+1
print("On PCA valued X_new")
print(float(cnt2)//303)
#Text interpretation
fig = plt.figure(figsize=(5, 1.5))
t = fig.text(0.5, 0.5, 'Problem 1 \nTesting Linear SVC values using Split \n0.5491803278688525 \nTesting with RBF using split \n0.4918032786885246 \nTesting using stratified with K folds \n0.5423728813559322',
ha='center', va='center', size=15)
text.set_path_effects([path_effects.path_2d()])
plt.show()
The program only runs the graph but doesn't show its results. I expect the console to print the results/interpretation of the SVM models and their values.

Expected 2D array error not getting resolved

I am trying to use my machine learning model on a dataset where I have only two columns. While standard-scaling them, I got an error saying that a 2D array was expected but a 1D array was given.
Below is the code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values
# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y)
# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X, y)
# Predicting a new result
y_pred = regressor.predict(6.5)
y_pred = sc_y.inverse_transform(y_pred)
# Visualising the SVR results
plt.scatter(X, y, color = 'red')
plt.plot(X, regressor.predict(X), color = 'blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
when i try to put
y = sc_y.fit_transform([y])
like this i received no error but when i execute next 3 lines i receive another error.
which is bad input shape (1, 10)
can anyone help me on this?
The StandardScaler() function in sklearn expects the input(X) to be in the following format:
X : numpy array of shape [n_samples, n_features]
So, reshape X to (-1,1) if you have only one feature column.
# reshape is a method: call it with parentheses, do not index it with brackets
X = sc_X.fit_transform(X.reshape(-1, 1))
# In the question the 1-D array is y, so it needs the same reshape; ravel()
# turns the scaled column back into the 1-D target that SVR.fit expects.
y = sc_y.fit_transform(y.reshape(-1, 1)).ravel()
This should work!

Regression with Python (numpy/pandas) [duplicate]

I have the following variables:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
np.random.seed(0)
n = 15
x = np.linspace(0,10,n) + np.random.randn(n)/5
y = np.sin(x)+x/6 + np.random.randn(n)/10
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)
def part1_scatter():
%matplotlib notebook
plt.figure()
plt.scatter(X_train, y_train, label='training data')
plt.scatter(X_test, y_test, label='test data')
plt.legend(loc=4);
And the following question:
Write a function that fits a polynomial LinearRegression model on the training data X_train for degrees 1, 3, 6, and 9. (Use PolynomialFeatures in sklearn.preprocessing to create the polynomial features and then fit a linear regression model) For each model, find 100 predicted values over the interval x = 0 to 10 (e.g. np.linspace(0,10,100)) and store this in a numpy array. The first row of this array should correspond to the output from the model trained on degree 1, the second row degree 3, the third row degree 6, and the fourth row degree 9.
This is my code, but it doesn't work:
# NOTE(review): the leading indentation of this snippet is not visible here.
# The lines through `return dataArray` are presumably meant to form the body of
# answer_one(), with the return placed after the loop — confirm against the
# IndentationError discussed below.
def answer_one():
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
np.random.seed(0)
n = 15
x = np.linspace(0,10,n) + np.random.randn(n)/5
y = np.sin(x)+x/6 + np.random.randn(n)/10
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)
results = []
# 100 evenly spaced x values on [0, 10] at which every model is evaluated
pred_data = np.linspace(0,10,100)
degree = [1,3,6,9]
# training targets reshaped into a single column
y_train1 = y_train.reshape(-1,1)
for i in degree:
# expand the evaluation grid and the training inputs to polynomial features of degree i
poly = PolynomialFeatures(degree=i)
pred_poly1 = poly.fit_transform(pred_data[:,np.newaxis])
X_F1_poly = poly.fit_transform(X_train[:,np.newaxis])
linreg = LinearRegression().fit(X_F1_poly, y_train1)
pred = linreg.predict(pred_poly1)
results.append(pred)
# one row per entry of `degree`, 100 predictions per row
dataArray = np.array(results).reshape(4, 100)
return dataArray
I receive this error:
line 58 for i
in degree: ^ IndentationError: unexpected
indent
Could you tell me where the problem is?
The return statement should be performed after the for is done, so it should be indented under the for, not further in.
At the start of your line
n = 15
You stopped indenting. So that part isn't recognized as part of the function. This can be solved by putting 4 spaces in front of all lines from n = 15 onwards.

How to find the best degree of polynomials?

I'm new to Machine Learning and currently got stuck with this.
First I use linear regression to fit the training set but get very large RMSE. Then I tried using polynomial regression to reduce the bias.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)
poly_reg = LinearRegression()
poly_reg.fit(X_poly, y)
poly_predict = poly_reg.predict(X_poly)
# Score the predictions against the targets y, not against the inputs X
poly_mse = mean_squared_error(y, poly_predict)
poly_rmse = np.sqrt(poly_mse)
poly_rmse
Then I got slightly better result than linear regression, then I continued to set degree = 3/4/5, the result kept getting better. But it might be somewhat overfitting as degree increased.
The best degree of polynomial should be the degree that generates the lowest RMSE in cross validation set. But I don't have any idea how to achieve that. Should I use GridSearchCV? or any other method?
I would much appreciate it if you could help me with this.
You should provide the data for X/Y next time, or something dummy, it'll be faster and provide you with a specific solution. For now I've created a dummy equation of the form y = X**4 + X**3 + X + 1.
There are many ways you can improve on this, but a quick iteration to find the best degree is to simply fit your data on each degree and pick the degree with the best performance (e.g., lowest RMSE).
You can also play with how you decide to hold out your train/test/validation data.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Dummy data following y = X**4 + X**3 + X + 1
X = np.arange(100).reshape(100, 1)
y = X**4 + X**3 + X + 1
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
rmses = []
degrees = np.arange(1, 10)
# Start from +inf so the first candidate always becomes the running best
min_rmse, min_deg = np.inf, 0
for deg in degrees:
    # Train features
    poly_features = PolynomialFeatures(degree=deg, include_bias=False)
    x_poly_train = poly_features.fit_transform(x_train)
    # Linear regression
    poly_reg = LinearRegression()
    poly_reg.fit(x_poly_train, y_train)
    # Compare with test data: reuse the transformer fitted on the training
    # split (transform, not fit_transform) so nothing is re-fitted on test data
    x_poly_test = poly_features.transform(x_test)
    poly_predict = poly_reg.predict(x_poly_test)
    poly_mse = mean_squared_error(y_test, poly_predict)
    poly_rmse = np.sqrt(poly_mse)
    rmses.append(poly_rmse)
    # Cross-validation of degree: keep the degree with the lowest held-out RMSE
    if min_rmse > poly_rmse:
        min_rmse = poly_rmse
        min_deg = deg
# Plot and present results
print('Best degree {} with RMSE {}'.format(min_deg, min_rmse))
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(degrees, rmses)
ax.set_yscale('log')
ax.set_xlabel('Degree')
ax.set_ylabel('RMSE')
This will print:
Best degree 4 with RMSE 1.27689038706e-08
Alternatively, you could also build a new class that carries out Polynomial fitting, and pass that to GridSearchCV with a set of parameters.
In my opinion, the best way to find an optimal curve fitting degree or in general a fitting model is to use the GridSearchCV module from the scikit-learn library.
Here is an example how to use this library:
Firstly let us define a method to sample random data:
def make_data(N, err=1.0, rseed=1):
    """Sample a reproducible 1-D regression problem.

    Args:
        N: number of samples.
        err: scale of the Gaussian noise added to ``y``; no noise when ``err <= 0``.
        rseed: seed for the private ``RandomState``.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(N, 1)`` and ``y`` has shape ``(N,)``
        with ``y = 1 / (X + 0.3)`` plus optional noise.
    """
    rng = np.random.RandomState(rseed)
    X = rng.rand(N, 1) ** 2
    y = 1. / (X.ravel() + 0.3)
    if err > 0:
        y += err * rng.randn(N)
    return X, y
Build a pipeline:
def PolynomialRegression(degree=2, **kwargs):
    """Return a pipeline of ``PolynomialFeatures(degree)`` followed by ``LinearRegression(**kwargs)``."""
    return make_pipeline(PolynomialFeatures(degree), LinearRegression(**kwargs))
Create a data and a vector(X_test) for testing and visualisation purposes:
X, y = make_data(200)
X_test = np.linspace(-0.1, 1.1, 200)[:, None]
Define the GridSearchCV parameters:
# Keys are "<pipeline step name>__<parameter>"; the step names are the
# lower-cased class names of the pipeline steps.
# NOTE(review): `normalize` is no longer accepted by LinearRegression in recent
# scikit-learn releases (removed in 1.2) — confirm the installed version or
# drop that entry.
param_grid = {'polynomialfeatures__degree': np.arange(20),
'linearregression__fit_intercept': [True, False],
'linearregression__normalize': [True, False]}
# 7-fold cross-validated search over every combination in param_grid
grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7)
grid.fit(X, y)
Get the best parameters from our model:
model = grid.best_estimator_
model
Pipeline(memory=None,
steps=[('polynomialfeatures', PolynomialFeatures(degree=4, include_bias=True, interaction_only=False)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])
Fit the model with the X and y data and use the vector to predict the values:
y_test = model.fit(X, y).predict(X_test)
Visualize the result:
plt.scatter(X, y)
plt.plot(X_test.ravel(), y_test, 'r')
The best fit result
The full code snippet:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
def make_data(N, err=1.0, rseed=1):
    """Sample a reproducible 1-D regression problem.

    Args:
        N: number of samples.
        err: scale of the Gaussian noise added to ``y``; no noise when ``err <= 0``.
        rseed: seed for the private ``RandomState``.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(N, 1)`` and ``y`` has shape ``(N,)``
        with ``y = 1 / (X + 0.3)`` plus optional noise.
    """
    rng = np.random.RandomState(rseed)
    X = rng.rand(N, 1) ** 2
    y = 1. / (X.ravel() + 0.3)
    if err > 0:
        y += err * rng.randn(N)
    return X, y
def PolynomialRegression(degree=2, **kwargs):
    """Return a pipeline of ``PolynomialFeatures(degree)`` followed by ``LinearRegression(**kwargs)``."""
    return make_pipeline(PolynomialFeatures(degree), LinearRegression(**kwargs))
X, y = make_data(200)
X_test = np.linspace(-0.1, 1.1, 200)[:, None]
# Keys are "<pipeline step name>__<parameter>"; the step names are the
# lower-cased class names of the pipeline steps.
# NOTE(review): `normalize` is no longer accepted by LinearRegression in recent
# scikit-learn releases (removed in 1.2) — confirm the installed version or
# drop that entry.
param_grid = {'polynomialfeatures__degree': np.arange(20),
'linearregression__fit_intercept': [True, False],
'linearregression__normalize': [True, False]}
# 7-fold cross-validated search over every combination in param_grid
grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7)
grid.fit(X, y)
model = grid.best_estimator_
y_test = model.fit(X, y).predict(X_test)
plt.scatter(X, y)
plt.plot(X_test.ravel(), y_test, 'r')
This is where Bayesian model selection comes in really. This gives you the most likely model given both model complexity and data fit. I'm super tired so the quick answer is to use the BIC (Bayesian information criterion):
k = number of variables in the model
n = number of observations
sse = sum(residuals**2)
BIC = n*ln(sse/n) + k*ln(n)
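Compute the BIC (or AIC, etc.) for each candidate model; the model with the lowest value is preferred, because it balances goodness of fit against the number of parameters.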

how to solve ? x and y must have same first dimension

from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
r = pd.read_csv("vitalsign_test.csv")
clm_list = []
for column in r.columns:
clm_list.append(column)
X = r[clm_list[1:len(clm_list)-1]].values
y = r[clm_list[len(clm_list)-1]].values
X_train, X_test, y_train, y_test = train_test_split (X,y, test_size = 0.3, random_state=4)
k_range = range(1,25)
scores = []
# NOTE(review): indentation is not visible in this listing. According to the
# answer below, the scores.append(...) line sat outside the loop body, so it
# ran only once, after the loop had finished.
for k in k_range:
clf = KNeighborsClassifier(n_neighbors = k)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
scores.append(metrics.accuracy_score(y_test,y_pred))
plt.plot(k_range,scores)
plt.xlabel('value of k for clf')
plt.ylabel('testing accuracy')
The response that I am getting is
ValueError: x and y must have same first dimension
my feature and response shape is:
y.shape
Out[60]: (500,)
X.shape
Out[61]: (500, 6)
It has nothing to do with your X and y; it is about the x and y arguments passed to plt.plot: your scores list has only one element, whereas k_range has 24 (range(1, 25)). The cause is incorrect indentation:
for k in k_range:
    clf = KNeighborsClassifier(n_neighbors = k)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
# Dedented: runs once after the loop, so scores ends up with a single element
scores.append(metrics.accuracy_score(y_test,y_pred))
should be
for k in k_range:
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Inside the loop: one accuracy per k, so len(scores) == len(k_range)
    scores.append(metrics.accuracy_score(y_test, y_pred))

Categories