Translate cross_validation algorithm to model_selection - python

In 2016, I ran a lasso regression model using the code below:
#Import required packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pylab as plt
import matplotlib.pyplot as plp
import seaborn as sns
import statsmodels.formula.api as smf
from scipy import stats
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LassoLarsCV
# split data into train and test sets
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, target, test_size=.4, random_state=123)
#%
# specify the lasso regression model
model=LassoLarsCV(cv=10, precompute=False).fit(pred_train,tar_train)
#%
# print variable names and regression coefficients
dict(zip(predictors.columns, model.coef_))
#regcoef.to_csv('variable+regresscoef.csv')
#%%
# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')
#%
# plot mean square error for each fold
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
#%
# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)
#%
# R-square from training and test data
rsquared_train=model.score(pred_train,tar_train)
rsquared_test=model.score(pred_test,tar_test)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
Now I want to run it again, but I get the following warning:
DeprecationWarning: This module was deprecated in version 0.18 in
favor of the model_selection module into which all the refactored
classes and functions are moved.
How can I rewrite this code using model_selection ?

The only thing I can see here that used the cross_validation module is train_test_split.
So just change your import from:
from sklearn.cross_validation import train_test_split
to:
from sklearn.model_selection import train_test_split
and you are good to go.

Related

ConfusionMatrixDisplay (Scikit-Learn) plot labels out of range

The following code plots a confusion matrix:
from sklearn.metrics import ConfusionMatrixDisplay
confusion_matrix = confusion_matrix(y_true, y_pred)
target_names = ["aaaaa", "bbbbbb", "ccccccc", "dddddddd", "eeeeeeeeee", "ffffffff", "ggggggggg"]
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix, display_labels=target_names)
disp.plot(cmap=plt.cm.Blues, xticks_rotation=45)
plt.savefig("conf.png")
There are two problems with this plot.
The y-axis label (True label) is cut off, and the x-axis label is cut off too.
The names are too long for the x-axis.
To solve the first problem I tried to use poof(bbox_inches='tight'), which is unfortunately not available in sklearn.
For the second problem I tried the following solution, which led to a completely distorted plot.
All in all, I'm struggling with both problems.
I think the easiest way would be to switch to tight_layout and add pad_inches=<some value>.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from numpy.random import default_rng

# Random 7-class labels so the example is self-contained and reproducible
# in shape (the values themselves differ on every run).
rand = default_rng()
y_true = rand.integers(low=0, high=7, size=500)
y_pred = rand.integers(low=0, high=7, size=500)

# Keep the matrix under its own name: rebinding `confusion_matrix` would
# shadow the imported function and break any later call to it.
cm = confusion_matrix(y_true, y_pred)
target_names = ["aaaaa", "bbbbbb", "ccccccc", "dddddddd", "eeeeeeeeee", "ffffffff", "ggggggggg"]
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot(cmap=plt.cm.Blues, xticks_rotation=45)

# tight_layout() makes room for the rotated tick labels inside the figure.
plt.tight_layout()
# pad_inches is only honoured together with bbox_inches="tight"; the tight
# bounding box also guarantees that the axis labels are not clipped.
plt.savefig("conf.png", bbox_inches="tight", pad_inches=0.1)
Result:

Polynomial Regression plot not showing correctly

I ran this code for polynomial regression using sklearn, but my plot is not what I was expecting. As you can see here, I'm not getting a smooth line; it jumps from one point to another. From my understanding I have to sort X, but when I do that, all I get is an empty plot with a straight line.
import operator
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.formula.api as smf
df = pd.read_csv('D:\Mall_Customers.csv', usecols = ['Age', 'Annual Income (k$)','Spending Score (1-100)'])
x = StandardScaler().fit_transform(df)
kmeans = KMeans(n_clusters=3, max_iter=100)
y_kmeans= kmeans.fit_predict(x)
mydict = {i: np.where(kmeans.labels_ == i)[0] for i in range(kmeans.n_clusters)}
dictlist = []
for key, value in mydict.items():
temp = [key,value]
dictlist.append(temp)
df0 = df[df.index.isin(mydict[0].tolist())]
X = df0[['Age', 'Annual Income (k$)']]
Y = df0['Spending Score (1-100)']
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X)
model = LinearRegression()
model.fit(X_poly, Y)
y_poly_pred = model.predict(X_poly)
r2 = r2_score(Y,y_poly_pred)
print(r2)
model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression(fit_intercept = False))
model.fit(X,Y)
plt.scatter(X.iloc[:, 1], Y, color='red')
plt.plot(X, Y, color='blue')
plt.xlabel('Age. Annual income')
plt.ylabel('Spending Score')
plt.show()
TL;DR: the data is not linearly dependent.
The graph is so messy because you plotted X (the training data) against Y (the actual target data, not the model's predictions), and because:
the data itself is messy and not really linearly dependent,
which together produce this messy graph.
I suggest that you:
split the data into train and test sets, then, after you train the model, check the error on the test set, and maybe create two plots: one with the model's predictions for the test data and one with the actual values for the test data;
and change the plot code to this:
.
plt.scatter(X, Y)
plt.plot(X, Y_pred, color='red')
plt.show()

My Python linear regression has incomplete line

I am trying to plot a linear regression, but the line is incomplete. Below is my Python code:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
features = [[2],[4],[8],[5]]
labels = [320, 610, 1190, 726]
plt.scatter(features,labels,color="black")
plt.xlabel("Number of room")
plt.ylabel('price')
clf = linear_model.LinearRegression()
clf=clf.fit(features,labels)
result = clf.predict([[11],[9]])
plt.plot([[11],[9]], result, color='blue', linewidth=3)
print(result)
plt.show()
And the picture of the plot is given below:
From the picture, you can see that the line is incomplete. Please help me to solve this problem. Make the line complete with all the other values.

AttributeError: 'OLSResults' object has no attribute 'norm_resid'

When I run this I have the following error :
AttributeError: 'OLSResults' object has no attribute 'norm_resid'
I have the latest version of OLS, so the attribute norm_resid should be there.
Any ideas ?
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn import datasets, linear_model
from statsmodels.formula.api import ols
"""
Data Management
"""
data = pd.read_csv("TestExer1-sales-round1.csv")
X_train = data["Advertising"]
Y_train = data["Sales"]
# use of linregregress
model = ols("Y_train ~ X_train", data).fit()
print(model.summary())
plt.plot(X_train,Y_train , 'ro')
plt.plot(X_train, model.fittedvalues, 'b')
plt.legend(['Sales', 'Advertising'])
plt.ylim(0, 70)
plt.xlim(5, 18)
plt.hist(model.norm_resid())
plt.ylabel('Count')
plt.xlabel('Normalized residuals')
plt.xlabel('Temperature')
plt.ylabel('Gas')
plt.title('Before Insulation')
I had the same issue, but the following worked:
plt.hist(model.resid_pearson)
Thus your solution should look like:
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn import datasets, linear_model
from statsmodels.formula.api import ols
"""
Data Management
"""
data = pd.read_csv("TestExer1-sales-round1.csv")
X_train = data["Advertising"]
Y_train = data["Sales"]
# use of linregregress
model = ols("Y_train ~ X_train", data).fit()
print(model.summary())
plt.plot(X_train,Y_train , 'ro')
plt.plot(X_train, model.fittedvalues, 'b')
plt.legend(['Sales', 'Advertising'])
plt.ylim(0, 70)
plt.xlim(5, 18)
plt.hist(model.resid_pearson)
plt.ylabel('Count')
plt.xlabel('Normalized residuals')
plt.xlabel('Temperature')
plt.ylabel('Gas')
plt.title('Before Insulation')
when using statsmodels version 0.8.0 or greater.
Note: the Pearson residuals only divide each residual by the standard error of the residuals, while normalisation also divides each residual by the sum of all residuals. For more, see here
From the docs.

messy scatter plot regression line: Python

In Python 2.7.6 with matplotlib and scikit-learn 0.17.0, when I draw polynomial regression lines on a scatter plot, the polynomial curves are really messy, like this:
The script is like this: it reads two columns of floating-point data and makes a scatter plot with regression lines.
import pandas as pd
import scipy.stats as stats
import pylab
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pylab as pl
import sklearn
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
df=pd.read_csv("boston_real_estate_market_clean.csv")
LSTAT = df['LSTAT'].as_matrix()
LSTAT=LSTAT.reshape(LSTAT.shape[0], 1)
MEDV=df['MEDV'].as_matrix()
MEDV=MEDV.reshape(MEDV.shape[0], 1)
# Train test set split
X_train1, X_test1, y_train1, y_test1 = train_test_split(LSTAT,MEDV,test_size=0.3,random_state=1)
# Ploynomial Regression-nst order
plt.scatter(X_test1, y_test1, s=10, alpha=0.3)
for degree in [1,2,3,4,5]:
model = make_pipeline(PolynomialFeatures(degree), Ridge())
model.fit(X_train1,y_train1)
y_plot = model.predict(X_test1)
plt.plot(X_test1, y_plot, label="degree %d" % degree
+'; $q^2$: %.2f' % model.score(X_train1, y_train1)
+'; $R^2$: %.2f' % model.score(X_test1, y_test1))
plt.legend(loc='upper right')
plt.show()
I guess the reason is because the "X_test1, y_plot" are not sorted properly?
X_test1 is a numpy array like this:
[[ 5.49]
[ 16.65]
[ 17.09]
....
[ 25.68]
[ 24.39]]
yplot is a numpy array like this:
[[ 29.78517812]
[ 17.16759833]
[ 16.86462359]
[ 23.18680265]
...[ 37.7631725 ]]
I try to sort with this:
[X_test1, y_plot] = zip(*sorted(zip(X_test1, y_plot), key=lambda y_plot: y_plot[0]))
plt.plot(X_test1, y_plot, label="degree %d" % degree
+'; $q^2$: %.2f' % model.score(X_train1, y_train1)
+'; $R^2$: %.2f' % model.score(X_test1, y_test1))
The curve looks normal now, but the result is weird, with a negative R^2.
Could anyone show me what the real issue is, or how to sort properly here? Thank you!
While the plot is now correct, you messed up the pairing of X_test1 to y_test1 while sorting because you forgot to also sort y_test1 in the same way.
The best solution is to sort right after the split. Then y_plot, which is computed later, will be automatically correct: (Here untested example using numpy as np)
X_train1, X_test1, y_train1, y_test1 = train_test_split(LSTAT,MEDV,test_size=0.3,random_state=1)
# X_test1 has shape (n, 1). np.argsort on the 2-D array sorts along the
# length-1 last axis and returns all zeros, so sort by the single feature
# column instead to get a 1-D permutation of the rows.
sorted_index = np.argsort(X_test1[:, 0])
# Apply the same permutation to both arrays so each x stays paired with its y.
X_test1 = X_test1[sorted_index]
y_test1 = y_test1[sorted_index]

Categories