assign looped regression rsquared to object - python

I have a function with a regression loop built in. I want to assign the rsquareds from each iteration to an object that I can print out later.
here's part of the function (including the regression) for brevity:
cuts = [stats, stats_po, stats_ic, stats_id, stats_h, stats_a, stats_bos, stats_bkn, stats_nyk, stats_phi, stats_tor, stats_chi, stats_cle, stats_det, stats_ind, stats_mil, stats_den, stats_min, stats_okc, stats_por, stats_uta, stats_gsw, stats_lac, stats_lal, stats_phx, stats_sac, stats_atl, stats_cha, stats_mia, stats_orl, stats_was, stats_dal, stats_hou, stats_mem, stats_nop, stats_sas, stats_o1, stats_o2, stats_d1, stats_d2, stats_l25]
def process_cuts(c):
c = c.dropna(axis=0,how='all')
n = c.team.str.rsplit(" ",n=1, expand=True)
c['city'] = n[0]
c['team_name']=n[1]
c['team_name']=c['team_name'].str.replace('Trailblazers','Blazers')
c['team_name']=c['team_name'].str.replace('Bobcats','Hornets')
for z in ['Points','Steals','Blocks','Assists','OReb','DefReb','Turnovers','FieldGoals','ThreeShots','FTP', 'Fouls','FTMiss','FGMiss','FreeThrows']:
y = mergered[z]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])
print(coeff_df)
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
from sklearn import metrics
rsquared = 'Rsquared:' + ' ' + z, metrics.r2_score(y_test,y_pred)
cuts_diffs = list(map(process_cuts, cuts))
I want to store the rsquareds for each y and print them out for each data cut.
appreciate your help

Related

how to change my code to use k fold cross validation with k = 5

I want to change my code so that instead of this part:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100, test_size=0.2)
train_data = X_train.copy()
train_data.loc[:, 'target'] = y_train
test_data = X_test.copy()
test_data.loc[:, 'target'] = y_test
data_config = DataConfig(
target=['target'], #target should always be a list. Multi-targets are only supported for
regression. Multi-Task Classification is not implemented
continuous_cols=train_data.columns.tolist(),
categorical_cols=[],
normalize_continuous_features=True
)
trainer_config = TrainerConfig(
auto_lr_find=True,
batch_size=64,
max_epochs=10,
)
optimizer_config = {'optimizer':'Adam', 'optimizer_params':{'weight_decay': 0, 'amsgrad':
False}, 'lr_scheduler':None, 'lr_scheduler_params':{},
'lr_scheduler_monitor_metric':'valid_loss'}
model_config = NodeConfig(
task="classification",
num_layers=2,
num_trees=512,
learning_rate=1,
embed_categorical=True,
)
tabular_model = TabularModel(
data_config=data_config,
model_config=model_config,
optimizer_config=optimizer_config,
trainer_config=trainer_config,
)
tabular_model.fit(train=train_data, test=test_data)
pred = tabular_model.predict(test_data)
pred['prediction'] = pred['prediction'].astype(int)
pred.loc[(pred['prediction'] >= 1 )] = 1
print_metrics(test_data['target'], pred["prediction"].astype('int'), tag="Holdout")
I want to Use the K fold method with k = 5 or 10.
Thank you for your advice.
The complete code example that I have used method train_test_split is above.
Here is an example of the k-fold method:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
X, y = datasets.load_iris(return_X_y=True)
X.shape, y.shape
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.4, random_state=0)
X_train.shape, y_train.shape
X_test.shape, y_test.shape
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)
result (in this example):
0.9666666666666667
The example is from here: https://scikit-learn.org/stable/modules/cross_validation.html

How can I get the final tree model?

Given this model:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
import graphviz
X, y = make_classification(n_samples=1000, n_features=10,n_informative=3, n_redundant=5, random_state=42)
df = pd.DataFrame(data=X)
df.columns = 'X' + (df.columns+1).astype(str)
df[df.columns[-3:]] = df[df.columns[-3:]].astype(int)
df['Y'] = y
X_train, X_test, y_train, y_test = train_test_split(df.drop('Y', axis=1), df['Y'], test_size=0.3, random_state=42)
n_negative_class = y_train.value_counts().sort_index()[0]
n_positive_class = y_train.value_counts().sort_index()[1]
xgb = XGBClassifier(random_state = 42, n_estimators=50,
scale_pos_weight = n_negative_class/n_positive_class,
use_label_encoder=False)
xgb.fit(X_train, y_train, eval_metric="auc")
y_train_scores = xgb.predict_proba(X_test)[:,1]
xgboost.to_graphviz(xgb, num_trees=49)
How can I plot the final tree used in xgb.predict_proba(X_test)[:,1]? Is necesarily the last one (as XGBoost trees learn from the last tree)? Or XGBoost chooses some tree among those 50 estimators given the loss or eval_metric given?

ElasticNetCV in Python vs cvglmnet in R

Has anybody tried to rich same results by implementing ElasticNetCV in Python and cvglmnet in R?
I have found out how to make it on ElasticNet in Python and glmnet in R but cannot reproduce it with cross validation methods...
Steps to reproduce in Python:
Preprocessing:
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
data = make_regression(
n_samples=100000,
random_state=0
)
X, y = data[0], data[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)
pd.DataFrame(X_train).to_csv('X_train.csv', index=None)
pd.DataFrame(X_test).to_csv('X_test.csv', index=None)
pd.DataFrame(y_train).to_csv('y_train.csv', index=None)
pd.DataFrame(y_test).to_csv('y_test.csv', index=None)
Models:
model = ElasticNet(
alpha=1.0,
l1_ratio=0.5,
fit_intercept=True,
normalize=True,
precompute=False,
max_iter=100000,
copy_X=True,
tol=0.0000001,
warm_start=False,
positive=False,
random_state=0,
selection='cyclic'
)
model.fit(
X=X_train,
y=y_train
)
y_pred = model.predict(
X=X_test
)
print(
mean_squared_error(
y_true=y_test,
y_pred=y_pred
)
)
output: 42399.49815189786
model = ElasticNetCV(
l1_ratio=0.5,
eps=0.001,
n_alphas=100,
alphas=None,
fit_intercept=True,
normalize=True,
precompute=False,
max_iter=100000,
tol=0.0000001,
cv=10,
copy_X=True,
verbose=0,
n_jobs=-1,
positive=False,
random_state=0,
selection='cyclic'
)
model.fit(
X=X_train,
y=y_train
)
y_pred = model.predict(
X=X_test
)
print(
mean_squared_error(
y_true=y_test,
y_pred=y_pred
)
)
output: 39354.729173913176
Steps to reproduce in R:
Preprocssing:
library(glmnet)
X_train <- read.csv(path)
X_test <- read.csv(path)
y_train <- read.csv(path)
y_test <- read.csv(path)
fit <- glmnet(x=as.matrix(X_train), y=as.matrix(y_train))
y_pred <- predict(fit, newx = as.matrix(X_test))
y_error = y_test - y_pred
mean(as.matrix(y_error)^2)
output: 42399.5
fit <- cv.glmnet(x=as.matrix(X_train), y=as.matrix(y_train))
y_pred <- predict(fit, newx = as.matrix(X_test))
y_error <- y_test - y_pred
mean(as.matrix(y_error)^2)
output: 37.00207
Thanks so much for providing the example, I am on a laptop so I had to reduce the number of samples to 100:
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
data = make_regression(
n_samples=100,
random_state=0
)
X, y = data[0], data[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)
When you do predict in with glmnet, you need to specify lambda, otherwise it returns the predictions for all lambdas, so in R:
fit <- glmnet(x=as.matrix(X_train), y=as.matrix(y_train))
y_pred <- predict(fit, newx = as.matrix(X_test))
dim(y_pred)
[1] 25 89
When you run cv.glmnet, it selects the best lambda from cv, the lambda.1se, so it gives you only 1 set, which is the rmse you wanted:
fit <- cv.glmnet(x=as.matrix(X_train), y=as.matrix(y_train))
y_pred <- predict(fit, newx = as.matrix(X_test))
y_error <- y_test - y_pred
mean(as.matrix(y_error)^2)
[1] 22.03504
dim(y_error)
[1] 25 1
fit$lambda.1se
[1] 1.278699
If we select the lambda closest to that chosen by cv.glmnet in glmnet, you get back something in the correct range:
fit <- glmnet(x=as.matrix(X_train), y=as.matrix(y_train))
sel = which.min(fit$lambda-1.278699)
y_pred <- predict(fit, newx = as.matrix(X_test))[,sel]
mean((y_test - y_pred)^2)
dim(y_error)
mean(as.matrix((y_test - y_pred)^2))
[1] 20.0775
Before we compare with sklearn, we need to make sure we are testing over the same range of lambdas.
L = c(0.01,0.05,0.1,0.2,0.5,1,2)
fit <- cv.glmnet(x=as.matrix(X_train), y=as.matrix(y_train),lambda=L)
y_pred <- predict(fit, newx = as.matrix(X_test))
y_error <- y_test - y_pred
mean(as.matrix(y_error)^2)
[1] 0.003065869
So we expect something in the range of 0.003065869. We run it with the same lambda, lambda is termed as alpha in ElasticNet. The alpha in glmnet is in fact your l1_ratio, see vignette. And the normalize option should be set to False, because:
If True, the regressors X will be normalized before regression by
subtracting the mean and dividing by the l2-norm. If you wish to
standardize, please use sklearn.preprocessing.StandardScaler before
calling fit on an estimator with normalize=False.
So we just run it using CV:
model = ElasticNetCV(l1_ratio=1,fit_intercept=True,alphas=[0.01,0.05,0.1,0.2,0.5,1,2])
model.fit(X=X_train,y=y_train)
y_pred = model.predict(X=X_test)
mean_squared_error(y_true=y_test,y_pred=y_pred)
0.0018007824874741929
It's around the same ball park as the R result.
And if you do it for ElasticNet, you will get the same result, if you specify alpha.

python the evaluation index valus are different largely between cross validate and train_test_split cases

write a program, use support vectore Regression-SVR to predict, firstly, split the dataset to train dataset and test dataset, the ratio of test dataset is 20%(case 1); secondly, use cross validate, split the dataset to 5 groups to predict(case 2),however, Using the same evaluation index(R2,MAE,MSE) to evaluate the two methods, the results are quite different
the program is as follows:
dataset = pd.read_csv('Dataset/allGlassStraightThroughTube.csv')
tube_par = dataset.iloc[:, 3:8].values
tube_eff = dataset.iloc[:, -1:].values
# # form train dataset , test dataset
tube_par_X_train, tube_par_X_test, tube_eff_Y_train, tube_eff_Y_test = train_test_split(tube_par, tube_eff, random_state=33, test_size=0.2)
# normalize the data
sc_X = StandardScaler()
sc_Y = StandardScaler()
sc_tube_par_X_train = sc_X.fit_transform(tube_par_X_train)
sc_tube_par_X_test = sc_X.transform(tube_par_X_test)
sc_tube_eff_Y_train = sc_Y.fit_transform(tube_eff_Y_train)
sc_tube_eff_Y_test = sc_Y.transform(tube_eff_Y_test)
# fit rbf SVR to the sc_tube_par_X dataset
support_vector_regressor = SVR(kernel='rbf')
support_vector_regressor.fit(sc_tube_par_X_train, sc_tube_eff_Y_train)
#
# # predict new result according to the sc_tube_par_X Dataset
pre_sc_tube_eff_Y_test = support_vector_regressor.predict(sc_tube_par_X_test)
pre_tube_eff_Y_test = sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)
# calculate the predict quality
print('R2-score value rbf SVR')
print(r2_score(sc_Y.inverse_transform(sc_tube_eff_Y_test), sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)))
print('The mean squared error of rbf SVR is')
print(mean_squared_error(sc_Y.inverse_transform(sc_tube_eff_Y_test), sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)))
print('The mean absolute error of rbf SVR is')
print(mean_absolute_error(sc_Y.inverse_transform(sc_tube_eff_Y_test), sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)))
# normalize
sc_tube_par_X = sc_X.fit_transform(tube_par)
sc_tube_eff_Y = sc_Y.fit_transform(tube_eff)
scoring = ['r2','neg_mean_squared_error', 'neg_mean_absolute_error']
rbf_svr_regressor = SVR(kernel='rbf')
scores = cross_validate(rbf_svr_regressor, sc_tube_par_X, sc_tube_eff_Y, cv=5, scoring=scoring, return_train_score=False)
in case 1, the evaluation index output is:
R2-score value rbf SVR
0.6486074476528559
The mean squared error of rbf SVR is
0.00013501023459497165
The mean absolute error of rbf SVR is
0.007196636233830076
in case 2, the evalution index output is:
R2-score
0.2621779727614816
test_neg_mean_squared_error
-0.6497292887710239
test_neg_mean_absolute_error
-0.5629408849740231
the difference between case 1 and case 2 is big, could you please me the reason and how to correct it
Bin.
I have prepared a little example to see how the results change using cross-validation. I recomend you to try to split the data without seed and see how the results change.
You will see that cross validation results are almost a constant independently of the data split.
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_validate
#from sklearn.cross_validation import train_test_split
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import matplotlib.pyplot as plt
def print_metrics(real_y,predicted_y):
# calculate the predict quality
print('R2-score value {:>8.4f}'.format(r2_score(real_y, predicted_y)))
print('Mean squared error is {:>8.4f}'.format(mean_squared_error(real_y, predicted_y)))
print('Mean absolute error is {:>8.4f}\n\n'.format(mean_absolute_error(real_y, predicted_y)))
def show_plot(real_y,predicted_y):
fig,ax = plt.subplots()
ax.scatter(real_y,predicted_y,edgecolors=(0,0,0))
ax.plot([real_y.min(),real_y.max()],[real_y.min(),real_y.max()],"k--",lw=4)
ax.set_xlabel("Measured")
ax.set_ylabel("Predicted")
plt.show()
# dataset load
boston = datasets.load_boston()
#dataset info
# print(boston.keys())
# print(boston.DESCR)
# print(boston.data.shape)
# print(boston.feature_names)
# numpy_arrays
X = boston.data
Y = boston.target
# # form train dataset , test dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=33, test_size=0.2)
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=5, test_size=0.2)
# fit scalers
sc_X = StandardScaler().fit(X_train)
# standarizes X (train and test)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
######################################################################
############################### SVR ##################################
######################################################################
support_vector_regressor = SVR(kernel='rbf')
support_vector_regressor.fit(X_train, Y_train)
predicted_Y = support_vector_regressor.predict(X_test)
print_metrics(predicted_Y,Y_test)
show_plot(predicted_Y,Y_test)
######################################################################
########################### LINEAR REGRESSOR #########################
######################################################################
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)
predicted_Y = lin_model.predict(X_test)
print_metrics(predicted_Y,Y_test)
show_plot(predicted_Y,Y_test)
######################################################################
######################### SVR + CROSS VALIDATION #####################
######################################################################
sc = StandardScaler().fit(X)
standarized_X = sc.transform(X)
scoring = ['r2','neg_mean_squared_error', 'neg_mean_absolute_error']
rbf_svr_regressor = SVR(kernel='rbf')
scores = cross_validate(rbf_svr_regressor, standarized_X, Y, cv=10, scoring=scoring, return_train_score=False)
print(scores["test_r2"].mean())
print(-1*(scores["test_neg_mean_squared_error"].mean()))
print(-1*(scores["test_neg_mean_absolute_error"].mean()))

operand could not be broadcast together with shapes <56962,2> .. error

I try logistic regression classification using "k-fold cross validation" in python.
my code:
`import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix,roc_auc_score
data = pd.read_csv('xxx.csv')
X = data[["a","b","c",...]]
y = data["Class"]
def get_predictions(clf, X_train, y_train, X_test):
clf = clf
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)
train_pred = clf.predict(X_train)
print('train-set confusion matrix:\n', confusion_matrix(y_train,train_pred))
return y_pred, y_pred_prob
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)
pred_test_full=0
cv_score=[]
i=1
for train_index, test_index in skf.split(X, y):
X_train, y_train = X.loc[train_index], y.loc[train_index]
X_test, y_test = X.loc[test_index], y.loc[test_index]
log_cfl = LogisticRegression(C=2);
log_cfl.fit(X_train, y_train)
y_pred, y_pred_prob = get_predictions(LogisticRegression(C=2), X_train, y_train, X_test)
score=roc_auc_score(y_test,log_cfl.predict(X_test))
print('ROC AUC score: ',score)
cv_score.append(score)
pred_test_full = pred_test_full + y_pred_prob
i+=1`
I get error at this line of code:
`pred_test_full = pred_test_full + y_pred_prob`
For loop runs 2 times. Then in third, I get the error.
'operands could not be broadcast together with shapes <56962,2> <5696..' error.
I couldn't understand what is wrong, could you help to figure out?

Categories