import pandas
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn import metrics

# Load the whole data set; read_csv opens and closes the file itself, so no
# explicit file handle is needed.
aotiz = pandas.read_csv('aotiz.csv')

# Rows labelled 16..7000 are held out for evaluation. `.loc` slices by label
# and includes both end points.
test = aotiz.loc[16:7000]
# Rows labelled 7000..7006 are used to fit the model.
# NOTE(review): label 7000 falls in both slices, so one evaluation row is also
# a training row, and the training slice is far smaller than the test slice --
# confirm this split is intended.
train = aotiz.loc[7000:7006]

# NOTE(review): `distance_columns` is not defined in this snippet; presumably a
# list of feature column names defined elsewhere.
x_columns = distance_columns
y_column = ["PM2.5"]

knn = KNeighborsRegressor(n_neighbors=6)
# Fit the model on the training data.
knn.fit(train[x_columns], train[y_column])
# Make point predictions on the test set using the fit model.
predictions = knn.predict(test[x_columns])
actual = test[y_column]
# Mean squared error between observed and predicted PM2.5, as a single float.
mse = mean_squared_error(actual, predictions)
print(mse)
I'm trying to find out how to get the accuracy of this model with scikit-learn. So far I have only been able to compute the mean squared error. How do I compare the 'actual' and 'predictions' sets so that I can express the error as a percentage of the 'actual' values?
Related
I have a regression task and I am predicting with linear regression and random forest models. I need some hints or a code example showing how to ensemble them (simple averaging is already done). Here are my model implementations in Python:
np.random.seed(42)
# Boolean row mask: True marks a training row, False a test row.
# NOTE(review): the mask is sized from `happiness2` but applied to
# `happiness22` below -- confirm both frames have the same number of rows.
mask = np.random.rand(happiness2.shape[0]) <= 0.7
print('Train set shape {0}, test set shape {1}'.format(happiness2[mask].shape, happiness2[~mask].shape))

# Columns removed from the feature matrix (identifier plus all score columns).
non_feature_cols = ['Country', 'Happiness_Score_2017',
                    'Happiness_Score_2018', 'Happiness_Score_2019']
target_col = 'Happiness_Score_2019'
rmse_template = 'RMSE = {0:.04f}'

# Build the train/test matrices once; missing feature values become 0.
fit_rows = happiness22[mask]
eval_rows = happiness22[~mask]
X_fit = fit_rows.drop(non_feature_cols, axis=1).fillna(0)
y_fit = fit_rows[target_col]
X_eval = eval_rows.drop(non_feature_cols, axis=1).fillna(0)
y_eval = eval_rows[target_col]

# Model 1: ordinary linear regression.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_fit, y_fit)
pred = lr.predict(X_eval)
print(rmse_template.format(np.sqrt(np.mean((pred - y_eval)**2))))

# Model 2: random forest with 100 trees.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_fit, y_fit)
pred3 = rf.predict(X_eval)
print(rmse_template.format(np.sqrt(np.mean((pred3 - y_eval)**2))))

# Ensemble: unweighted mean of the two prediction vectors.
avepred = (pred + pred3) / 2
print(rmse_template.format(np.sqrt(np.mean((avepred - y_eval)**2))))
First, you can evaluate each model (linear regression and random forest) on a validation set and get out the error (MSE for instance).
Then, weight each model according to this error and use this weight later when predicting.
You can also use the COBRA ensemble method (developed by Guedj et al.):
https://modal.lille.inria.fr/pycobra/
I'm trying to compare the RMSE I get from performing multiple linear regression on the full data set with the RMSE from 10-fold cross-validation, using the KFold class in scikit-learn. I found some code that I tried to adapt, but I can't get it to work (and I suspect it never worked in the first place).
TIA for any help!
Here's my linear regression function
def standRegres(xArr, yArr):
    """Solve least squares through the normal equations.

    Computes ``ws = (X^T X)^-1 X^T y`` with NumPy matrix arithmetic.

    Args:
        xArr: 2-D array-like of predictors, one row per observation.
        yArr: 1-D array-like of targets; it is converted to a row matrix and
            transposed into a column.

    Returns:
        The weights as a column ``numpy.matrix``, or ``None`` (after printing
        a message) when the determinant of ``X^T X`` is exactly zero.
    """
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    xTx = xMat.T * xMat
    # A zero determinant means X^T X has no inverse.
    if np.linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return None
    ws = xTx.I * (xMat.T * yMat)
    return ws
## I run it on my matrix ("comm_df") and my dependent var (comm_target)
## Calculate RMSE (omitted some code)
# Square root of the mean squared difference between fitted and observed values.
initial_regress_RMSE = np.sqrt(np.mean((yHat_array - comm_target_array)**2))
## Now trying to get RMSE after training model through 10-fold cross validation
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
# Splitter configured for 10 folds.
kf = KFold(n_splits=10)
# Running total of the RMSE computed on each pass of the loop.
xval_err = 0
# NOTE(review): this iterates the KFold object itself rather than the result of
# kf.split(...), and the loop variables train/test are never used below.
for train, test in kf:
# NOTE(review): `linreg` is not defined in this snippet -- presumably a
# LinearRegression instance; confirm.
# Each pass fits and predicts on the complete comm_df / comm_target.
linreg.fit(comm_df,comm_target)
p = linreg.predict(comm_df)
e = p-comm_target
xval_err += np.sqrt(np.dot(e,e)/len(comm_df))
# Average of the accumulated RMSE over the 10 folds.
rmse_10cv = xval_err/10
I get an error saying that the KFold object is not iterable.
There are several things you need to correct in this code.
You cannot iterate over kf. You can only iterate over kf.split(comm_df)
You need to somehow use the train test split that KFold provides. You are not using them in your code! The goal of the KFold is to fit your regression on the train observations, and to evaluate the regression (ie compute the RMSE in your case) on the test observations.
With this in mind, here is how I would correct your code (it is assumed here that your data is in numpy arrays, but you can easily switch to pandas)
kf = KFold(n_splits=10)
xval_err = 0
# kf.split yields one (train indices, test indices) pair per fold.
for train, test in kf.split(comm_df):
    # Fit on the training rows of this fold only.
    linreg.fit(comm_df[train], comm_target[train])
    # Evaluate on the held-out rows of the same fold.
    p = linreg.predict(comm_df[test])
    e = p - comm_target[test]
    xval_err += np.sqrt(np.dot(e, e) / len(comm_target[test]))
# Average the per-fold RMSE over however many folds the splitter produced.
rmse_10cv = xval_err / kf.get_n_splits()
So the code you provided still threw an error. I abandoned what I had above in favor of the following, which works:
## KFold cross-validation
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
## Set up the splitter, the accumulator and plain-array copies of the data
kf = KFold(n_splits=10)
RMSE_sum = 0
RMSE_length = 10
X = np.array(comm_df)
y = np.array(comm_target)
for train, test in kf.split(X):
    ## Fit on this fold's training rows; the target is passed as a column
    fold_model = LinearRegression().fit(X[train], y[train].reshape(-1, 1))
    ## Predict the held-out rows of the fold
    crime_probabilites = np.array(fold_model.predict(X[test]))
    ## Score the fold
    ## NOTE(review): the model was fitted on a column-shaped target while
    ## y[test] keeps its original shape -- confirm RMSEcalc handles both.
    RMSE_cross_fold = RMSEcalc(crime_probabilites, y[test])
    ## Accumulate the fold score
    RMSE_sum = RMSE_cross_fold + RMSE_sum
## Average over the folds and report
RMSE_cross_fold_avg = RMSE_sum / RMSE_length
print('The Mean RMSE across all folds is', RMSE_cross_fold_avg)
I am in the process of building a regression model that will eventually be used by other users. This model serves to predict flower temperature by using multiple atmospheric variables such as air temperature, humidity, solar radiation, wind, etc.
After much experimenting, I've noticed that a 2nd-degree polynomial regression with scikit-learn gives a good RMSE for both my training and testing data. However, since there are over 36 coefficients, collinearity occurs, and according to a comment on this post: https://stats.stackexchange.com/questions/29781/when-conducting-multiple-regression-when-should-you-center-your-predictor-varia, collinearity would disturb the beta estimates, so the RMSE I am getting would be unreliable.
I've heard that perhaps I should standardize in order to remove collinearity, or use an orthogonal decomposition, but I don't know which would be better. In any case, I've tried standardizing my x variables, and when I compute the RMSE for my training and testing data, I get the same RMSE for the training data but a different RMSE for the testing data.
Here is the code:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics
def OpenFile(ThePath):
    """Load a comma-separated UTF-8 file and discard its ``Unnamed`` columns.

    Args:
        ThePath: file name appended directly to the module-level ``Location``
            string (no separator is inserted).

    Returns:
        A DataFrame holding every column whose name does not start with
        ``Unnamed``.
    """
    frame = pd.read_csv(Location + ThePath, sep=',', encoding='utf-8')
    # Flag columns whose name matches the '^Unnamed' pattern.
    unnamed = frame.columns.str.contains('^Unnamed')
    return frame.loc[:, ~unnamed]
def EvaluateRegression(Test_data, Predict_data):
    """Print and return MAE, MSE and RMSE, each rounded to 3 decimals.

    Args:
        Test_data: observed target values.
        Predict_data: predicted target values.

    Returns:
        Tuple ``(MAE, MSE, RMSE)``. RMSE is the square root of the unrounded
        mean squared error, rounded afterwards.
    """
    raw_mae = metrics.mean_absolute_error(Test_data, Predict_data)
    raw_mse = metrics.mean_squared_error(Test_data, Predict_data)
    MAE = np.round(raw_mae, 3)
    MSE = np.round(raw_mse, 3)
    RMSE = np.round(np.sqrt(raw_mse), 3)
    report = (
        ('Mean absolute error :', MAE),
        ('Mean square error :', MSE),
        ('RMSE :', RMSE),
    )
    for label, value in report:
        print(label, value)
    return MAE, MSE, RMSE
#Read files ------------------------------------------------------------------------------------------------------------
# Base folder that OpenFile prepends to each file name. Both backslashes are
# escaped explicitly; the resulting string is the same as before.
Location = 'C:\\Users\\...'
#Training data
File_Station_day = 'Flower_Station_data_day.csv' #X training data
File_TD = 'Flower_Y_data_day.csv' #Y training data
Chosen_Air = OpenFile(File_Station_day)
Day_TC = OpenFile(File_TD)
#Testing data
File_Fluke_Station= 'Fluke_Station_data.csv' #X testing data
File_Fluke = 'Flower_Fluke_data.csv' #Y testing data
# The test predictors come from the X file and the test targets from the Y file.
Chosen_Air_Fluke = OpenFile(File_Fluke_Station)
Fluke_data = OpenFile(File_Fluke)
# Kept so existing references to the old name still resolve.
Fluke_Station = Chosen_Air_Fluke
#Prepare data --------------------------------------------------------------------------------------------------------
y_train = Day_TC
y_test = Fluke_data
#Get the desired atmospheric variables
Air_cols = ['MAXTemp_data', 'MINTemp_data', 'Humidity', 'Precipitation', 'Pression', 'Arti_InSW', 'sin_time'] #Specify the desired atmospheric variables
X_train = Chosen_Air[Air_cols]
X_test = Chosen_Air_Fluke[Air_cols]
#If not standardizing
poly = PolynomialFeatures(degree=2)
linear_poly = LinearRegression()
# Fit the degree-2 expansion on the training predictors and expand them.
X_train_rdy = poly.fit_transform(X_train)
linear_poly.fit(X_train_rdy,y_train)
# Only transform the test predictors: the expansion is already fitted.
X_test_rdy = poly.transform(X_test)
Input_model= linear_poly
print('Regression: For train')
MAE, MSE, RMSE = EvaluateRegression(y_train, Input_model.predict(X_train_rdy))
#For testing data
print('Regression: For test')
MAE, MSE, RMSE = EvaluateRegression(y_test, Input_model.predict(X_test_rdy))
#Output:
Regression: For train
Mean absolute error : 0.391
Mean square error : 0.256
RMSE : 0.506
Regression: For test
Mean absolute error : 0.652
Mean square error : 0.569
RMSE : 0.754
#If standardizing
# A single scaler instance is used for both data sets below.
std = StandardScaler()
# Fit the scaler on the training predictors and scale them.
X_train_std = pd.DataFrame(std.fit_transform(X_train),columns = Air_cols)
# NOTE(review): fit_transform re-fits the scaler on X_test, so the test
# predictors are scaled with the test set's own statistics instead of the
# training set's, while the model below was fitted on training-scaled data.
X_test_std = pd.DataFrame(std.fit_transform(X_test),columns = Air_cols)
poly = PolynomialFeatures(degree=2)
linear_poly_std = LinearRegression()
# Degree-2 expansion of the scaled training predictors, then the linear fit.
X_train_std_rdy = poly.fit_transform(X_train_std)
linear_poly_std.fit(X_train_std_rdy,y_train)
# NOTE(review): fit_transform is called again here for the test predictors.
X_test_std_rdy = poly.fit_transform(X_test_std)
Input_model= linear_poly_std
print('Regression: For train')
MAE, MSE, RMSE = EvaluateRegression(y_train, Input_model.predict(X_train_std_rdy))
#For testing data
print('Regression: For test')
MAE, MSE, RMSE = EvaluateRegression(y_test, Input_model.predict(X_test_std_rdy))
#Output:
Regression: For train
Mean absolute error : 0.391
Mean square error : 0.256
RMSE : 0.506
Regression: For test
Mean absolute error : 10.901
Mean square error : 304.53
RMSE : 17.451
Why is the RMSE I am getting for the standardized testing data so different from the non-standardized one? Perhaps the way I'm doing this is simply wrong? Please let me know if I should attach the files to the post.
Thank you for your time!
IIRC, at the very least you should not call fit_transform twice – treat the scaler and the polynomial transformer the same way as the regression model: fit once on the training data, then only transform the test data. Right now you are re-fitting the scaler on the test set (which probably gives you a different mean/std) but applying the same regression model.
Side note: your code is rather hard to read and debug, which easily leads to simple typos and mistakes. I suggest wrapping the training logic in a single function, and optionally using sklearn pipelines. That way, testing with or without the scaler is literally a matter of commenting or uncommenting a single line.
import pandas as pd
import numpy
# NOTE(review): the sklearn.cross_validation module is not available in
# current scikit-learn releases (train_test_split lives in
# sklearn.model_selection) -- confirm the targeted version.
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
fi = "df.csv"
# Open the file for reading and read in data
file_handler = open(fi, "r")
data = pd.read_csv(file_handler, sep=",")
file_handler.close()
# split the data into training and test data
# test_size=0.6 puts 60% of the rows in the test split.
train, test = cross_validation.train_test_split(data,test_size=0.6, random_state=0)
# initialise Gaussian Naive Bayes
naive_b = GaussianNB()
# NOTE(review): DataFrame.ix is not available in current pandas releases --
# confirm the targeted version. The label is read from column position 127.
train_features = train.ix[:,0:127]
train_label = train.iloc[:,127]
test_features = test.ix[:,0:127]
test_label = test.iloc[:,127]
naive_b.fit(train_features, train_label)
# Re-assemble test features and label side by side.
test_data = pd.concat([test_features, test_label], axis=1)
# NOTE(review): predict_proba returns one column per class, but the result is
# assigned to a single DataFrame column here.
test_data["p_malw"] = naive_b.predict_proba(test_features)
# NOTE(review): the print statements below use Python 2 syntax.
print "test_data\n",test_data["p_malw"]
print "Accuracy:", naive_b.score(test_features,test_label)
I have written this code to accept input from a csv file with 128 columns where 127 columns are features and the 128th column is the class label.
I want to predict the probability that each sample belongs to each class (there are 5 classes, 1-5), print the result in the form of a matrix, and determine the class of each sample from the prediction. predict_proba() is not giving the desired output. Please suggest the required changes.
GaussianNB.predict_proba returns the probabilities of the samples for each class in the model. In your case, it should return a result with five columns with the same number of rows as in your test data. You can verify which column corresponds to which class using naive_b.classes_ . So, it is not clear why you are saying that this is not the desired output. Perhaps, your problem comes from the fact that you are assigning the output of predict proba to a data frame column. Try:
# Keep the whole probability matrix: one row per test sample and one column
# per entry of naive_b.classes_.
pred_prob = naive_b.predict_proba(test_features)
instead of
test_data["p_malw"] = naive_b.predict_proba(test_features)
and verify its shape using pred_prob.shape. The second dimension should be 5.
If you want the predicted label for each sample you can use the predict method, followed by confusion matrix to see how many labels have been predicted correctly.
from sklearn.metrics import confusion_matrix
# Fit on the training split, then predict one label per test sample.
naive_b.fit(train_features, train_label)
pred_label = naive_b.predict(test_features)
# Cross-tabulate the true labels (first argument) against the predicted ones.
confusion_m = confusion_matrix(test_label, pred_label)
print(confusion_m)
Here is some useful reading.
sklearn GaussianNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB.predict_proba
sklearn confusion_matrix - http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
I currently run a random forest model using the following code. I set a random_state equal to 100.
# train_test_split is imported from sklearn.model_selection; the older
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows; random_state fixes which rows land in each split.
X_train_RIA_INST_PWM, X_test_RIA_INST_PWM, y_train_RIA_INST_PWM, y_test_RIA_INST_PWM = train_test_split(
    X_RIA_INST_PWM, Y_RIA_INST_PWM, test_size=0.3, random_state=100)
# Random Forest Regressor for RIA_INST_PWM accounts
import numpy as np
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): no random_state is passed to the forest, so the fitted model
# (and every score derived from it) can differ between runs.
regressor_RIA_INST_PWM = RandomForestRegressor(n_estimators=100, min_samples_split=10)
# Fit on the training split only, so the held-out rows stay unseen and the
# test-set R^2 below is a genuine out-of-sample score.
regressor_RIA_INST_PWM.fit(X_train_RIA_INST_PWM, y_train_RIA_INST_PWM)
# Each label and its value are printed on one line under Python 2 and 3 alike.
print("R^2 for training set: {0}".format(
    regressor_RIA_INST_PWM.score(X_train_RIA_INST_PWM, y_train_RIA_INST_PWM)))
print('-' * 50)
print("R^2 for test set: {0}".format(
    regressor_RIA_INST_PWM.score(X_test_RIA_INST_PWM, y_test_RIA_INST_PWM)))
And then I use the following code to calculate the prediction values.
def predict_AUM(df, features, regressor):
    """Add a ``PREDICTED_SPAN`` column holding the regressor's predictions.

    Args:
        df: DataFrame containing at least the columns named in ``features``.
            It is modified in place; its index is left untouched.
        features: column labels passed to the regressor as predictors.
        regressor: fitted object exposing ``predict(X)``.

    Returns:
        The same ``df`` object, now carrying the ``PREDICTED_SPAN`` column.
    """
    # Predictor matrix with +/-inf and missing values replaced by 0.
    X_Predict = df[features].replace([np.inf, -np.inf], np.nan).fillna(0)
    # Store the predictions on the frame that was passed in.
    df['PREDICTED_SPAN'] = regressor.predict(X_Predict)
    return df
# Score df_EVENT5_19 with the fitted forest; predict_AUM returns the frame
# with a PREDICTED_SPAN column added.
df_EVENT5_20 = predict_AUM(df_EVENT5_19, dfzip_features_AUM_RIA_INST_PWM, regressor_RIA_INST_PWM)
Finally, I calculate the RMSE of my results:
from math import sqrt
from sklearn.metrics import mean_squared_error
# Observed and predicted spans taken from the scored frame.
observed_span = df_EVENT5_20['SPAN_DAYS']
predicted_span = df_EVENT5_20['PREDICTED_SPAN']
# RMSE is the square root of the mean squared error.
rmse = sqrt(mean_squared_error(observed_span, predicted_span))
rmse
Each time I run my code ... my RMSE changes. It has varied from 7.75 to 16.4 Why is this happening? And how can I have the same RMSE each time I run the code? Additionally, how do I optimize my model for RMSE?
You only seeded the train_test_split which makes sure that the random allocation of the data to train and test set is reproducible.
As the name suggests, RandomForestRegressor also contains parts of the algorithm that rely on random numbers (e.g., selecting different subsets of the data or different features for training the individual decision trees). If you want reproducible results, you need to seed it as well. To do that, initialize it with random_state like this:
# random_state seeds the forest's own random number generation.
regressor_RIA_INST_PWM = RandomForestRegressor(n_estimators=100, min_samples_split=10, random_state=100)