Doing cross-validation from scratch - Python

I found this function definition on Stack Overflow:
from numpy import array
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def fold_i_of_k(dataset, i, k):
    n = len(dataset)
    return len(dataset[n*(i-1)//k:n*i//k])

# this is my code below
# Constants
FOLD_I = 1
FOLD_K = 10

# Creating 10 folds
counter = 1
s = 0
total_ac = 0
while counter != FOLD_K + 1:
    print("Fold ", counter)
    fold = fold_i_of_k(dataset, counter, 10)

    # the current fold is the test set, everything else is the training set
    d_fold = dataset[s:s + fold]
    d_test = d_fold
    X_test = d_test.iloc[:, 0:11]
    y_test = d_test.iloc[:, 11:12]
    d_train = dataset.drop(dataset.index[s:s + fold])
    X_train = d_train.iloc[:, 0:11]
    y_train = d_train.iloc[:, 11:12]

    X_train = X_train.as_matrix()
    X_train = preprocessing.scale(X_train)
    y_train = y_train.as_matrix()
    X_test = X_test.as_matrix()
    X_test = preprocessing.scale(X_test)
    y_test = y_test.as_matrix()

    # things I tried for reshaping y:
    # l1 = len(y_train)
    # np.reshape(y_train, l1)
    # l = len(y_test)
    # np.reshape(y_test, l)
    # data.reshape((data.shape[0], 1))

    y_train = array(y_train)
    print(y_train.shape)

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    lr_pred = lr.predict(X_test)
    ac = accuracy_score(y_test, lr_pred)
    print(ac)
    # print(classification_report(y_test, lr_pred))

    total_ac = total_ac + ac
    s = s + fold
    counter = counter + 1

total_ac = total_ac / FOLD_K
print("Cross validation accuracy is: ", total_ac)
I am getting the following error:
/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:578:
DataConversionWarning: A column-vector y was passed when a 1d array
was expected. Please change the shape of y to (n_samples, ), for
example using ravel().
y = column_or_1d(y, warn=True)
How can I fix it?

Passing y_train.ravel() to lr.fit() solved the problem.
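For reference, a minimal sketch of the fix, assuming the same scikit-learn setup as above: ravel() flattens the (n_samples, 1) column vector produced by the DataFrame slice into the (n_samples,) shape that fit() expects, which removes the DataConversionWarning.

import numpy as np

y_train = np.array([[0], [1], [1], [0]])  # column vector, shape (4, 1)
print(y_train.shape)                      # (4, 1) -> triggers the warning inside fit()
print(y_train.ravel().shape)              # (4,)   -> the shape sklearn wants
# lr.fit(X_train, y_train.ravel())        # pass a 1-D target and the warning goes away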

Related

What is the function get_error_rate doing?

The code below is an implementation of the AdaBoost algorithm. I am using it to train on my dataset. I can understand most of the code, but what I can't figure out is: what is the function get_error_rate doing here? Is it calculating the entropy or the Gini impurity?
The GitHub link I took the code from is https://github.com/jaimeps/adaboost-implementation
""" HELPER FUNCTION: GET ERROR RATE ========================================="""
def get_error_rate(pred, Y):
return sum(pred != Y) / float(len(Y))
""" HELPER FUNCTION: GENERIC CLASSIFIER ====================================="""
def generic_clf(Y_train, X_train, Y_test, X_test, clf):
clf.fit(X_train,Y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
return get_error_rate(pred_train, Y_train), \
get_error_rate(pred_test, Y_test)
""" ADABOOST IMPLEMENTATION ================================================="""
def adaboost_clf(Y_train, X_train, Y_test, X_test, M, clf):
n_train, n_test = len(X_train), len(X_test)
# Initialize weights
w = np.ones(n_train) / n_train
pred_train, pred_test = [np.zeros(n_train), np.zeros(n_test)]
for i in range(M):
# Fit a classifier with the specific weights
clf.fit(X_train, Y_train, sample_weight = w)
pred_train_i = clf.predict(X_train)
pred_test_i = clf.predict(X_test)
# Indicator function
miss = [int(x) for x in (pred_train_i != Y_train)]
# Equivalent with 1/-1 to update weights
miss2 = [x if x==1 else -1 for x in miss]
# Error
err_m = np.dot(w,miss) / sum(w)
# Alpha
alpha_m = 0.5 * np.log( (1 - err_m) / float(err_m))
# New weights
w = np.multiply(w, np.exp([float(x) * alpha_m for x in miss2]))
# Add to prediction
pred_train = [sum(x) for x in zip(pred_train,
[x * alpha_m for x in pred_train_i])]
pred_test = [sum(x) for x in zip(pred_test,
[x * alpha_m for x in pred_test_i])]
pred_train, pred_test = np.sign(pred_train), np.sign(pred_test)
# Return error rate in train and test set
return get_error_rate(pred_train, Y_train), \
get_error_rate(pred_test, Y_test)
""" MAIN SCRIPT ============================================================="""
if __name__ == '__main__':
# Fit a simple decision tree first
clf_tree = DecisionTreeClassifier(max_depth = 1, random_state = 1)
er_tree = generic_clf(y_train_1, X_train_1, y_test_1, X_test_1, clf_tree)
# Fit Adaboost classifier using a decision tree as base estimator
# Test with different number of iterations
er_train, er_test = [er_tree[0]], [er_tree[1]]
x_range = range(10, 410, 10)
for i in x_range:
er_i = adaboost_clf(y_train_1, X_train_1, y_test_1, X_test_1, i, clf_tree)
er_train.append(er_i[0])
er_test.append(er_i[1])
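For reference, a minimal sketch (a toy example, not part of the repository) of what get_error_rate computes: it is simply the fraction of predictions that disagree with the labels, i.e. the misclassification rate, rather than entropy or Gini impurity.

import numpy as np

def get_error_rate(pred, Y):
    return sum(pred != Y) / float(len(Y))

# 2 of the 5 predictions disagree with the true labels
pred = np.array([1, -1, 1,  1, -1])
Y    = np.array([1,  1, 1, -1, -1])
print(get_error_rate(pred, Y))  # 0.4, i.e. 40% of the samples are misclassified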

ValueError: x and y must have same first dimension, but have shapes (32,) and (30,)

I'm working on a stock prediction model, and I'm getting this error although I thought both arrays had the same values!
This is the code where I train the model using linear regression:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2)
clf = LinearRegression()
clf.fit(x_train, y_train)
scorepredict = clf.score(x_test,y_test)
predictedprice = clf.predict(x_prediction)
Here is where I get the error.
dates = pd.date_range(start = "2018-03-28", end = "2018-04-28")
plt.plot(dates, predictedprice, color="y")
df["Adj. Close"].plot(color = "g")
plt.xlim(xmin=datetime.date(2017,1,1))
And here is where I set up the x and y arrays.
predictiondays = 30
df["Prediction"] = df[["Adj. Close"]].shift(-predictiondays)
x = np.array(df.drop(["Prediction"],1))
x = preprocessing.scale(x)
x_prediction = x[-predictiondays:]
x = x[:-predictiondays]
y = np.array(df["Prediction"])
y = y[:-predictiondays]
EDIT:
Now clf.fit gives this error:
ValueError: Expected 2D array, got 1D array instead:
array=[-1.23874374 1.49125839 0.80930081 ... 0.32190385 1.01987874
1.06322504].
I didn't touch anything.
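For context, a minimal sketch of why the first error appears, assuming the plot is meant to draw predictedprice against dates: pd.date_range("2018-03-28", "2018-04-28") spans 32 calendar days, while predictedprice holds predictiondays = 30 values, so the two arrays passed to plt.plot have different lengths. Generating the date axis with periods=predictiondays keeps them aligned.

import pandas as pd

predictiondays = 30
# one date per predicted value, instead of a fixed 32-day calendar span
dates = pd.date_range(start="2018-03-28", periods=predictiondays)
print(len(dates))  # 30, matching len(predictedprice)
# plt.plot(dates, predictedprice, color="y")  # shapes now agree

The second error ("Expected 2D array, got 1D array instead") usually means a single-feature array ended up one-dimensional; if that is the case here, reshaping it with x.reshape(-1, 1) before fitting is the standard fix.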

ROC curve - IndexError: too many indices for array

For a classification problem, when I pass NumPy arrays containing the test labels and test probabilities to roc_curve, it throws the following error.
dataset = read_csv('C:/.../dataset/KDDREAL.csv')
dataset = dataset.values
X = dataset[:, :-1]
Y = dataset[:, -1]

encoder = LabelEncoder().fit(Y)
encoded_Y = encoder.transform(Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, encoded_Y, test_size=0.3, random_state=0)

model_svm = svm.SVC()
model_svm.fit(X_train, Y_train)
results_svm = model_svm.predict(X_test)

fpr2 = dict()
tpr2 = dict()
roc_auc2 = dict()
for i in range(num_classes):
    fpr2[i], tpr2[i], _ = roc_curve(Y_test, results_svm[:, i])
    roc_auc2[i] = auc(fpr2[i], tpr2[i])

# Compute micro-average ROC curve and ROC area
fpr2, tpr2, _ = roc_curve(y_test.ravel(), PGD20_X_test_trm.ravel())
roc_auc2 = auc(fpr2, tpr2)
fpr2[i], tpr2[i], _ = roc_curve(Y_test , results_svm[:, i])
IndexError: too many indices for array
For the line of code given below:
for i in range(num_classes):
    fpr2[i], tpr2[i], _ = roc_curve(Y_test, results_svm[:, i])
try replacing it with:
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(Y_test[:, i], results_svm[:, i])
If you are copying this line, please check the indentation.
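As a side note, a minimal sketch of one way to build the inputs for per-class ROC curves, assuming a multi-class problem (the variable names come from the question; the rest is illustrative): SVC.predict returns a 1-D array of class labels, so indexing it with [:, i] raises the IndexError. The curves need a 2-D score matrix (from decision_function or predict_proba) together with a binarized label matrix.

from sklearn import svm
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import label_binarize

# Binarize the encoded labels: shape (n_samples, n_classes)
classes = sorted(set(Y_train))
Y_test_bin = label_binarize(Y_test, classes=classes)

# One-vs-rest decision scores: shape (n_samples, n_classes)
model_svm = svm.SVC(decision_function_shape='ovr')
model_svm.fit(X_train, Y_train)
scores = model_svm.decision_function(X_test)

fpr2, tpr2, roc_auc2 = dict(), dict(), dict()
for i in range(Y_test_bin.shape[1]):
    fpr2[i], tpr2[i], _ = roc_curve(Y_test_bin[:, i], scores[:, i])
    roc_auc2[i] = auc(fpr2[i], tpr2[i])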

CatBoost border

I can't start training with CatBoost because of an error about the border.
X = pandas.read_csv("../input/x_y_test/X.csv")
X_test = pandas.read_csv("../input/x_y_test/X_test.csv")
y = pandas.read_csv("../input/y-data/y.csv")
X = X.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
y = y.reset_index(drop = True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = .3, random_state = 1337)
X_train = X_train.reset_index(drop = True)
X_val = X_val.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
y_val = y_val.reset_index(drop = True)
model_cb = CatBoostClassifier(eval_metric = "Accuracy", n_estimators = 1200, random_seed = 70)
model_cb.fit(X_train, y_train, eval_set = (X_val, y_val), use_best_model = True)
So I got:
CatboostError: catboost/libs/metrics/metric.cpp:3929: All train targets are greater than border 0.5
Data:
https://drive.google.com/drive/folders/1m7bNIs0mZQQkAsvkETB3n6j62p9QJX39?usp=sharing
Your main error is that you're feeding y_train to your algo as:
       id  skilled
0  138177        0
1   36214        0
2  103206        1
3   22699        1
4   96145        1
I believe what you really intended was just y_train.skilled
Run the reassignment below before fitting and you're good to go:
y_train = y_train.skilled  # just the skilled column is enough
y_val = y_val.skilled      # just the skilled column is enough
model_cb = CatBoostClassifier(eval_metric = "Accuracy", n_estimators = 1200, random_seed = 70)
model_cb.fit(X_train, y_train, eval_set = (X_val, y_val), use_best_model = True)
On a side note, do you really believe id in X_train possesses any predictive power? Why not drop it from the features as well?
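A minimal sketch of that suggestion, assuming X_train and X_val carry the id column mentioned above (column names taken from the y_train printout):

from catboost import CatBoostClassifier

# Keep only the target column and drop the id feature before fitting
y_train = y_train["skilled"]
y_val = y_val["skilled"]
X_train = X_train.drop(columns=["id"])
X_val = X_val.drop(columns=["id"])

model_cb = CatBoostClassifier(eval_metric="Accuracy", n_estimators=1200, random_seed=70)
model_cb.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)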

KNN Regression results in zero MSE on training set (sklearn)

I'm using sklearn and trying to evaluate a KNN regression model with the code below:
def cross_validate(X, y, n_neighbors, test_size=0.20):
    training_mses = []
    test_mses = []
    n = X.shape[0]
    test_n = int(np.round(test_size * n, 0))
    indices = np.arange(n)
    random.shuffle(indices)
    test_indices = indices[0:test_n]
    training_indices = indices[test_n:]

    X_test, y_test = X[test_indices], y[test_indices]
    X_train, y_train = X[training_indices], y[training_indices]

    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights="distance",
                                        algorithm='brute')
    model = knn.fit(X_train, y_train)
    y_hat = model.predict(X_train)
    training_mse = mse(y_train - y_hat)

    model2 = knn.fit(X_test, y_test)
    y_hat = model2.predict(X_test)
    test_mse = mse(y_test - y_hat)

    return training_mse, test_mse
I did something similar with linear regression. The difference is that when I run it with KNN regression, training_mse and test_mse are both 0. If I evaluate the test data with the model fitted on the training set, I do get a non-zero MSE, but I just don't believe that the fitted values for the training and test sets are exactly equal to the observed values. What am I doing wrong? The function I was trying to emulate is below and gives non-zero MSE values:
def cross_validate(formula, data, test_size=0.20):
    training_mses = []
    test_mses = []
    n = data.shape[0]
    test_n = int(np.round(test_size * n, 0))
    indices = deepcopy(data.index).values
    random.shuffle(indices)
    test_indices = indices[0:test_n]
    training_indices = indices[test_n:]

    test_set = data.ix[test_indices]
    training_set = data.ix[training_indices]

    y, X = patsy.dmatrices(formula, training_set, return_type="matrix")
    model = linear.LinearRegression(fit_intercept=False).fit(X, y)
    y_hat = model.predict(X)
    training_mse = mse(y - y_hat)

    y, X = patsy.dmatrices(formula, test_set, return_type="matrix")
    y_hat = model.predict(X)
    test_mse = mse(y - y_hat)

    return training_mse, test_mse
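For context, a minimal sketch of why both numbers come out as zero, assuming an mse helper that averages the squared residuals: model2 = knn.fit(X_test, y_test) refits the same estimator on the test set, so both predict calls run on exactly the data they were fitted on, and with weights="distance" each training point is its own nearest neighbour at distance zero, so the prediction reproduces the observed value exactly. Evaluating held-out data with the train-fitted model gives a non-zero MSE, as the last two lines below show.

import numpy as np
from sklearn import neighbors

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
y = rng.normal(size=50)

knn = neighbors.KNeighborsRegressor(n_neighbors=5, weights="distance", algorithm="brute")
knn.fit(X, y)
print(np.mean((y - knn.predict(X)) ** 2))          # 0.0: the zero-distance neighbour dominates

X_new = rng.normal(size=(20, 3))                   # held-out data the model has not seen
y_new = rng.normal(size=20)
print(np.mean((y_new - knn.predict(X_new)) ** 2))  # non-zero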
