ValueError("x and y must be the same size")

ValueError("x and y must be the same size") - python

import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn import metrics, model_selection
from xgboost.sklearn import XGBClassifier
import warnings
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import ensemble
warnings.filterwarnings('ignore')
train = pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/train.csv (6)/train.csv')
test = pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/test.csv (2)/test.csv')
test_labels=pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/test_labels.csv/test_labels.csv')
print("\nTrain Data")
print("==========\n",train)
print("\nTest Data")
print("==========\n",test)
print("\nTest_labels Data")
print("================\n",test_labels)
sns.barplot(x='toxic', y='identity_hate', data=train);
plt.show()
print("\n\nTrain data shape:",train.shape)
print("\nTest data shape:",test.shape)
print("\nTestLabels data shape:",test_labels.shape)
print("\nCorrelation matrix")
print("==================")
plt.title('Correlation Matrix')
sns.heatmap(train.corr())
plt.show()
print("\n Data Descriptive")
print("================\n",train.describe())
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
max_depth = 5, alpha = 10, n_estimators = 10)
print("\nRegressor")
print("===========\n",xg_reg)
X = test_labels.iloc[:,1:6].values
Y = test_labels.iloc[:,6].values
#print("X value\n",X,"\n\nY value \n",Y)
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = XGBClassifier()
model.fit(X_train, y_train)
print("\n Classifier")
print("============\n",model)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
params = {
'n_estimators': 1,
'max_depth': 1,
'learning_rate': 1,
'criterion': 'mse'
}
gradient_boosting_regressor = ensemble.GradientBoostingRegressor(**params)
gradient_boosting_regressor.fit(X, Y)
plt.figure(figsize=(10, 5))
plt.title('Gradient Boosting model (1 estimators, Single tree split)')
plt.scatter(X, Y)
plt.plot(X, gradient_boosting_regressor.predict(X), color='r')
plt.show()
While executing the above code this error occurs.
"raise ValueError("x and y must be the same size")"
I have .csv file with 1398 rows and 2 columns. I have taken 40% as a y_test set, as it is visible in the above code.

Related

f_importances function in sklearn

I found this question here which seems to address my problem(Determining the most contributing features for SVM classifier in sklearn).
However as my understanding of Python language is limited I need some help.
I have a dependent variable which is 'Group' that has two levels 'Group1' and 'Group2'.
This is the code I found, adapted to my data:
import pandas as pd
df = pd.read_csv('C:/Users/myPC/OneDrive/Desktop/analysis/dataframe6.csv')
X = df.drop('Group', axis=1)
y = df['Group']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
from matplotlib import pyplot as plt
from sklearn import svm
def f_importances(coef, names):
imp = coef
imp,names = zip(*sorted(zip(imp,names)))
plt.barh(range(len(names)), imp, align='center')
plt.yticks(range(len(names)), names)
plt.show()
features_names = ['input1', 'input2']
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
f_importances(svclassifier.coef_, features_names)
It produces just a blank plot.
I think there is something I should change in features_names = ['input1', 'input2'] but I am not sure what.

The code you used to plot expects a one-dimensional array. The attribute coef_, according to the documentation will be:
coef_ ndarray of shape (n_classes * (n_classes - 1) / 2, n_features)
Weights assigned to the features when kernel="linear".
Using an example :
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
np.random.seed(123)
df = pd.DataFrame(np.random.uniform(0,1,(400,3)),columns=['input1','input2','input3'])
df['Group'] = np.random.choice(['Group1','Group2'],400)
X = df.drop('Group', axis=1)
y = df['Group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
We check the shape of the array:
print(svclassifier.coef_.shape)
(1, 3)
Because you have only 2 class, there's only 1 row. We can do:
from matplotlib import pyplot as plt
from sklearn import svm
def f_importances(coef, names):
imp = coef
imp,names = zip(*sorted(zip(imp,names)))
plt.barh(range(len(names)), imp, align='center')
plt.yticks(range(len(names)), names)
plt.show()
features_names = ['input1', 'input2','input3']
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
f_importances(svclassifier.coef_[0], features_names)
This is the plot I got :

How to visualize cluster boundaries

I generated several datasets, and using classifiers, I predicted the distribution of clusters. I need to draw boundaries between clusters on the chart. In the form of lines or in the form of filled areas - it does not matter. Please let me know if there is any way to do this.
My code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_moons, make_circles
from sklearn.model_selection import train_test_split
n_sample = 2000
def make_square(n_sample):
data=np.array([0,[]])
data[0] = np.random.sample((n_sample,2))
for i in range(n_sample):
if data[0][i][0] > 0.5 and data[0][i][1] > 0.5 or data[0][i][0] < 0.5 and data[0][i][1] < 0.5:
data[1].append(1)
else:
data[1].append(0)
return data
datasets = [
make_circles(n_samples=n_sample, noise=0.09, factor=0.5),
make_square(n_sample),
make_moons(n_samples=n_sample, noise=0.12),
]
ks=[]
for data in datasets:
X,y = data[0],data[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
acc = classifier.score(X_test, y_test)
accs = []
for i in range(1, 8):
knn = KNeighborsClassifier(n_neighbors=i)
knn.fit(X_train, y_train)
pred_i = knn.predict(X_test)
acc0 = knn.score(X_test, y_test)
accs.append(acc0)
plt.figure(figsize=(12, 6))
plt.plot(range(1, 8), accs, color='red', linestyle='dashed', marker='o',
markerfacecolor='blue', markersize=10)
plt.title('accs Score K Value')
plt.xlabel('K Value')
plt.ylabel('accs Score')
print("Max Score:", max(accs), "k=",accs.index(max(accs))+1)
ks.append(accs.index(max(accs))+1)
for i in range(3):
data = datasets[i]
k = ks[i]
X,y = data[0],data[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
classifier = KNeighborsClassifier(n_neighbors=k)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
plt.figure(figsize=(9,9))
plt.title("Test")
plt.scatter(X_test[:,0], X_test[:,1], c=y_test)
plt.figure(figsize=(9,9))
plt.title("Predict")
plt.scatter(X_test[:,0], X_test[:,1], c=y_pred)
Example output:
enter image description here
enter image description here

scikit-learn 1.1 introduced the DecisionBoundaryDisplay to assist with this sort of task.
Following the use of make_moons and the KNeighborsClassifier in the question, we can fit the classifier on the dataset, invoke the DecisionBoundaryDisplay.from_estimator() method, then scatter the X data on the returned axis:
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import DecisionBoundaryDisplay
X, y = make_moons(noise=0.2)
clf = KNeighborsClassifier().fit(X, y)
disp = DecisionBoundaryDisplay.from_estimator(clf, X, response_method="predict", alpha=0.3)
disp.ax_.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
Resulting in something like this:

K Fold Cross validation

I want to implement the cross validation in Random Forest Regressor in my data set. I want to know if my code is correct or not? Is this the way to cross validate?
Here is my sample data:
Wavelength Phase_velocity Shear_wave_velocity
1.50 202.69 240.73
1.68 192.72 240.73
1.79 205.54 240.73
........
Here is my code:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import KFold,train_test_split,cross_val_score
df = pd.read_csv("5.5-6.csv")
df.head()
X = df[['wavelength', 'phase velocity']]
y = df['shear wave velocity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print (len(X_train),len(X_test),len(y_train),len(y_test))
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True)
rf = RandomForestRegressor(n_estimators=30000)
rf.fit(X_train, y_train)
results = cross_val_score(rf, X_train, y_train, cv=kfold) #Cross validation on training set
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print (rf.predict(X_test)) #array_output
print (y_test)
print (rf.score(X_test, y_test))
y_pred = rf.predict(X_test)
from sklearn.metrics import mean_absolute_error
print (mean_absolute_error(y_test,y_pred))
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse = sqrt(mean_squared_error(y_test,y_pred))
print(rmse)

Adaboosting Ensemble learning

I am implemeting Adaboost algorithm with Decision tree from sklearn library and once i predict the results i got an error any explanation and thank you :
The error is :
AdaBoostClassifier with algorithm='SAMME.R' requires that the weak learner supports the calculation of class probabilities with a predict_proba method.
Please change the base estimator or set algorithm='SAMME' instead.
The dataset is from UCI repository you can access it from this link :
https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/
The code is as follow:
from sklearn.model_selection import *
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
np.random.seed(1)
selected_features=['sex', 'cp','fbs', 'exang']
X=datascaled[selected_features]
Y=datascaled['num']
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.3, random_state=0)
param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
"base_estimator__splitter" : ["best", "random"],
"n_estimators": [1, 2]
}
DTC = DecisionTreeClassifier(random_state = 11, max_features = "auto", class_weight = "balanced",max_depth = None)
ABC = AdaBoostClassifier(base_estimator = DTC)
# run grid search
model1=GridSearchCV(ABC, param_grid=param_grid, scoring = 'accuracy')
model1.fit(X_train,y_train)
#The best hyper parameters set
print("Best Hyper Parameters:\n",model1.best_params_)
prediction=model1.predict(X_test)
#importing the metrics module
from sklearn import metrics
#evaluation(Accuracy)
print("Accuracy:",metrics.accuracy_score(prediction,y_test))
#evaluation(Confusion Metrix)
print("Confusion Metrix:\n",metrics.confusion_matrix(prediction,y_test))
#after tuning
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from time import *
from sklearn import metrics
n_folds=10
model=AdaBoostClassifier(random_state = 11,base_estimator= 'entropy', n_estimators= 2)
cv = 10
t0 = time()
y_pred = cross_val_predict(model, X=X, y=Y, n_jobs=-1, cv=cv)
t = time() - t0
print("=" * 52)
print("time cost: {}".format(t))
print()
print("confusion matrix\n", metrics.confusion_matrix(y, y_pred))
print()
print("\t\taccuracy: {}".format(metrics.accuracy_score(y, y_pred)))
print("\t\troc_auc_score: {}".format(metrics.roc_auc_score(y, y_pred)))
print("\t\tcohen_kappa_score: {}".format(metrics.cohen_kappa_score(y, y_pred)))
print()
print("\t\tclassification report")
print("-" * 52)
print(metrics.classification_report(y, y_pred))

roc_auc in VotingClassifier, RandomForestClassifier in scikit-learn (sklearn)

I am trying to calculate roc_auc for hard votingclassifier that i build . i present the code with reprodcible example. now i want to calculate the roc_auc score and plot ROC curver but unfortunately i got the following error predict_proba is not available when voting='hard'
# Voting Ensemble for Classification
import pandas
from sklearn import datasets
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer,confusion_matrix, f1_score, precision_score, recall_score, cohen_kappa_score,accuracy_score,roc_curve
import numpy as np
np.random.seed(42)
iris = datasets.load_iris()
X = iris.data[:, :4] # we only take the first two features.
Y = iris.target
print(Y)
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
estimators.append(('RandomForest', model2))
model3 = MultinomialNB()
estimators.append(('NaiveBayes', model3))
model4=SVC(probability=True)
estimators.append(('svm', model4))
model5=DecisionTreeClassifier()
estimators.append(('Cart', model5))
# create the ensemble model
print('Majority Class Labels (Majority/Hard Voting)')
ensemble = VotingClassifier(estimators,voting='hard')
#accuracy
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold,scoring='accuracy')
y_pred = cross_val_predict(ensemble, X ,Y, cv=10)
print("Accuracy ensemble model : %0.2f (+/- %0.2f) " % (results.mean(), results.std() ))
print(results.mean())
#recall
recall_scorer = make_scorer(recall_score, pos_label=1)
recall = cross_val_score(ensemble, X, Y, cv=kfold, scoring=recall_scorer)
print('Recall', np.mean(recall), recall)
# Precision
precision_scorer = make_scorer(precision_score, pos_label=1)
precision = cross_val_score(ensemble, X, Y, cv=kfold, scoring=precision_scorer)
print('Precision', np.mean(precision), precision)
#f1_score
f1_scorer = make_scorer(f1_score, pos_label=1)
f1_score = cross_val_score(ensemble, X, Y, cv=kfold, scoring=f1_scorer)
print('f1_score ', np.mean(f1_score ),f1_score )
#roc_auc_score
roc_auc_score = cross_val_score(ensemble, X, Y, cv=kfold, scoring='roc_auc')
print('roc_auc_score ', np.mean(roc_auc_score ),roc_auc_score )

To calculate the roc_aucmetric you first need to
Replace: ensemble = VotingClassifier(estimators,voting='hard')
with: ensemble = VotingClassifier(estimators,voting='soft').
Next, the last 2 lines of code will throw an error:
roc_auc_score = cross_val_score(ensemble, X, Y, cv=3, scoring='roc_auc')
print('roc_auc_score ', np.mean(roc_auc_score ),roc_auc_score )
ValueError: multiclass format is not supported
This is normal since in Y you have 3 classes (np.unique(Y) == array([0, 1, 2])).
You can't use roc_auc as a single summary metric for multiclass models. If you want, you could calculate **per-class roc_auc.**
How to solve this:
1) Use only two classes to calculate the roc_auc_score
2) use label binarization in advance vefore calling roc_auc_score

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

ValueError("x and y must be the same size") - python

Related

f_importances function in sklearn

How to visualize cluster boundaries

K Fold Cross validation

Adaboosting Ensemble learning

roc_auc in VotingClassifier, RandomForestClassifier in scikit-learn (sklearn)

Categories

Resources