Complement Naive Bayes and weighted class in sklearn - python

I'm trying to implement a complement naive bayes classifier using sklearn. My data have very imbalanced classes (30k samples of class 0 and 6k samples of the 1 class) and I'm trying to compensate this using weighted class.
Here is the shape of my dataset:
enter image description here
I tried to use the compute compute_class_weight function to calcute the weights and then pass it to the fit function when training my model:
import numpy as np
import seaborn as sn
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.naive_bayes import ComplementNB
#Import the csv data
data = pd.read_csv('output_pt900.csv')
#Create the header of the csv file
header = []
for x in range(0,2500):
header.append('pixel' + str(x))
header.append('status')
#Add the header to the csv data
data.columns = header
#Replace the b's and the f's in the status column by 0 and 1
data['status'] = data['status'].replace('b',0)
data['status'] = data['status'].replace('f',1)
print(data)
#Drop the NaN values
data = data.dropna()
#Separate the features variables and the status
y = data['status']
x = data.drop('status',axis=1)
#Split the original dataset into two other: train and test
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)
all_together = y_train.to_numpy()
unique_classes = np.unique(all_together)
c_w = class_weight.compute_class_weight('balanced', unique_classes, all_together)
clf = ComplementNB()
clf.fit(x_train,y_train, c_w)
y_predict = clf.predict(x_test)
cm = confusion_matrix(y_test, y_predict)
svm = sn.heatmap(cm, cmap='Blues', annot=True, fmt='g')
figure=svm.get_figure()
figure.savefig('confusion_matrix_cnb.png', dpi=400)
plt.show()
but I got thesse error:
ValueError: sample_weight.shape == (2,), expected (29752,)!
Anyone knows how to use weighted class in sklearn models?

compute_class_weight returns an array of length equal to the number of unique classes with the weight to assign to instances of each class (link). So if there are 2 unique classes, c_w has length 2, containing the weight that should be assigned to samples with label 0 and 1.
When calling fit for your model, the weight for each sample is expected by the sample_weight argument. This should explain the error you received. To solve this issue, you need to use c_w returned by compute_class_weight to create an array of individual sample weights. You could do this with [c_w[i] for i in all_together]. Your fit call would ultimately look something like:
clf.fit(x_train, y_train, sample_weight=[c_w[i] for i in all_together])

Related

how can I solve for predicted probabilities using Knn classifier?

I am using KNN classifier on a data set and am trying to find the predicted probabilities for each outcome of the prediction and am not sure how to go about it. I have not found much on this topic. The code I am using is:
import numpy as np ##Import necassary packages
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import *
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
url2="http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data" #Reading in Data from a freely and easily available source on the internet
Adult = pd.read_csv(url2, header=None, skipinitialspace=True) #Decoding data by removing extra spaces in cplumns with skipinitialspace=True
##Assigning reasonable column names to the dataframe
Adult.columns = ["age","workclass","fnlwgt","education","educationnum","maritalstatus","occupation",
"relationship","race","sex","capitalgain","capitalloss","hoursperweek","nativecountry",
"less50kmoreeq50kn"]
Adult.loc[Adult.loc[:, "race"] == "Amer-Indian-Eskimo", "race"] = "Other" #consolidating catagorical data in the race column
Adult.loc[:,"race"].value_counts().plot(kind='bar') #plotting the consolidated catagorical data in the race column
plt.title('race after consolidation')
plt.show()
Adult.loc[:, "White"] = (Adult.loc[:, "race"] == "White").astype(int) #One hot encoding the catagorical/creating new categorical data in the race column
Adult.loc[:, "Black"] = (Adult.loc[:, "race"] == "Black").astype(int)
Adult.loc[:, "Asian-Pac-Islander"] = (Adult.loc[:, "race"] == "Asian-Pac-Islander").astype(int)
Adult.loc[:, "Other"] = (Adult.loc[:, "race"] == "Other").astype(int)
Adult.loc[:,"Other"] #Verifying One-hot encoding for Other column
Adult = Adult.drop("race", axis=1) #removing the obsolete column "race"
Minage = min(Adult.loc[:,"age"]) #MinMax normilizing the age column
Maxage = max(Adult.loc[:,"age"])
MinMaxage = (Adult.loc[:,"age"] - Minage)/(Maxage - Minage)
df2 = pd.DataFrame() #creating a dataframe to plot the normilized data
df2.loc[:,0] = Adult.loc[:, "White"] #filling the data frame
df2.loc[:,1] = NormZ1
df2.loc[:,1] = MinMaxage #assigning new columns for df2
df2.loc[:,2] = Adult.loc[:,"hoursperweek"]
df2.columns = ["White","MinMaxage","hoursperweek"] #labeling the columns for df2
df2.head() #checkiung new dataframe
X = np.array(df2.drop(["hoursperweek"], 1)) #choosing the expert label to predict and not including the label in the X array
y = np.array(df2["hoursperweek"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) #splittting the data into training and testing data
clf = neighbors.KNeighborsClassifier() #assigning K neighbors classifier
clf.fit(X_train, y_train) #fitting the data for X_train and y_train
accuracy = clf.score(X_test, y_test) #finding the accuracy of the prediction
print("accuracy rate with age MinMax Normilized")
print(accuracy)
print ('predictions for test set with age MinMax Normilized:') #showing results
print(clf.predict(X_test))
print ('actual class values with age MinMax Normilized:')
print(y_test)
I have loaded the actual outcome and predicted outcome into a new data frame and would like to add a 3rd column in the new data frame with the predicted probabilities for each row, but I am not sure how to solve for these in python. Is there a way to solve for the predicted probabilities for each outcome? I am would like to use the predicted probabilities for a confusion matrix and an ROC curve.
from the above code in question, i have removed
df2.loc[:,1] = NormZ1
and re-ran the code, used the syntax
print(clf.predict_proba(X_test))
was able to get probabilities in shape(6513, 93)
you can try clf.predict_boba(X_test) to get the prediction probability. source

ROC curve for multi-class classification without one vs all in python

I have a multi-class classification problem with 9 different classes. I am using the AdaBoostClassifier class from scikit-learn to train my model without using the one vs all technique, as the number of classes is very high and it might be inefficient.
I have tried using the tips from the documentation in scikit learn [1], but there the one vs all technique is used, which is substantially different. In my approach I only get one prediction per event, i.e. if I have n classes, the outcome of the prediction is a single value within the n classes. For the one vs all approach, on the other hand, the outcome of the prediction is an array of size n with a sort of likelihood value per class.
[1]
https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py
The code is:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # Matplotlib plotting library for basic visualisation
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc
from sklearn import preprocessing
# Read data
df = pd.read_pickle('data.pkl')
# Create the dependent variable class
# This will substitute each of the n classes from
# text to number
factor = pd.factorize(df['target_var'])
df.target_var= factor[0]
definitions = factor[1]
X = df.drop('target_var', axis=1)
y = df['target_var]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
bdt_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=2),
n_estimators=250,
learning_rate=0.3)
bdt_clf.fit(X_train, y_train)
y_pred = bdt_clf.predict(X_test)
#Reverse factorize (converting y_pred from 0s,1s, 2s, etc. to their original values
reversefactor = dict(zip(range(9),definitions))
y_test_rev = np.vectorize(reversefactor.get)(y_test)
y_pred_rev = np.vectorize(reversefactor.get)(y_pred)
I tried directly with the roc curve function, and also binarising the labels, but I always get the same error message.
def multiclass_roc_auc(y_test, y_pred):
lb = preprocessing.LabelBinarizer()
lb.fit(y_test)
y_test = lb.transform(y_test)
y_pred = lb.transform(y_pred)
return roc_curve(y_test, y_pred)
multiclass_roc_auc(y_test, y_pred_test)
The error message is:
ValueError: multilabel-indicator format is not supported
How could this be sorted out? Am I missing some important concept?

import csv to OrderedDict and predict using regression

i build a regression model to predict energy ( 1 columns ) from 5 variables ( 5 columns ) ... i used my exprimental data to train and fit the model and it works with good score ...
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv('new.csv')
X = data.drop(['E'],1)
y = data['E']
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5 ,
random_state=2)
from sklearn import ensemble
clf1 = ensemble.GradientBoostingRegressor(n_estimators = 400, max_depth =5,
min_samples_split = 2, loss='ls',
learning_rate = 0.1)
clf1.fit(X_train, y_train)
clf1.score(X_test, y_test)
but now i want to add a new csv file contain new data for mentioned 5 variables to OrderedDict and use the model to predict energy ...
with code bellow i manually insert row by row and it predict energy correctly
from collections import OrderedDict
new_data = OrderedDict([('H',48.52512), ('A',169.8379), ('P',55.52512),
('R',3.058758), ('Q',2038.055)])
new_data = pd.Series(new_data)
data = new_data.values.reshape(1, -1)
clf1.predict(data)
but i cant do this with huge datasets and need to import csv file ... i do the bellow but cant figure it out ....
data_2 = pd.read_csv('new2.csv')
X_new = OrderedDict(data_2)
new_data = pd.Series(X_new)
data = new_data.values.reshape(1, -1)
clf1.predict(data)
but gives me : ValueError: setting an array element with a sequence.
can anyone help me ??

Sklearn DecisionTreeClassifier F-Score Different Results with Each run

I'm trying to train a decision tree classifier using Python. I'm using MinMaxScaler() to scale the data, and f1_score for my evaluation metric. The strange thing is that I'm noticing my model giving me different results in a pattern at each run.
data in my code is a (2000, 7) pandas.DataFrame, with 6 feature columns and the last column being the target value. Columns 1, 3, and 5 are categorical data.
The following code is what I did to preprocess and format my data:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
# Data Preprocessing Step
# =============================================================================
data = pd.read_csv("./data/train.csv")
X = data.iloc[:, :-1]
y = data.iloc[:, 6]
# Choose which columns are categorical data, and convert them to numeric data.
labelenc = LabelEncoder()
categorical_data = list(data.select_dtypes(include='object').columns)
for i in range(len(categorical_data)):
X[categorical_data[i]] = labelenc.fit_transform(X[categorical_data[i]])
# Convert categorical numeric data to one-of-K data, and change y from Series to ndarray.
onehotenc = OneHotEncoder()
X = onehotenc.fit_transform(X).toarray()
y = y.values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
min_max_scaler = MinMaxScaler()
X_train_scaled = min_max_scaler.fit_transform(X_train)
X_val_scaled = min_max_scaler.fit_transform(X_val)
The next code is for the actual decision tree model training:
dectree = DecisionTreeClassifier(class_weight='balanced')
dectree = dectree.fit(X_train_scaled, y_train)
predictions = dectree.predict(X_val_scaled)
score = f1_score(y_val, predictions, average='macro')
print("Score is = {}".format(score))
The output that I get (i.e. the score) varies, but in a pattern. For example, it would circulate among data within the range of 0.39 and 0.42.
On some iterations, I even get the UndefinedMetricWarning, that claims "F-score is ill-defined and being set to 0.0 in labels with no predicted samples."
I'm familiar with what the UndefinedMetricWarning means, after doing some searching on this community and Google. I guess the two questions I have may be organized to:
Why does my output vary for each iteration? Is there something in the preprocessing stage that happens which I'm not aware of?
I've also tried to use the F-score with other data splits, but I always get the warning. Is this unpreventable?
Thank you.
You are splitting the dataset into train and test which randomly divides sets for both train and test. Due to this, when you train your model with different training data everytime, and testing it with different test data, you will get a range of F score depending on how well the model is trained.
In order to replicate the result each time you run, use random_state parameter. It will maintain a random number state which will give you the same random number each time you run. This shows that the random numbers are generated in the same order. This can be any number.
#train test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=13)
#Decision tree model
dectree = DecisionTreeClassifier(class_weight='balanced', random_state=2018)

How to handle categorical variables in sklearn GradientBoostingClassifier?

I am attempting to train models with GradientBoostingClassifier using categorical variables.
The following is a primitive code sample, just for trying to input categorical variables into GradientBoostingClassifier.
from sklearn import datasets
from sklearn.ensemble import GradientBoostingClassifier
import pandas
iris = datasets.load_iris()
# Use only data for 2 classes.
X = iris.data[(iris.target==0) | (iris.target==1)]
Y = iris.target[(iris.target==0) | (iris.target==1)]
# Class 0 has indices 0-49. Class 1 has indices 50-99.
# Divide data into 80% training, 20% testing.
train_indices = list(range(40)) + list(range(50,90))
test_indices = list(range(40,50)) + list(range(90,100))
X_train = X[train_indices]
X_test = X[test_indices]
y_train = Y[train_indices]
y_test = Y[test_indices]
X_train = pandas.DataFrame(X_train)
# Insert fake categorical variable.
# Just for testing in GradientBoostingClassifier.
X_train[0] = ['a']*40 + ['b']*40
# Model.
clf = GradientBoostingClassifier(learning_rate=0.01,max_depth=8,n_estimators=50).fit(X_train, y_train)
The following error appears:
ValueError: could not convert string to float: 'b'
From what I gather, it seems that One Hot Encoding on categorical variables is required before GradientBoostingClassifier can build the model.
Can GradientBoostingClassifier build models using categorical variables without having to do one hot encoding?
R gbm package is capable of handling the sample data above. I'm looking for a Python library with equivalent capability.
pandas.get_dummies or statsmodels.tools.tools.categorical can be used to convert categorical variables to a dummy matrix. We can then merge the dummy matrix back to the training data.
Below is the example code from the question with the above procedure carried out.
from sklearn import datasets
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve,auc
from statsmodels.tools import categorical
import numpy as np
iris = datasets.load_iris()
# Use only data for 2 classes.
X = iris.data[(iris.target==0) | (iris.target==1)]
Y = iris.target[(iris.target==0) | (iris.target==1)]
# Class 0 has indices 0-49. Class 1 has indices 50-99.
# Divide data into 80% training, 20% testing.
train_indices = list(range(40)) + list(range(50,90))
test_indices = list(range(40,50)) + list(range(90,100))
X_train = X[train_indices]
X_test = X[test_indices]
y_train = Y[train_indices]
y_test = Y[test_indices]
###########################################################################
###### Convert categorical variable to matrix and merge back with training
###### data.
# Fake categorical variable.
catVar = np.array(['a']*40 + ['b']*40)
catVar = categorical(catVar, drop=True)
X_train = np.concatenate((X_train, catVar), axis = 1)
catVar = np.array(['a']*10 + ['b']*10)
catVar = categorical(catVar, drop=True)
X_test = np.concatenate((X_test, catVar), axis = 1)
###########################################################################
# Model and test.
clf = GradientBoostingClassifier(learning_rate=0.01,max_depth=8,n_estimators=50).fit(X_train, y_train)
prob = clf.predict_proba(X_test)[:,1] # Only look at P(y==1).
fpr, tpr, thresholds = roc_curve(y_test, prob)
roc_auc_prob = auc(fpr, tpr)
print(prob)
print(y_test)
print(roc_auc_prob)
Thanks to Andreas Muller for instructing that pandas Dataframe should not be used for scikit-learn estimators.
Sure it can handle it, you just have to encode the categorical variables as a separate step on the pipeline. Sklearn is perfectly capable of handling categorical variables as well as R or any other ML package. The R package is still (presumably) doing one-hot encoding behind the scenes, it just doesn't separate the concerns of encoding and fitting in this case (as it arguably should).

Categories