Identifying misclassified raw data after machine learning in Python - python

I have a pretty basic question. My X data is df['input'], Y data is df['label']. This is my code:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
Xfeatures = df['input']
y = df['label']
tfidf_vec = TfidfVectorizer(max_features= MF,
max_df = MAXDF)
X = tfidf_vec.fit_transform(Xfeatures)
featurenames = tfidf_vec.get_feature_names()
X.todense()
df_vec = pd.DataFrame(X.todense(),columns=tfidf_vec.get_feature_names())
df_vec.T
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X, y,test_size = 0.33,random_state = 28)
This is the model that I run for text classification:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
lr_model = SVC()
lr_model.fit(x_train,y_train)
y_pred = lr_model.predict(x_test)
# Accuracy
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
I would like to identify the inputs that are misclassified (i.e. the texts in df['input']). I can write the predicted and actual categories to a CSV, but not the text that is misclassified (or the training data in general):
import csv
rows = zip(y_test, y_pred)
with open(r"C:\Users\erdem\Desktop\data.csv", "w", newline="") as f:
writer = csv.writer(f)
for row in rows:
writer.writerow(row)

Try to go with
x_test[y_test != y_pred]
y_test != y_pred is a boolean array with True on the misclassified rows (and False on the correct predictions), so you can use it as a mask on your test split. To get back to the original texts in df['input'], keep track of which rows went into the test set.
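A minimal sketch of one way to do that, assuming the same df, vectorizer, and split parameters as above: pass the raw df['input'] through train_test_split alongside X so the test rows stay aligned with their original texts.
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd

# split the raw text together with the TF-IDF matrix so indices stay aligned
x_train, x_test, y_train, y_test, text_train, text_test = train_test_split(
    X, y, df['input'], test_size=0.33, random_state=28)

model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# boolean mask over the test split: True where the prediction is wrong
mis_mask = (y_test != y_pred)

misclassified = pd.DataFrame({
    'text': text_test[mis_mask],                              # the original input text
    'actual': y_test[mis_mask],
    'predicted': pd.Series(y_pred, index=y_test.index)[mis_mask],
})
misclassified.to_csv(r"C:\Users\erdem\Desktop\misclassified.csv", index=False)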

Related

SHAP: Combine results across validation splits

I am hoping to combine (and plot) SHAP results across validation splits for my xgboost model. The closest I have found online is this with k-fold CV, but when I try both k-fold and train_test_split, I'm thrown this error:
AssertionError: The shape of the shap_values matrix does not match the shape of the provided data matrix.
For reproducibility, I fetched the data from here
Below is my code, adapted a little to work with my own data. A couple of notes:
shap.summary_plot(shap_values[1], X_test) is changed to shap.summary_plot(shap_values, X_test), as otherwise I was given this error: AssertionError: Summary plots need a matrix of shap_values, not a vector.
I used Explainer rather than TreeExplainer, as that was what I was able to run.
import numpy as np,warnings,shap
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from tqdm import tqdm
mod_dir = 'C:/Users/User/OneDrive - UHN/ML examples/'
df = pd.read_csv('{}heart_disease.csv'.format(mod_dir))
model_vars = pd.read_csv('{}hd_model_vars.csv'.format(mod_dir))
cat_vars = model_vars[model_vars['data_type']=="category"]
cat_vars = cat_vars['variable'].to_list()
df[cat_vars] = df[cat_vars].astype("category")
ids_outcome = df[['id','out']]
df = df.drop('out',axis=1)
xgb = XGBClassifier(enable_categorical = True,eval_metric="logloss",use_label_encoder=False,tree_method = "hist")
x = df.copy()
y = ids_outcome.copy()
y['out'] = y['out'].astype(int)
ls_shap_values = []
ls_x_val = []
for i in tqdm(range(1,4)):
    kf = KFold(n_splits=3, shuffle=True, random_state=i)
    for train_index, val_index in kf.split(x):
        pass
    x_train = x.iloc[train_index]
    y_train = y.iloc[train_index]
    x_val = x.iloc[val_index]
    y_val = y.iloc[val_index]
    # Save IDs for merging later
    train_ids = x_train[['id']]
    val_ids = x_val[['id']]
    # Set ID column as index for modelling
    x_train = x_train.set_index('id')
    y_train = y_train.set_index('id')
    x_val = x_val.set_index('id')
    y_val = y_val.set_index('id')
    xgb.fit(x_train, y_train)
    ls_x_val.append(val_index)
    explainer = shap.Explainer(xgb.predict, x_val)
    shap_values = explainer(x_val)
    ls_shap_values.append(shap_values)

val_set = ls_x_val[0]
shap_values = np.array(ls_shap_values[0])
for i in range(1,3):
    test_set = np.concatenate((val_set, ls_x_val[i]), axis=0)
    shap_values = np.concatenate((shap_values, np.array(ls_shap_values[i])), axis=1)
# bringing back variable names
X_val = pd.DataFrame(x.iloc[test_set], columns=x.columns)
shap.summary_plot(shap_values, X_val)
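A hedged guess at the cause, not a verified fix: the per-fold SHAP arrays are concatenated along axis=1 (features) while the validation rows are stacked along axis=0, which produces exactly this kind of shape mismatch. A minimal sketch that stacks both row-wise, reusing ls_x_val and ls_shap_values from the code above:
import numpy as np
import shap

# stack validation indices and SHAP values fold by fold, both along the row axis
all_idx = np.concatenate(ls_x_val, axis=0)
all_shap = np.concatenate([sv.values for sv in ls_shap_values], axis=0)

# rebuild the feature frame in the same row order; set 'id' as index so the
# columns match what the explainer saw during fitting
X_val_all = x.iloc[all_idx].set_index('id')
shap.summary_plot(all_shap, X_val_all)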

How to predict an individual value using SKlearn?

I am very new to Machine Learning and I would like to get a percentage returned for an individual array that I pass into the prediction model I have created.
I'm not sure how to go about getting the match percentage. I thought it was metrics.accuracy_score(Ytest, y_pred) but when I try that it gives me the following error:
ValueError: Found input variables with inconsistent numbers of samples: [4, 1]
I have no idea if this is the correct way to go about this.
import numpy as np #linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #For Visualisation
import seaborn as sns #For better Visualisation
from bs4 import BeautifulSoup #For Text Parsing
import mysql.connector
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
import docx2txt
import re
import csv
from sklearn import metrics
class Machine:
    TrainData = ''

    def __init__(self):
        self.TrainData = self.GetTrain()
        Data = self.ProcessData()
        x = Data[0]
        y = Data[1]
        x, x_test, y, y_test = train_test_split(x, y, stratify=y, test_size=0.25, random_state=42)
        self.Predict(x, y, '', x_test, y_test)

    def Predict(self, X, Y, Data, Xtext, Ytest):
        model = GaussianNB()
        model.fit(Xtext, Ytest)
        y_pred = model.predict([[1.0, 2.00613, 2, 5]])
        print("Accuracy:", metrics.accuracy_score(Ytest, y_pred))

    def ProcessData(self):
        X = []
        Y = []
        i = 0
        for I in self.TrainData:
            Y.append(I[4])
            X.append(I)
            i = i + 1
        i = 0
        for j in X:
            X[i][0] = float(X[i][0])
            X[i][1] = float(X[i][1])
            X[i][2] = int(X[i][2])
            X[i][3] = int(X[i][3])
            del X[i][4]
            i = i + 1
        return X, Y

    def GetTrain(self):
        file = open('docs/training/TI_Training.csv')
        csvreader = csv.reader(file)
        header = []
        header = next(csvreader)
        rows = []
        for row in csvreader:
            rows.append(row)
        file.close()
        return rows

Machine()
The error is pretty clear: Ytest has 4 samples, while y_pred has only one. You need an equal number of samples in each to compute any metric. I suspect you instead want to do
y_pred = model.predict(Xtext)
in your Predict function.
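If the goal is a percentage for a single sample, GaussianNB also exposes predict_proba; a short sketch (the four feature values below are just the placeholder sample from the question):
# class probabilities for one sample with the same 4 features as the training data
sample = [[1.0, 2.00613, 2, 5]]
probs = model.predict_proba(sample)        # shape (1, n_classes)
best = probs.argmax(axis=1)[0]             # index of the most likely class
print(model.classes_[best], round(probs[0, best] * 100, 2), "%")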

Pickling an XGBoost model makes it give different results when predicting

I've tried to reproduce the issue with synthetic data with no luck; it happens only with a particular (private) dataset.
Data generation
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
NOISE = 25
def real_vs_fitted(reg, X, y):
    fitted = reg.predict(X)
    real = y
    residual = real - fitted
    plt.scatter(fitted, residual)
NOISE = 25
samples = 1000
np.random.seed(42)
# Data
samples = 1000
A = np.random.randint(1,100,samples)
B = np.random.randint(1,100,samples)
C = np.random.randint(1,100,samples)
X = np.stack([A,B,C]).T
y = 3*A + 10*np.log(B) + A*B*C + np.random.normal(0,NOISE,len(X))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
Training and serializing
reg = XGBRegressor()
reg.fit(X_train,y_train)
real_vs_fitted(reg,X_test,y_test)
print(f'Reg object Hash: {hash(reg)}')
with open('model.pkl', 'wb') as f:
    pkl.dump(reg, f)
Unserializing
with open("model.pkl", "rb") as f:
reg = pkl.load(f)
real_vs_fitted(reg,X_test,y_test)
print(f'Reg object Hash: {hash(reg)}')
Hashes are different (I'm assuming this is because some attributes are not saved when serializing, but maybe not).
The per-row hash of X_test (pd.util.hash_pandas_object) is the same too
reg.get_booster().trees_to_dataframe() seems to be the same too
Any clues?
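One thing worth ruling out (a guess, not a confirmed diagnosis): hash(reg) on an object without a custom __hash__ is derived from the object's identity, so it will always differ after a pickle round trip even when the model is identical. Comparing the predictions and the booster dump directly is more telling; a small sketch:
import numpy as np
import pickle as pkl

preds_before = reg.predict(X_test)

with open('model.pkl', 'wb') as f:
    pkl.dump(reg, f)
with open('model.pkl', 'rb') as f:
    reg_loaded = pkl.load(f)

preds_after = reg_loaded.predict(X_test)

# identical trees should give (numerically) identical predictions
print(np.allclose(preds_before, preds_after))
print(reg.get_booster().get_dump() == reg_loaded.get_booster().get_dump())
If the predictions genuinely differ on the private dataset, XGBoost's native reg.save_model('model.json') / load_model() is an alternative serialization path worth trying.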

Make predictions with a trained model in Python

I'm very new to programming and machine learning but I've been trying to create a prediction model to tag product reviews. I found the following model:
import re
import numpy as np
import pandas as pd
# the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
# function to split the data for cross-validation
from sklearn.model_selection import train_test_split
# function for transforming documents into counts
from sklearn.feature_extraction.text import CountVectorizer
# function for encoding categories
from sklearn.preprocessing import LabelEncoder

dataset = pd.read_csv('dataset.csv')

def normalize_text(s):
    s = s.lower()
    # remove punctuation that is not word-internal (e.g., hyphens, apostrophes)
    s = re.sub(r'\s\W', ' ', s)
    s = re.sub(r'\W\s', ' ', s)
    # make sure we didn't introduce any double spaces
    s = re.sub(r'\s+', ' ', s)
    return s
dataset['TEXT'] = [normalize_text(s) for s in dataset['texto']]
# pull the data into vectors
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(dataset['TEXT'])
encoder = LabelEncoder()
y = encoder.fit_transform(dataset['codigo'])
# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
nb = MultinomialNB()
nb.fit(x_train, y_train)
y_predicted = nb.predict(x_test)
So far so good. But then, I tried to use that trained model to predict another set of data like this:
#new data
test = pd.read_csv('testset.csv')
test['TEXT'] = [normalize_text(s) for s in test['respostas']]
# pull the data into vectors
vectorizer = CountVectorizer()
classes = vectorizer.fit_transform(test['TEXT'])
classificacao = nb.predict(classes)
However, I got a "ValueError: dimension mismatch"
I'm not sure how to do this second step, which is using the model to predict the category of a fresh data set.
Thanks in advance for your assistance.
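A likely cause of the dimension mismatch, offered as a hedged guess from the code above: the second snippet fits a brand-new CountVectorizer on the test file, so its vocabulary (and therefore its column count) differs from the one the model was trained on. Reusing the already-fitted vectorizer with transform keeps the feature space consistent:
# reuse the vectorizer that was fit on the training data; only transform the new text
test = pd.read_csv('testset.csv')
test['TEXT'] = [normalize_text(s) for s in test['respostas']]

classes = vectorizer.transform(test['TEXT'])   # same vocabulary/columns as training
classificacao = nb.predict(classes)

# map the numeric predictions back to the original labels
rotulos = encoder.inverse_transform(classificacao)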

scikit-learn very low accuracy on classifiers (Naive Bayes, DecisionTreeClassifier)

I am using this dataset (Wealth Based on age), and the documentation states that the accuracy should be around 84%. Unfortunately, the accuracy of my program is at 25%.
To process the data I did the following:
1. Loaded the .txt data file and converted it to a .csv
2. Removed data with missing values
3. Extracted the class values <=50K and >50K and converted them to 0 and 1 respectively
4. For each attribute and for each string value of that attribute, I mapped it to an integer value, e.g. att1{'cs':0, 'cs2':1}, att2{'usa':0, 'greece':1}, and so on
5. Called Naive Bayes on the new integer data set
Python code:
import load_csv as load #my functions to do [1..5] of the list
import numpy as np
my_data = np.genfromtxt('out.csv', dtype = dt, delimiter = ',', skip_header = 1)
data = np.array(load.remove_missing_values(my_data)) #this funcion removes the missing data
features_train = np.array(load.remove_field_num(data, len(data[0]) - 1)) #this function extracts the data, e.g removes the class in the end of the data
label_train = np.array(load.create_labels(data))
features_train = np.array(load.convert_to_int(features_train))
my_data = np.genfromtxt('test.csv', dtype = dt, delimiter = ',', skip_header = 1)
data = np.array(load.remove_missing_values(my_data))
features_test = np.array(load.remove_field_num(data, len(data[0]) - 1))
label_test = np.array(load.create_labels(data)) #extracts the labels from the .csv data file
features_test = np.array(load.convert_to_int(features_test)) #converts the strings to ints(each unique string of an attribute is assigned a unique integer value
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.metrics import accuracy_score
clf = tree.DecisionTreeClassifier()
clf.fit(features_train, label_train)
predict = clf.predict(features_test)
score = accuracy_score(predict, label_test) #Low accuracy score
load_csv module:
import numpy as np
attributes = { 'Private':0, 'Self-emp-not-inc':1, 'Self-emp-inc':2, 'Federal-gov':3, 'Local-gov':4, 'State-gov':5, 'Without-pay':6, 'Never-worked':7,
'Bachelors':0, 'Some-college':1, '11th':2, 'HS-grad':3, 'Prof-school':4, 'Assoc-acdm':5, 'Assoc-voc':6, '9th':7, '7th-8th':8, '12th':9, 'Masters':10, '1st-4th':11, '10th':12, 'Doctorate':13, '5th-6th':14, 'Preschool':15,
'Married-civ-spouse':0, 'Divorced':1, 'Never-married':2, 'Separated':3, 'Widowed':4, 'Married-spouse-absent':5, 'Married-AF-spouse':6,
'Tech-support':0, 'Craft-repair':1, 'Other-service':2, 'Sales':3, 'Exec-managerial':4, 'Prof-specialty':5, 'Handlers-cleaners':6, 'Machine-op-inspct':7, 'Adm-clerical':8,
'Farming-fishing':9, 'Transport-moving':10, 'Priv-house-serv':11, 'Protective-serv':12, 'Armed-Forces':13,
'Wife':0, 'Own-child':1, 'Husband':2, 'Not-in-family':4, 'Other-relative':5, 'Unmarried':5,
'White':0, 'Asian-Pac-Islander':1, 'Amer-Indian-Eskimo':2, 'Other':3, 'Black':4,
'Female':0, 'Male':1,
'United-States':0, 'Cambodia':1, 'England':2, 'Puerto-Rico':3, 'Canada':4, 'Germany':5, 'Outlying-US(Guam-USVI-etc)':6, 'India':7, 'Japan':8, 'Greece':9, 'South':10, 'China':11, 'Cuba':12, 'Iran':13, 'Honduras':14, 'Philippines':15, 'Italy':16, 'Poland':17, 'Jamaica':18, 'Vietnam':19, 'Mexico':20, 'Portugal':21, 'Ireland':22, 'France':23, 'Dominican-Republic':24, 'Laos':25, 'Ecuador':26, 'Taiwan':27, 'Haiti':28, 'Columbia':29, 'Hungary':30, 'Guatemala':31, 'Nicaragua':32, 'Scotland':33, 'Thailand':34, 'Yugoslavia':35, 'El-Salvador':36, 'Trinadad&Tobago':37, 'Peru':38, 'Hong':39, 'Holand-Netherlands':40
}
def remove_field_num(a, i): # function to strip values
    names = list(a.dtype.names)
    new_names = names[:i] + names[i + 1:]
    b = a[new_names]
    return b

def remove_missing_values(data):
    temp = []
    for i in range(len(data)):
        for j in range(len(data[i])):
            if data[i][j] == '?': # If a missing value '?' is encountered, do not append the line to temp
                break
            if j == (len(data[i]) - 1) and len(data[i]) == 15:
                temp.append(data[i]) # Append the lines that do not contain '?'
    return temp

def create_labels(data):
    temp = []
    for i in range(len(data)): # Iterate through the data
        j = len(data[i]) - 1 # Extract the labels
        if data[i][j] == '<=50K':
            temp.append(0)
        else:
            temp.append(1)
    return temp

def convert_to_int(data):
    my_lst = []
    for i in range(len(data)):
        lst = []
        for j in range(len(data[i])):
            key = data[i][j]
            if j in (1, 3, 5, 6, 7, 8, 9, 13, 14):
                lst.append(int(attributes[key]))
            else:
                lst.append(int(key))
        my_lst.append(lst)
    temp = np.array(my_lst)
    return temp
I have tried both the decision tree and Naive Bayes, but the accuracy is very low. Any suggestions on what I am missing?
I guess the problem is in the preprocessing. It is better to encode the categorical variables as one-hot vectors (vectors of zeros with a single one in the position corresponding to the category) instead of raw integer codes, because integer codes impose an artificial ordering on the categories. Scikit-learn's DictVectorizer can help with that, and pandas makes the rest of the preprocessing much easier.
The following shows how you can achieve that with the help of the pandas library, which works very well alongside scikit-learn. This achieves an accuracy of 81.6% on a test set that is 20% of the entire data.
from __future__ import division
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.dict_vectorizer import DictVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.metrics.classification import classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
# Read the data into a pandas dataframe
df = pd.read_csv('adult.data.csv')
# Columns names
cols = np.array(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
'target'])
# numeric columns
numeric_cols = ['age', 'fnlwgt', 'education-num',
'capital-gain', 'capital-loss', 'hours-per-week']
# assign names to the columns in the dataframe
df.columns = cols
# replace the target variable to 0 and 1 for <50K and >50k
df1 = df.copy()
df1.loc[df1['target'] == ' <=50K', 'target'] = 0
df1.loc[df1['target'] == ' >50K', 'target'] = 1
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(
df1.drop('target', axis=1), df1['target'], test_size=0.2)
# numeric attributes
x_num_train = X_train[numeric_cols].as_matrix()
x_num_test = X_test[numeric_cols].as_matrix()
# scale to <0,1>
max_train = np.amax(x_num_train, 0)
max_test = np.amax(x_num_test, 0) # not really needed
x_num_train = x_num_train / max_train
x_num_test = x_num_test / max_train # scale test by max_train
# labels or target attribute
y_train = y_train.astype(int)
y_test = y_test.astype(int)
# categorical attributes
cat_train = X_train.drop(numeric_cols, axis=1)
cat_test = X_test.drop(numeric_cols, axis=1)
cat_train.fillna('NA', inplace=True)
cat_test.fillna('NA', inplace=True)
x_cat_train = cat_train.T.to_dict().values()
x_cat_test = cat_test.T.to_dict().values()
# vectorize (encode as one hot)
vectorizer = DictVectorizer(sparse=False)
vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
vec_x_cat_test = vectorizer.transform(x_cat_test)
# build the feature vector
x_train = np.hstack((x_num_train, vec_x_cat_train))
x_test = np.hstack((x_num_test, vec_x_cat_test))
clf = LogisticRegression().fit(x_train, y_train.values)
pred = clf.predict(x_test)
print classification_report(y_test.values, pred, digits=4)
print accuracy_score(y_test.values, pred)
clf = DecisionTreeClassifier().fit(x_train, y_train)
predict = clf.predict(x_test)
print classification_report(y_test.values, predict, digits=4)
print accuracy_score(y_test.values, predict)
clf = GaussianNB().fit(x_train, y_train)
predict = clf.predict(x_test)
print classification_report(y_test.values, predict, digits=4)
print accuracy_score(y_test.values, predict)
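The answer's code targets Python 2 and older scikit-learn module paths (sklearn.cross_validation, .as_matrix(), print statements). As a rough modern equivalent of the same idea, sketched with pandas get_dummies for the one-hot step (the cols array and file name from above are assumed):
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv('adult.data.csv', names=cols, skipinitialspace=True)
df['target'] = (df['target'] == '>50K').astype(int)

# one-hot encode every non-numeric column; numeric columns pass through unchanged
X = pd.get_dummies(df.drop('target', axis=1))
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
pred = clf.predict(X_test)
print(classification_report(y_test, pred, digits=4))
print(accuracy_score(y_test, pred))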
