Need Decision Tree Visualization Using Python GraphViz Using 10-fold - python

I have a code, I need to visualize the output of the decision tree using GraphViz in Python. 10 fold cross-validation is employed for getting the evaluation metrics that are quite adequate. Just need guide for graphviz
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn import tree
data = pd.read_csv("goddess_2.csv")
label_Label = LabelEncoder()
data["Label"] = label_Label.fit_transform(data['Label'])
X = data.drop("Label", axis = 1)
y = data['Label']
#Decision Tree Binary Class Classifier Model Building
model = DecisionTreeClassifier()
scoring = ['accuracy','precision_weighted', 'recall_weighted','f1_weighted']
scores = cross_validate(model, X, y, cv=10,scoring=scoring)
Accuracy = scores['test_accuracy'].mean()
Precision = scores['test_precision_weighted'].mean()
Recall = scores['test_recall_weighted'].mean()
F1Score= scores['test_f1_weighted'].mean()
print("********** Decision Tree *********")
print("\n")
print("Accuracy,",round(Accuracy * 100,3))
print("\n")
print("Precision,",round(Precision * 100,4))
print("\n")
print("Recall,",round(Recall * 100,4))
print("\n")
print("F1-Score,",round(F1Score * 100,4))
print("\n")
clf = DecisionTreeClassifier(random_state=1234)
model = clf.fit(X, y)
text_representation = tree.export_text(clf)
print(text_representation)

Related

How to properly use SMOTE for data balancing

I wanted to know if it is required to use SMOTE only after splitting test and train dataset. I used smote after train_test_split for Churn prediction, but haven't got any significant improvement pre or post SMOTE. Below is my entire code using smote. Not sure where the issue is. I wanted to know if I used SMOTE properly.
Below is the code
import pandas as pd
import numpy as np
from datetime import timedelta,datetime,date
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from numpy import percentile
tel_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
tel_data.info()
tel_data.isnull().sum()
num = {"No":0,"Yes":1}
tel_data = tel_data.replace({"Churn":num})
# also total charges seem to be object. coverting to integer
tel_data['TotalCharges'] = pd.to_numeric(tel_data['TotalCharges'])
tel_data.head(2)
tel_data['Churn'].value_counts()
plt.figure(figsize=(6,5))
sns.countplot(tel_data['Churn'])
plt.show()
# using pd.to_numeric to convert the TotalCharges column to numeric will help us see the null values
tel_data.TotalCharges = pd.to_numeric(tel_data.TotalCharges, errors="coerce")
tel_data.isnull().sum()
# deleting the rows with null values
tel_data = tel_data.dropna(axis=0)
# encoding all categorical variables using one hot encoding
tel_data = pd.get_dummies(tel_data,drop_first=True,columns=['gender','Partner','Dependents',
'PhoneService','MultipleLines','InternetService',
'OnlineSecurity','OnlineBackup','DeviceProtection',
'TechSupport','StreamingTV','StreamingMovies',
'Contract','PaperlessBilling','PaymentMethod'])
# splitting the dataset (removing 'customerID' since it doesnt serve any purpose)
X = tel_data.drop(['customerID','Churn'],axis=1)
y = tel_data['Churn']
# performing feature selection using chi2 test
from sklearn.feature_selection import chi2
chi_scores = chi2(X,y)
print('chi_values:',chi_scores[0],'\n')
print('p_values:',chi_scores[1])
p_values = pd.Series(chi_scores[1],index = X.columns)
p_values.sort_values(ascending = False , inplace = True)
plt.figure(figsize=(12,8))
p_values.plot.bar()
plt.show()
tel_data.drop(['PhoneService_Yes','gender_Male','MultipleLines_No phone service','MultipleLines_Yes','customerID'],axis=1,inplace=True)
tel_data.head(2)
# splitting the dataset (removing 'customerID' since it doesnt serve any purpose)
X = tel_data.drop(['Churn'],axis=1)
y = tel_data['Churn']
# import sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score
# splitting into train and test data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)
model_xgb_1 = xgb.XGBClassifier(n_estimators=100,
learning_rate=0.3,
max_depth=5,
random_state=42 )
xgbmod = model_xgb_1.fit(X_train,y_train)
# checking accuracy of training data
print('Accuracy of XGB classifier on training set: {:.2f}'
.format(xgbmod.score(X_train, y_train)))
y_xgb_pred = trn_xgbmod.predict(X_test)
print(classification_report(y_test, y_xgb_pred))
from imblearn.over_sampling import SMOTE
smote_preprocess = SMOTE(random_state=42)
X_train_resampled,y_train_resampled = smote_preprocess.fit_resample(X_train,y_train)
model_xgb_smote = xgb.XGBClassifier(n_estimators=100,
learning_rate=0.3,
max_depth=5,
random_state=42 )
xgbmod_smote = model_xgb_smote.fit(X_train_resampled,y_train_resampled)
# checking accuracy of training data
print('Accuracy of XGB classifier on training set: {:.2f}'
.format(xgbmod_smote.score(X_train_resampled,y_train_resampled)))
y_xgb_pred_smote = xgbmod_smote.predict(X_test)
print(classification_report(y_test, y_xgb_pred_smote))

Buffer dtype mismatch, expected 'SIZE_t' but got 'long long'

I developed 3 ML models in spyder, they are Linear Regression, Polynomial Regression, and Random Forest Regression. In sypder all of them worked well. However when I deployed on Django for creating a web app, Random Forest was raising " ValueError: Buffer type mismatch, expected 'SIZE_t' but got 'long long' ". (I tried removing randomforest and the other two models worked well).
Check this out first:-
Model Developed in Sypder
"""****************** Import Lib ******************"""
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
"""****************** Loading dataset ******************"""
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target
"""****************** Data Preprocessing ******************"""
""" Data Analysis """
# Check Null
dataset.isnull().sum()
# Calculate X and y
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values.reshape(-1,1)
# train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25)
""" Visualizing Data """
corr = dataset.corr()
sns.heatmap(corr, annot=True, cmap='Blues')
sns.pairplot(dataset)
"""****************** Regression Models ******************"""
""" Linear Regression """
from sklearn.linear_model import LinearRegression
regressor_linear = LinearRegression()
regressor_linear.fit(X_train, y_train)
cv_linear = cross_val_score(estimator = regressor_linear, X=X_train, y=y_train, cv=10)
""" Polynomial Regression """
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_train)
regressor_poly2 = LinearRegression()
regressor_poly2.fit(X_poly, y_train)
cv_poly2 = cross_val_score(estimator=regressor_poly2, X=X_poly, y=y_train, cv=10)
""" Random Forest Regression """
from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)
regressor_rf.fit(X_train, y_train.ravel())
cv_rf = cross_val_score(estimator=regressor_rf, X=X_train, y=y_train.ravel(), cv=10)
"""****************** Measuring the Error ******************"""
models=[
('Linear Regression', cv_linear.mean()),
('Polynomial Regression (2)', cv_poly2.mean()),
('Random Forest Regression', cv_rf.mean())
]
cv_scores = pd.DataFrame(data=models, columns=['Model','CV Score'])
"""****************** Dump ******************"""
from sklearn.externals import joblib
joblib.dump(regressor_linear,'regressor_linear_jb')
joblib.dump(regressor_poly2,'regressor_poly2_jb')
joblib.dump(regressor_rf,'regressor_rf_jb')
Django Implementation Code
from django.shortcuts import render
from django.http import HttpResponse
import json
from django.http import JsonResponse
import pandas as pd
import numpy as np
from sklearn.externals import joblib
from sklearn.preprocessing import PolynomialFeatures
# Create your views here.
# ML Code
regressor_linear = joblib.load('./models/regressor_linear_jb')
regressor_poly2 = joblib.load('./models/regressor_poly2_jb')
regressor_rf = joblib.load('./models/regressor_rf_jb')
# ML Code End
def predict(request):
temp_data = [
0.16902,
0,
25.65,
0,
0.581,
5.986,
88.4,
1.9929,
2,
188,
19.1,
385.02,
14.81,
]
temp_df = pd.DataFrame(temp_data).transpose()
predict = {}
# Linear Regression
predict['Linear Regressor'] = round(regressor_linear.predict(temp_df)[0, 0], 2)
# Polynomial Regression.
regressor_poly = PolynomialFeatures(degree=2)
temp_df_poly = regressor_poly.fit_transform(temp_df)
predict['Polynomial Regressor'] = round(regressor_poly2.predict(temp_df_poly)[0, 0], 2)
# Random Forest Regression
predict['Random Forest Regressor'] = round(regressor_rf.predict(temp_df)[0],2)
return JsonResponse(predict)
Switch environment for Django to anaconda and this will be resolved
Jupyter notebook was using anaconda environment while Django was using different environment which was installed on system
(Major problem --> one was 32 bit while other was 64 bit)

how to generate the confusion matrix through cross validation in python?

I am using the iris flower dataset to do the sorting. I need to make a confusion matrix through cross validation (fold = 10) but I don't know how to do it. I generated the confusion matrix of only one round.
# I am using TPOT autoML library for python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import LabelEncoder
tpot_data = pd.read_csv('iris.csv')
tpot_data = tpot_data.apply(LabelEncoder().fit_transform)
features = tpot_data.drop('species', axis=1).values
training_features, testing_features, training_target, testing_target = \
train_test_split(features, tpot_data['species'].values, random_state=10)
exported_pipeline = make_pipeline(StackingEstimator(estimator=GaussianNB()),
MultinomialNB(alpha=0.01, fit_prior=False)
)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(testing_target, results))
pd.crosstab(testing_target, results, rownames=['Actual Class'], colnames=['Predicted Class'])
from sklearn.model_selection import cross_val_score
array_cross_val_score = cross_val_score(estimator=exported_pipeline, X=training_features,
y=training_target, cv=10, scoring='accuracy')
# I would like the confusion matrix to be based on the average cross-validation
np.mean(array_cross_val_score)

How to know whether i am overfitting/underfitting my data?

So i have to build a regression model to predict wine quality based on 11 inputs. Currently i am evaluating the Mean Squared Error, Mean absolute error and R2 scores of various algorithms. I want to make a decision on which algorithm to use, but before i do, i want to make sure my data is not being overfitted/underfitted. Below is the link to the dataset i use (its a bit different but the data is exactly the same) as well as my entire code.
Any help is greatly appreciated!
Data:
https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
Also, the kagggle link where i copied most of my code from:
https://www.kaggle.com/jhansia/regression-models-analysis-on-the-wine-quality
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
wine = pd.read_csv('wineQualityReds.csv', usecols=lambda x: 'Unnamed' not in x,)
wine.head()
y = wine.quality
X = wine.drop('quality',axis = 1)
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(X,y,random_state = 0, stratify = y)
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(train_x)
train_x_scaled = scaler.transform(train_x)
test_x_scaled = scaler.transform(test_x)
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
models = []
models.append(('DecisionTree', DecisionTreeRegressor()))
models.append(('RandomForest', RandomForestRegressor()))
models.append(('GradienBoost', GradientBoostingRegressor()))
models.append(('SVR', SVR()))
names = []
for name,model in models:
kfold = model_selection.KFold(n_splits=5,random_state=2)
cv_results = model_selection.cross_val_score(model,train_x_scaled,train_y, cv= kfold, scoring = 'neg_mean_absolute_error')
names.append(name)
msg = "%s: %f" % (name, -1*(cv_results).mean())
print(msg)
model = RandomForestRegressor()
model.fit(train_x_scaled,train_y)
pred_y = model.predict(test_x_scaled)
from sklearn import metrics
print('Mean Squared Error:', metrics.mean_squared_error(test_y, pred_y))
print('Mean Absolute Error:', metrics.mean_absolute_error(test_y, pred_y))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(test_y, pred_y)))
print('R2:', metrics.r2_score(test_y, pred_y))
You can use cross validation on the data sets to find whether it is over fitting or under fitting.

Feature extraction per fold with Scikit

for a machine learning experiment I have to perform feature selection. I have no division in training and test set because of 10-fold cross validation. Someone told me that I have to do the feature selection per fold. But I have no idea how to do that. Here is a part of my code.
vec = DictVectorizer()
X = vec.fit_transform(instances) # No train/ test set, because we'll use 10-fold cross validation
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X) # To make sure everything is on the same scale
enc = LabelEncoder()
y = enc.fit_transform(labels)
#feature selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif
feat_sel = SelectKBest(mutual_info_classif, k=200)
X_fs = feat_sel.fit_transform(X_scaled, y)
#train a classifier
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
y_pred = model_selection.cross_val_predict(clf, X_fs, y, cv=10)
Can someone help me with the selection per fold?
Answering the second question that you posted.
You can use cross validation and see the results:
Do:
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFECV
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
feat_sel = SelectKBest(mutual_info_classif, k=200)
clf = MultinomialNB()
pipe = Pipeline([('mutual_info',feat_sel), ('naive_bayes',clf)])
scores = cross_val_score(pipe, X_scaled, y, cv =10, scoring = 'accuracy')
print(np.mean(scores))
You can use Pipeline, join the feature selector and the classifier into a pipeline and cross-validate the pipeline.
Reference: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

Categories