I was reading a tutorial on LIME that showed how to visualise the result of a prediction by executing this code:
import pandas as pd
import numpy as np
import lime
import matplotlib.pyplot as plt
import random
import warnings
warnings.filterwarnings("ignore")
from sklearn.datasets import load_boston
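# note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this needs an older version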
boston = load_boston()
boston_df = pd.DataFrame(data=boston.data, columns=boston.feature_names)
boston_df["Price"] = boston.target
from sklearn.model_selection import train_test_split
X, Y = boston.data, boston.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.90, test_size=0.1, random_state=123, shuffle=True)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, Y_train)
from lime import lime_tabular
explainer = lime_tabular.LimeTabularExplainer(X_train, mode="regression", feature_names=boston.feature_names)
idx = random.randint(0, len(X_test) - 1)  # randint is inclusive on both ends, so avoid an out-of-range index
explanation = explainer.explain_instance(X_test[idx], lr.predict, num_features=len(boston.feature_names))
explanation.show_in_notebook()
The code produced this graph
My question is: how can I change the text size for the words negative and positive in that graph? I searched the docs but couldn't find any info on that matter.
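A possible workaround, since show_in_notebook() renders the explanation as HTML/D3 whose font sizes are not exposed through the LIME API: regenerate the explanation as a matplotlib figure with as_pyplot_figure() and control the text sizes through matplotlib. This is a sketch, not an official LIME option, and note that the matplotlib version of the plot labels the bars with feature names rather than the words negative/positive:
import matplotlib.pyplot as plt
plt.rcParams.update({"font.size": 16})   # default size for all new figure text
fig = explanation.as_pyplot_figure()     # same explanation as a matplotlib bar chart
for tick in fig.axes[0].get_yticklabels() + fig.axes[0].get_xticklabels():
    tick.set_fontsize(14)                # per-label override if needed
fig.tight_layout()
plt.show()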
How can I use this dataset "MC1" to plot a KNN decision boundary figure?
Here is my code. I have tried to use iloc and loc, but they did not work:
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from yellowbrick.contrib.classifier import DecisionViz
from yellowbrick.features import RadViz
from yellowbrick.style import set_palette
set_palette('flatui')
data_set = pd.read_csv('MC1.csv')
X = data_set.iloc[:, :-1]   # assuming the label is the last column of MC1.csv
y = data_set.iloc[:, -1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = tts(X, y, test_size=.4, random_state=42)
visualizer = RadViz(size=(500, 400))
viz = DecisionViz(
    KNeighborsClassifier(5), title="Nearest Neighbors", classes=['Y', 'N']
)
viz.fit(X_train, y_train)
viz.draw(X_test, y_test)
viz.show()
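One likely issue beyond the indexing: yellowbrick's DecisionViz draws a two-dimensional decision boundary, so it expects exactly two features. Here is a sketch of a working setup, where using the first two columns as features and the last column as the label are assumptions about MC1.csv:
# Sketch: reduce X to two features so DecisionViz can draw a 2-D boundary.
# The column choices below are illustrative assumptions about MC1.csv.
X = data_set.iloc[:, :2].values    # first two feature columns (assumption)
y = data_set.iloc[:, -1].values    # label in the last column (assumption)
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = tts(X, y, test_size=.4, random_state=42)
viz = DecisionViz(
    KNeighborsClassifier(5), title="Nearest Neighbors",
    features=['feature 1', 'feature 2'], classes=['Y', 'N']
)
viz.fit(X_train, y_train)
viz.draw(X_test, y_test)
viz.show()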
I'm trying to fit a linear kernel ridge regression model on a dataset with 8 features.
import pandas as pd
import urllib.request
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls'
urllib.request.urlretrieve(url, './Concrete_Data.xls')
data = pd.read_excel('./Concrete_Data.xls')
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
new_col_names = ["Cement", "BlastFurnaceSlag", "FlyAsh", "Water", "Superplasticizer","CoarseAggregate", "FineAggregate", "Age", "CC_Strength"]
curr_col_names = list(data.columns)
mapper = {}
for i, name in enumerate(curr_col_names):
    mapper[name] = new_col_names[i]
data = data.rename(columns=mapper)
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
from sklearn.kernel_ridge import KernelRidge
kr = KernelRidge(alpha=1.0)
kr.fit(x_train, y_train)
y_pred_kr = kr.predict(y_test)
When I run the code, I get an error saying the expected array should be 2D but a 1D array was passed. Could someone let me know what I am doing wrong?
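The error almost certainly comes from the last line: predict is called on the target vector y_test (1-D) instead of the feature matrix x_test (2-D). A minimal fix:
y_pred_kr = kr.predict(x_test)   # predict from the 2-D test features, not the 1-D targets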
I am trying to use scikitplot.metrics.plot_calibration_curve to plot calibration curves for my models and would like to change the line type (e.g. dashed, solid, dotted) in the resulting charts.
The simplest reproducible example I could make is below.
import scikitplot as skplt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# load the breast_cancer dataset and split it into train and test sets
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
rf = RandomForestClassifier()
lr = LogisticRegression()
rf_probas = rf.fit(X_train, y_train).predict_proba(X_test)
lr_probas = lr.fit(X_train, y_train).predict_proba(X_test)
clf_names = ['Random Forest', 'Logistic Regression']
probas_list = [rf_probas, lr_probas]
skplt.metrics.plot_calibration_curve(y_test,
                                     probas_list,
                                     clf_names)
Which gives exactly what I want:
But I would like to be able to change the line types so that the chart can be printed in black and white.
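A sketch of a possible workaround: scikit-plot functions return the matplotlib Axes they draw on, so the line styles can be changed after plotting. The assumption here is that the first line on the axes is the dotted "Perfectly calibrated" reference, which scikit-plot draws before the classifier curves:
import matplotlib.pyplot as plt
ax = skplt.metrics.plot_calibration_curve(y_test, probas_list, clf_names)
linestyles = ['--', '-.']                             # one style per classifier curve
for line, ls in zip(ax.get_lines()[1:], linestyles):  # skip the reference diagonal
    line.set_linestyle(ls)
ax.legend()                                           # rebuild the legend with the new styles
plt.show()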
I want to implement cross validation for a Random Forest Regressor on my data set. Is my code correct? Is this the right way to cross-validate?
Here is my sample data:
Wavelength Phase_velocity Shear_wave_velocity
1.50 202.69 240.73
1.68 192.72 240.73
1.79 205.54 240.73
........
Here is my code:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import KFold,train_test_split,cross_val_score
df = pd.read_csv("5.5-6.csv")
df.head()
X = df[['Wavelength', 'Phase_velocity']]   # column names as shown in the sample data above
y = df['Shear_wave_velocity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print (len(X_train),len(X_test),len(y_train),len(y_test))
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True)
rf = RandomForestRegressor(n_estimators=30000)
rf.fit(X_train, y_train)
results = cross_val_score(rf, X_train, y_train, cv=kfold) #Cross validation on training set
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print (rf.predict(X_test)) #array_output
print (y_test)
print (rf.score(X_test, y_test))
y_pred = rf.predict(X_test)
from sklearn.metrics import mean_absolute_error
print (mean_absolute_error(y_test,y_pred))
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse = sqrt(mean_squared_error(y_test,y_pred))
print(rmse)
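One note on the code above: results from cross_val_score is computed but never inspected. A short addition to report the per-fold scores (by default a regressor is scored with R²):
print(results)                                                      # one R^2 score per fold
print("CV mean: %0.3f, std: %0.3f" % (results.mean(), results.std()))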
The problem is that I get two totally different results when I run the DTC algorithm. I just want to make sure that I am writing the K-Fold cross validation correctly, and to understand why the K-Fold result is so much lower than the normal one.
I've run the code to get both the normal accuracy and the K-Fold accuracy; the code is below:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
xx = pd.read_csv("data1.dat", delimiter=",")
y = pd.read_csv("label.dat", delim_whitespace=True)
x = xx.to_numpy()                     # as_matrix() was removed in recent pandas
y = np.array(y).astype(int).ravel()   # numpy.int is deprecated; flatten to 1-D as scikit-learn expects
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_train, y_train)
y_predict_2 = clf2.predict(X_test)
print("DTC Accuracy : ")
print(accuracy_score(y_test, y_predict_2)*100)
DTC Accuracy :
97.6302083333333
from sklearn.model_selection import cross_val_score
DTC = DecisionTreeClassifier(random_state=42)
scores =cross_val_score(DTC, x, y, cv=10, scoring='accuracy')
print(scores.mean()*100)
35.331452470904985
from sklearn.model_selection import cross_val_score
DTC = DecisionTreeClassifier(random_state=42)
scores =cross_val_score(DTC, X_train, y_train, cv=10, scoring='accuracy')
print(scores.mean()*100)
97.34356
However, in the cross validation part, when I use X_train instead of x and y_train instead of y, the accuracy rises to 97 again.
I am wondering which one is the correct and sensible way to cross-validate: (x, y) or (X_train, y_train)?
Try shuffling your data and reducing the number of cross-validation folds. If the rows are ordered by class, plain K-Fold without shuffling can train on some classes and test on others, which would explain the collapse from ~97% to ~35%.
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
xx = pd.read_csv("data1.dat", delimiter=",")
y = pd.read_csv("label.dat", delim_whitespace=True)
x = xx.to_numpy()                          # as_matrix() was removed in recent pandas
y = y.to_numpy().astype(np.int32).ravel()  # flatten to the 1-D shape scikit-learn expects
x, y = shuffle(x, y, random_state=42)
DTC = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(DTC, x, y, cv=3, scoring='accuracy')
print(scores.mean()*100)
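An alternative sketch, not part of the original answer: let the splitter do the shuffling, and use StratifiedKFold so every fold keeps the class proportions:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Shuffle inside the splitter and preserve class proportions in each fold.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
DTC = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(DTC, x, y, cv=skf, scoring='accuracy')
print(scores.mean() * 100)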