I'm following this example from tsfresh: Multiclass. It is a classification example using feature extraction and a decision tree classifier.
import matplotlib.pylab as plt
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from tsfresh.examples.har_dataset import download_har_dataset, load_har_dataset, load_har_classes
download_har_dataset()
df = load_har_dataset()
y = load_har_classes()
df["id"] = df.index
df = df.melt(id_vars="id", var_name="time").sort_values(["id", "time"]).reset_index(drop=True)
X = extract_features(df[df["id"] < 500], column_id="id", column_sort="time", impute_function=impute)
X_train, X_test, y_train, y_test = train_test_split(X, y[:500], test_size=.2)
classifier_full = DecisionTreeClassifier()
classifier_full.fit(X_train, y_train)
Now I am trying to visualize the classification report using:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.naive_bayes import GaussianNB
from yellowbrick.datasets import load_occupancy
from yellowbrick.classifier import classification_report
classes = [str(c) for c in np.unique(y)]
visualizer = classification_report(GaussianNB(), X_train, y_train, X_test, y_test, classes=classes, support=True)
However, when running the script, it gives:
ModelError: could not decode [1 2 3 4 5 6] y values to [1 2 3 4 5 6] labels
Does anyone know why this happens?
I tried comparing it with this example: Scikit classification example, where I found the classification_report, and I think the problem is in the data structure, but I can't find any difference. Any help is appreciated. Thank you!
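A likely cause, judging from the error message: the class names were cast to strings while y_train and y_test still hold integers, so yellowbrick cannot match the y values against the given labels. Below is a sketch of the same call with the dtypes kept consistent; this assumes the int/str mismatch really is the culprit (casting y itself to str instead should work equally well):
# keep classes in the same dtype as the y values so yellowbrick can decode them
classes = np.unique(y_train).tolist()
visualizer = classification_report(
    GaussianNB(), X_train, y_train, X_test, y_test,
    classes=classes, support=True
)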
Related
I am creating a decision tree from 19 columns with 2 values each, and while splitting the tree I get an AttributeError.
Error states:
groups = clf.groupby('column1')
AttributeError: 'DecisionTreeClassifier' object has no attribute 'groupby'
import pandas as pd
import numpy as np
from matplotlib import pyplot
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
data_set = pd.read_csv('hw6.data.csv.gz')
x = data_set.iloc[:, :10].values
y = data_set.iloc[:, 10]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=42)
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
split_values = clf.tree_.threshold
impurity = clf.tree_.impurity
print(data_set)
groups = clf.groupby('column1')
def split_group(group):
    subgroup1 = group[group['column2'] == 'value1']
    subgroup2 = group[group['column2'] == 'value2']
    return [subgroup1, subgroup2]
split_data = groups.apply(split_group)
print(split_data)
clf is not a DataFrame but a fitted DecisionTreeClassifier instance, so it has no groupby method. Call groupby on the DataFrame instead:
Before:
groups = clf.groupby('column1')
After:
groups = data_set.groupby('column1')
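If the underlying goal was to inspect the splits the fitted tree actually learned (rather than grouping the raw rows), scikit-learn can print them directly. A short sketch reusing the clf fitted above, assuming, as in the question's code, that the first 10 columns are the features:
from sklearn.tree import export_text

# human-readable dump of every split (feature and threshold) in the fitted tree
print(export_text(clf, feature_names=[str(c) for c in data_set.columns[:10]]))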
How can I use this dataset "MC1" to plot a KNN decision boundary figure?
Here is my code. I have tried to use iloc and loc, but neither worked:
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from yellowbrick.contrib.classifier import DecisionViz
from yellowbrick.features import RadViz
from yellowbrick.style import set_palette
set_palette('flatui')
data_set = pd.read_csv('MC1.csv')
X, y = data_set
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = tts(X, y, test_size=.4, random_state=42)
visualizer = RadViz(size=(500, 400))
viz = DecisionViz(
    KNeighborsClassifier(5), title="Nearest Neighbors", classes=['Y', 'N']
)
viz.fit(X_train, y_train)
viz.draw(X_test, y_test)
viz.show()
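The failing line is probably X, y = data_set: unpacking a DataFrame iterates over its column names, so X and y end up being two strings rather than the data. Selecting the columns explicitly should get past that. Here is a sketch with hypothetical column names ('feature1', 'feature2', and 'label' are placeholders for whatever MC1.csv actually contains); note that DecisionViz plots a 2-D decision boundary and therefore expects exactly two features:
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from yellowbrick.contrib.classifier import DecisionViz

data_set = pd.read_csv('MC1.csv')

# placeholder column names: substitute the real feature and target columns
X = data_set[['feature1', 'feature2']].values   # two features, as DecisionViz requires
y = data_set['label'].values

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = tts(X, y, test_size=.4, random_state=42)

viz = DecisionViz(
    KNeighborsClassifier(5), title="Nearest Neighbors",
    features=['feature1', 'feature2'], classes=['Y', 'N']
)
viz.fit(X_train, y_train)
viz.draw(X_test, y_test)
viz.show()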
I was reading this tutorial on LIME, which showed how to visualise the result of a prediction by executing this code:
import pandas as pd
import numpy as np
import lime
import matplotlib.pyplot as plt
import random
import warnings
warnings.filterwarnings("ignore")
from sklearn.datasets import load_boston
boston = load_boston()
boston_df = pd.DataFrame(data=boston.data, columns = boston.feature_names)
boston_df["Price"] = boston.target
from sklearn.model_selection import train_test_split
X, Y = boston.data, boston.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.90, test_size=0.1, random_state=123, shuffle=True)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, Y_train)
from lime import lime_tabular
explainer = lime_tabular.LimeTabularExplainer(X_train, mode="regression", feature_names= boston.feature_names)
idx = random.randint(0, len(X_test) - 1)  # randint is inclusive on both ends, so cap at len - 1
explanation = explainer.explain_instance(X_test[idx], lr.predict, num_features=len(boston.feature_names))
explanation.show_in_notebook()
The code produced this graph
My question is: how can I change the text size of the words "negative" and "positive" in that graph? I searched the docs but couldn't find any info on that matter.
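There doesn't appear to be a documented font-size option for show_in_notebook; its output comes from LIME's bundled HTML/JS template. One hedged workaround is to render the explanation as a matplotlib figure instead (a different view than the notebook widget, but one where every piece of text obeys matplotlib's font settings):
import matplotlib.pyplot as plt

plt.rcParams.update({"font.size": 14})   # enlarges all text in the matplotlib rendering
fig = explanation.as_pyplot_figure()     # bar chart of the same feature contributions
fig.tight_layout()
plt.show()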
The problem is that I get two totally different results when I run the decision tree (DTC) algorithm. I want to make sure that I am writing the k-fold cross validation correctly, or to understand why the k-fold result is so much lower than the plain train/test one.
I ran the code below to get both the plain accuracy and the k-fold accuracy:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
xx = pd.read_csv("data1.dat", delimiter=",")
y = pd.read_csv("label.dat", delim_whitespace=True)
x = xx.to_numpy()
y = y.to_numpy().astype(int)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_train, y_train)
y_predict_2 = clf2.predict(X_test)
print("DTC Accuracy : ")
print(accuracy_score(y_test, y_predict_2)*100)
DTC Accuracy :
97.6302083333333
from sklearn.model_selection import cross_val_score
DTC = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(DTC, x, y, cv=10, scoring='accuracy')
print(scores.mean()*100)
35.331452470904985
from sklearn.model_selection import cross_val_score
DTC = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(DTC, X_train, y_train, cv=10, scoring='accuracy')
print(scores.mean()*100)
97.34356
However, in the cross validation part, when I pass X_train instead of x and y_train instead of y, the accuracy rises back to 97.
I am wondering which pair, (x, y) or (X_train, y_train), is the correct and sensible choice for cross validation.
Try shuffling your data and reducing the number of cross validation folds. A KFold split does not shuffle by default: the folds are contiguous blocks of rows, so if your file is ordered (for example grouped by class or by recording session), each fold's training portion can be unrepresentative of its test portion, which would explain the very low score.
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
xx = pd.read_csv("data1.dat", delimiter=",")
y = pd.read_csv("label.dat", delim_whitespace=True)
x = xx.to_numpy()
y = y.to_numpy().astype(np.int32).ravel()  # flatten to 1-D labels
x, y = shuffle(x, y, random_state=42)
DTC = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(DTC, x, y, cv=3, scoring='accuracy')
print(scores.mean()*100)
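An alternative to shuffling by hand (a sketch, assuming y is a 1-D array of class labels as produced above): pass cross_val_score an explicitly shuffled, stratified splitter, which keeps each fold's class balance close to the full dataset's:
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(DTC, x, y, cv=cv, scoring='accuracy')
print(scores.mean() * 100)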
I want to evaluate the model on the data I've used here in scikit-learn. I am using the DecisionTreeClassifier.score function, but when running the code I receive a ValueError:
Can't handle mix of continuous and multiclass.
Here is the code I use:
from sklearn import datasets
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
nba = pd.read_excel(r"C:\Users\user\Desktop\nba.xlsx")
X = nba.drop('平均得分', axis=1)  # '平均得分' = 'average score'
y = nba['平均得分']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
nba_tree = DecisionTreeClassifier()
nba_tree.fit(X_train, y_train.astype('int'))
y_pred = nba_tree.predict(X_test)
nba_tree.score(X_test, y_test)
It looks like your target variable 平均得分 ('average score') is continuous, so you are probably trying to solve a regression problem. The error itself comes from scoring: fit received integer-cast labels (y_train.astype('int')), so the classifier predicts integers, while score compares those multiclass predictions against the raw continuous y_test, hence the "mix of continuous and multiclass". If regression is what you want, try DecisionTreeRegressor instead of DecisionTreeClassifier.
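A minimal sketch of the regression variant, reusing the split from the question (note that score then reports R² rather than accuracy):
from sklearn.tree import DecisionTreeRegressor

nba_tree = DecisionTreeRegressor()
nba_tree.fit(X_train, y_train)           # no .astype('int') cast needed for regression
y_pred = nba_tree.predict(X_test)
print(nba_tree.score(X_test, y_test))    # R^2 on the continuous target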