Having problems with code "plot_2d_classification" - python

I am working my way through the online text "Applied Machine Learning in Python" at https://amueller.github.io/aml/01-ml-workflow/02-supervised-learning.html and am currently in the chapter on "Supervised Learning". The following snippet of code occurs toward the end of the chapter:
fig, axes = plt.subplots(2, 2, figsize=(8, 8))
for ax, n_neighbors in zip(axes.ravel(), [3, 5, 11, 33]):
    ax.set_title(f"n_neighbors={n_neighbors}")
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train[['mean compactness', 'worst concave points']], y_train)
    ax.scatter(X_train['mean compactness'], X_train['worst concave points'], c=y_train, cmap='bwr', s=2)
    plot_2d_classification(clf, np.array(X_train[['mean compactness', 'worst concave points']]), ax=ax, alpha=.4, cmap='bwr')
    ax.set_aspect("equal")
    ax.set_xlim(0.05, 0.17)
    ax.set_ylim(0.06, 0.2)
When I copy and paste it into Jupyter Notebook, it returns the following error:
NameError Traceback (most recent call last)
Input In [24], in <cell line: 2>()
4 clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train[['mean compactness', 'worst concave points']], y_train)
5 ax.scatter(X_train['mean compactness'], X_train['worst concave points'], c=y_train, cmap='bwr', s=2)
6 plot_2d_classification(clf, np.array(X_train[['mean compactness', 'worst concave points']]), ax=ax, alpha=.4, cmap='bwr')
7 ax.set_aspect("equal")
8 ax.set_xlim(0.05, 0.17)
NameError: name 'plot_2d_classification' is not defined
It is supposed to return a set of the following four plots.
I have done a Google search using the term "plot_2d_classification" and received a single page of links, none of which provide any insight.
I found the following two files by A. Mueller:
plot_2d_separator.py: https://github.com/amueller/mglearn/blob/master/mglearn/plot_2d_separator.py
which requires
plot_helpers.py: https://github.com/amueller/mglearn/blob/master/mglearn/plot_helpers.py
Cutting and pasting the snippet of code above together with those files produces additional errors, such that none of the three pieces of code runs successfully.
Any suggestions?

The error indicates that you didn't define the function plot_2d_classification.
Just copy the function plot_2d_classification from plot_2d_separator.py and make a small modification. The full code:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

def plot_2d_classification(classifier, X, fill=False, ax=None, eps=None,
                           alpha=1, cm=None):
    # Pad the plot area slightly beyond the data range.
    if eps is None:
        eps = X.std() / 2.
    if ax is None:
        ax = plt.gca()
    x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
    y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps
    # Build a fine grid over the 2-D feature space, predict the class of
    # every grid point, and show the predictions as a background image.
    xx = np.linspace(x_min, x_max, 1000)
    yy = np.linspace(y_min, y_max, 1000)
    X1, X2 = np.meshgrid(xx, yy)
    X_grid = np.c_[X1.ravel(), X2.ravel()]
    decision_values = classifier.predict(X_grid)
    ax.imshow(decision_values.reshape(X1.shape),
              extent=(x_min, x_max, y_min, y_max),
              aspect='auto', origin='lower', alpha=alpha, cmap=cm)
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())

# Load the breast cancer data as a DataFrame and split it.
cancer = load_breast_cancer(as_frame=True)
cancer_df = cancer.frame
data_train, data_test = train_test_split(cancer_df)
X_train = data_train.drop(columns='target')
y_train = data_train.target
X_test = data_test.drop(columns='target')
y_test = data_test.target

# One subplot per n_neighbors setting.
fig, axes = plt.subplots(2, 2, figsize=(8, 8))
for ax, n_neighbors in zip(axes.ravel(), [3, 5, 11, 33]):
    ax.set_title(f"n_neighbors={n_neighbors}")
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train[['mean compactness', 'worst concave points']], y_train)
    ax.scatter(X_train['mean compactness'], X_train['worst concave points'], c=y_train, cmap='bwr', s=2)
    plot_2d_classification(clf, np.array(X_train[['mean compactness', 'worst concave points']]), ax=ax, alpha=.4, cm='bwr')
    ax.set_aspect("equal")
    ax.set_xlim(0.05, 0.17)
    ax.set_ylim(0.06, 0.2)
Then run the code and you'll get the figure.

Related

How to plot 3d scatter with QDA decision boundary?

Using generated data, I am trying to plot the 3D decision boundary of QDA in 3D space. I used the sklearn library to fit the QDA, but I couldn't plot its 3D decision boundary.
Here is my code:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_classification
# Generate a random dataset for classification
X, y = make_classification(n_features=3, n_informative=2, n_redundant=0, n_repeated=0, random_state=0)
# Create and fit a QDA classifier
qda = QuadraticDiscriminantAnalysis()
qda.fit(X, y)
# Plot the decision boundary of the QDA classifier
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
z_min, z_max = X[:, 2].min() - 1, X[:, 2].max() + 1
xx, yy, zz = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1),
                         np.arange(z_min, z_max, 0.1))
X_grid = np.c_[xx.ravel(), yy.ravel(), zz.ravel()]
Z = qda.predict(X_grid)
Z = Z.reshape(xx.shape)
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.Paired)
ax.set_xlabel('X axis')
ax.set_ylabel('Y axis')
ax.set_zlabel('Z axis')
ax.plot_surface(xx, yy, zz, facecolors=plt.cm.Paired(Z), alpha=0.2)
plt.show()
The above code fails saying zz must be 2D instead of 3D, but I don't really get why it has to be 2D.
In the end I want to see a figure with a scatter plot of the binary classes and the QDA decision boundary over the data points.
With respect to z, this also needs to be a 2D array, since Axes3D.plot_surface maps each element of the 2D array z onto the 2D grid defined by x and y.
Hence, when you use your own x, y and z, make sure that you build x and y with numpy.meshgrid and then define z = f(x, y) as a 2D array over that grid.
This may also help: Link
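For illustration, here is a minimal sketch (my own, not from the linked answer) of one way to show a 3D boundary without plot_surface: evaluate the classifier's decision_function on a 3D grid and scatter only the grid points where it is close to zero, which approximates the boundary surface. The grid resolution (40 per axis) and tolerance (0.1) are arbitrary choices:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_classification

X, y = make_classification(n_features=3, n_informative=2, n_redundant=0,
                           n_repeated=0, random_state=0)
qda = QuadraticDiscriminantAnalysis().fit(X, y)

# Build a 3D grid over the data range and evaluate the decision function.
axes_1d = [np.linspace(X[:, i].min() - 1, X[:, i].max() + 1, 40) for i in range(3)]
xx, yy, zz = np.meshgrid(*axes_1d)
pts = np.c_[xx.ravel(), yy.ravel(), zz.ravel()]
scores = qda.decision_function(pts)

# Grid points where the decision function is near zero lie on the boundary.
boundary = pts[np.abs(scores) < 0.1]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.Paired)
ax.scatter(boundary[:, 0], boundary[:, 1], boundary[:, 2], s=2, alpha=0.2, color='gray')
plt.show()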

Compare sampler combining over- and under-sampling ERROR

I am trying to run a comparison of my oversampling and undersampling algorithms.
This is my y numpy array:
[0. 0. 0. ... 0. 0. 0.]
There are both 1s and 0s here:
percentage of 0s is 99.57113470676805
percentage of 1s is 0.4288652932319543
This is my X numpy array:
[[ 9.99139870e+00 6.87505736e-01 8.18184694e-01 5.79211424e-03
7.07254165e-02 -4.96940863e-02]
[ 1.45842820e-02 8.90971353e-01 5.40819886e-02 4.78689597e-03
-7.58403812e-01 1.25082521e-01]
[ 1.45743243e-02 8.77439954e-01 3.24491931e-02 4.73968535e-03
-5.17675263e-02 -5.86812372e-02]
...
[ 1.81681846e-03 2.17873637e+00 7.85498395e-01 5.44274803e-04
-4.03230077e-02 2.36304861e-02]
[ 1.81637248e-03 2.22724182e+00 7.85498395e-01 5.74896405e-04
2.43415000e-01 -2.68917605e-02]
[ 1.81600743e-03 2.29634509e+00 7.85498395e-01 5.93269365e-04
1.17457969e-01 1.15348925e-03]]
There are 6 X features, as you can see above, but the ERROR says there are only 2. I don't know where to fix this error so that the graph works.
This is what I am trying to measure:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

def plot_decision_function(X, y, clf, ax):
    """Plot the decision function of the classifier and the original data."""
    plot_step = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)
    )
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.4)
    ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor="k")
    ax.set_title(f"Resampling using {clf[0].__class__.__name__}")

def plot_resampling(X, y, sampler, ax):
    """Plot the resampled dataset using the sampler."""
    X_res, y_res = sampler.fit_resample(X, y)
    ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor="k")
    sns.despine(ax=ax, offset=10)
    ax.set_title(f"Decision function for {sampler.__class__.__name__}")
    return Counter(y_res)

samplers = [SMOTE(random_state=0), SMOTEENN(random_state=0), SMOTETomek(random_state=0)]
fig, axs = plt.subplots(3, 2, figsize=(15, 25))
for ax, sampler in zip(axs, samplers):
    clf = make_pipeline(sampler, LinearSVC()).fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    plot_resampling(X, y, sampler, ax[1])
fig.tight_layout()
plt.show()
The error that I am getting is simple, but I cannot find a way to fix it:
ValueError: X has 2 features per sample; expecting 10
ValueError Traceback (most recent call last)
in <module>
9 for ax, sampler in zip(axs, samplers):
10 clf = make_pipeline(sampler, LinearSVC()).fit(X, y)
---> 11 plot_decision_function(X, y, clf, ax[0])
12 plot_resampling(X, y, sampler, ax[1])
13 fig.tight_layout()

ValueError: Number of features of the model must match the input. Model n_features is 4 and input n_features is 2

I have a problem training my model when I use 4 features. I was able to train using the first 2 features, but I run into trouble when I use all 4.
The suspect code is here:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, :]
y = iris.target
tree_clf = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=42)
tree_clf.fit(X, y)

def plot_decision_boundary(clf, X, y, axes=[0, 10, 0, 5], iris=True, legend=False, plot_training=True):
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0', '#9898ff', '#a0faa0'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if not iris:
        custom_cmap2 = ListedColormap(['#7d7d58', '#4c4c7f', '#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
    if plot_training:
        plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", label="Iris-Setosa")
        plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", label="Iris-Versicolor")
        plt.plot(X[:, 0][y==2], X[:, 1][y==2], "g^", label="Iris-Virginica")
        plt.axis(axes)
    if iris:
        plt.xlabel("Petal length", fontsize=14)
        plt.ylabel("Petal width", fontsize=14)
    else:
        plt.xlabel(r"$x_1$", fontsize=18)
        plt.ylabel(r"$x_2$", fontsize=18, rotation=0)
    if legend:
        plt.legend(loc="lower right", fontsize=14)

plt.figure(figsize=(8, 4))
plot_decision_boundary(tree_clf, X, y)
plt.plot([2.45, 2.45], [0, 3], "k-", linewidth=2)
plt.plot([2.45, 7.5], [1.75, 1.75], "k--", linewidth=2)
plt.plot([4.95, 4.95], [0, 1.75], "k:", linewidth=2)
plt.plot([4.85, 4.85], [1.75, 3], "k:", linewidth=2)
plt.text(1.40, 1.0, "Depth=0", fontsize=15)
plt.text(3.2, 1.80, "Depth=1", fontsize=13)
plt.text(4.05, 0.5, "(Depth=2)", fontsize=11)
Can anyone help, please?

How can I visualize border/decision function of two classes using scikit-learn

I am pretty new to machine learning, so I still don't understand how to visualize the border between 2 classes in the bag-of-words case.
I found the following example of how to plot the data:
plot a document tfidf 2D graph
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=['alt.atheism', 'sci.space'])
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])
X = pipeline.fit_transform(newsgroups_train.data).todense()
pca = PCA(n_components=2).fit(X)
data2D = pca.transform(X)
plt.scatter(data2D[:, 0], data2D[:, 1], c=newsgroups_train.target)
plt.show()
In my project I use the SVC estimator:
clf = SVC(random_state=241, kernel = 'linear')
clf.fit(X,newsgroups_train.target)
I have tried to use the example at
http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html
but it didn't work in the text classification case.
So how can I add the border between the two classes to this plot?
Thank you!
The problem is that you need to select only 2 features in order to create the 2-dimensional decision surface plot. I will provide 2 examples: the first using the iris data and the second using your data.
I have also written an article about this here:
https://towardsdatascience.com/support-vector-machines-svm-clearly-explained-a-python-tutorial-for-classification-problems-29c539f3ad8?source=friends_link&sk=80f72ab272550d76a0cc3730d7c8af35
In both cases, I select only 2 features in order to create the plot.
Example 1 using iris data:
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features
y = iris.target

def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

model = svm.SVC(kernel='linear')
clf = model.fit(X, y)

fig, ax = plt.subplots()
# title for the plots
title = ('Decision surface of linear SVC ')
# Set up the grid for plotting.
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)

plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
ax.set_ylabel('y label here')
ax.set_xlabel('x label here')
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(title)
ax.legend()
plt.show()
RESULTS
Example 2 using your data:
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=['alt.atheism', 'sci.space'])
pipeline = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer())])
X = pipeline.fit_transform(newsgroups_train.data).todense()

# Select ONLY 2 features
X = np.array(X)
X = X[:, [0, 1]]
y = newsgroups_train.target

def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

model = svm.SVC(kernel='linear')
clf = model.fit(X, y)

fig, ax = plt.subplots()
# title for the plots
title = ('Decision surface of linear SVC ')
# Set up the grid for plotting.
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)

plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
ax.set_ylabel('y label here')
ax.set_xlabel('x label here')
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(title)
ax.legend()
plt.show()
RESULTS
Important note:
In the second case the plot is not nice, since we randomly selected only 2 features to create it. One way to make it nicer is the following: you could use a univariate ranking method (e.g. the ANOVA F-value test) to find the top-2 features out of the 22464 that you initially have, and then create the separating surface plot using those top-2 features.
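A minimal sketch of that ranking step (assuming the X and y built in Example 2 above): scikit-learn's SelectKBest with f_classif scores each feature by its ANOVA F-value and keeps the top k.

from sklearn.feature_selection import SelectKBest, f_classif

# Score every feature against the class labels and keep the 2 best.
selector = SelectKBest(f_classif, k=2)
X_top2 = selector.fit_transform(X, y)
print(selector.get_support(indices=True))  # column indices of the chosen features

X_top2 can then replace the randomly chosen X[:, [0, 1]] before fitting the SVC and building the meshgrid.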

Plot K-means clusters after TruncatedSVD Python

I'm trying to plot the results of running clustering on my data set but I'm getting the error:
File "cluster.py", line 93, in <module>
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
File "/usr/local/lib/python2.7/dist-packages/sklearn/cluster/k_means_.py", line 957, in predict
X = self._check_test_data(X)
File "/usr/local/lib/python2.7/dist-packages/sklearn/cluster/k_means_.py", line 867, in _check_test_data
n_features, expected_n_features))
ValueError: Incorrect number of features. Got 2 features, expected 73122
My call to fit() works fine, but the plotting is where it goes wrong.
Here's my code:
reduced_data = TruncatedSVD(n_components=2).fit_transform(X)
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=100, n_init=1, verbose=False)
kmeans.fit(X)

h = .02  # point in the mesh [x_min, x_max] x [y_min, y_max]

# Plot the decision boundary. For that, we will assign a color to each
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)

# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
Can anyone suggest how I can change up my code to get a diagram of the clusters?
The traceback is telling you what the issue is:
ValueError: Incorrect number of features. Got 2 features, expected 73122
The kmeans classifier was fit on 73122-dimensional training samples, so you cannot use it to make predictions on 2-dimensional test samples.
To fix your code simply change kmeans.fit(X) to kmeans.fit(reduced_data).
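In context, the corrected fit would look like this (the rest of the plotting code stays unchanged):

reduced_data = TruncatedSVD(n_components=2).fit_transform(X)
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=100, n_init=1, verbose=False)
kmeans.fit(reduced_data)  # fit in the same 2-D space the mesh is built in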
