Visualizing Manifold Learning on MNIST digit data fails - Python

I am doing some exercises with the MNIST digits data, but it fails when I try to visualize it. The exercise is from a book, BTW. So I import the dataset:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784')
mnist.data.shape
Then I just plot part of the data:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(6, 8, subplot_kw=dict(xticks=[], yticks=[]))
for i, axi in enumerate(ax.flat):
    axi.imshow(mnist.data[1250 * i].reshape(28, 28), cmap='gray_r')
Then I perform my analysis on 1/30th of the data:
# use only 1/30 of the data: full dataset takes a long time!
data = mnist.data[::30]
target = mnist.target[::30]

from sklearn.manifold import Isomap
model = Isomap(n_components=2)
proj = model.fit_transform(data)
plt.scatter(proj[:, 0], proj[:, 1], c=target.astype(int),
            cmap=plt.cm.get_cmap('jet', 10))  # need to convert target into int
plt.colorbar(ticks=range(10))
plt.clim(-0.5, 9.5);
I am only interested in the 1s from the dataset and I want to see those, and this is where I get the error. Here is what I run:
from sklearn.manifold import Isomap

# Choose 1/4 of the "1" digits to project
data = mnist.data[mnist.target == 1][::4]

fig, ax = plt.subplots(figsize=(10, 10))
model = Isomap(n_neighbors=5, n_components=2, eigen_solver='dense')
plot_components(data, model, images=data.reshape((-1, 28, 28)),
                ax=ax, thumb_frac=0.05, cmap='gray_r')
This results in:
ValueError: Found array with 0 sample(s) (shape=(0, 784)) while a minimum of 1 is required.
I don't understand why the array is empty.

Target values for the MNIST data fetched with fetch_openml are strings, not integers.
Just change this line:
data = mnist.data[mnist.target == 1][::4]
to:
data = mnist.data[mnist.target == '1'][::4]
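A quick check makes the failure visible (a minimal sketch, assuming mnist was loaded with fetch_openml as above):
print(type(mnist.target[0]))        # <class 'str'>: labels are strings, not ints
print((mnist.target == 1).sum())    # 0 -- an int never equals a string, so the mask is all False
print((mnist.target == '1').sum())  # nonzero: this mask actually selects the 1s
Alternatively, convert the targets once with mnist.target = mnist.target.astype(int); then the original comparison (and the c=target.astype(int) cast in the scatter call) works unchanged.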

Related

How do I export the point cloud test data on the validation set?

I am using the PointNet code for my own dataset. It does produce output, but only ever for 8 random samples. How do I take this code and export the predictions for the whole validation set after it goes through the model? (I do not need the images of the point cloud itself.) An example of the output would be something like an Excel file with the predicted label versus the actual label, but for the whole validation set, not just the 8.
This is the code:
history = model.fit(train_dataset, epochs=num_epocs, validation_data=test_dataset)
#################################################
########## Visualizing the predictions ##########
#################################################
data = test_dataset.take(1)
points, labels = list(data)[0]
points = points[:8, ...]
labels = labels[:8, ...]
# run test data through model
preds = model.predict(points)
preds = tf.math.argmax(preds, -1)
points = points.numpy()
### plot points with predicted class and label
fig = plt.figure(figsize=(10, 15))
for i in range(8):
    ax = fig.add_subplot(2, 4, i + 1, projection="3d")
    ax.scatter(points[i, :, 0], points[i, :, 1], points[i, :, 2])
    ax.set_title(
        "pred: {:}, label: {:}".format(
            CLASS_MAP[preds[i].numpy()], CLASS_MAP[labels.numpy()[i]]
        )
    )
    ax.set_axis_off()
plt.show()
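One way to do this, as a rough sketch rather than a definitive answer: iterate over every batch of test_dataset instead of taking just one, collect the predictions and labels, and write them out with pandas (to_csv here; to_excel works the same way if an Excel writer is installed). This assumes test_dataset yields (points, labels) batches and that CLASS_MAP is as in the code above.
import pandas as pd
import tensorflow as tf

all_preds, all_labels = [], []
for points, labels in test_dataset:  # every batch, not just .take(1)
    preds = tf.math.argmax(model.predict(points), -1)
    all_preds.extend(preds.numpy().tolist())
    all_labels.extend(labels.numpy().tolist())

results = pd.DataFrame({
    "prediction": [CLASS_MAP[p] for p in all_preds],
    "actual": [CLASS_MAP[l] for l in all_labels],
})
results.to_csv("validation_predictions.csv", index=False)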

How to use k-means clustering as a feature for classification with SVM?

I've already created a clustering and saved the model, but I'm confused about what I should do with this model and how to use it as a feature for classification.
The clustering is based on the coordinates of crime locations. After the data has been clustered, I want to use the cluster model as features in an SVM.
import pandas as pd
import matplotlib.pyplot as plt
import random
import numpy as np
import xlrd
import pickle
import tkinter as tk
from tkinter import *

plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')

# kmeans section
# Creating and labelling latitudes of X and Y and plotting it
data = pd.read_excel("sanfrancisco.xlsx")
x1 = data['X']
y1 = data['Y']
X = np.array(list(zip(x1, y1)))

# Elbow method
from sklearn.cluster import KMeans
wcss = []  # empty list
# check a range of up to 10 clusters
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++')  # will generate centroids
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)  # within-cluster sum of squared distances

plot1 = plt.figure(1)
plt.xlabel("Number of Clusters")
plt.ylabel("Within-Cluster Sum of Squares")
plt.plot(range(1, 11), wcss)

k = 3

# data visual section, e.g.: how many crimes in different months, which day of
# the week has the most crimes, which address/city has the most crimes, etc.

# X coordinates of random centroids
C_x = np.random.randint(0, np.max(X) - 20, size=k)
# Y coordinates of random centroids
C_y = np.random.randint(0, np.max(X) - 20, size=k)
C = np.array(list(zip(C_x, C_y)), dtype=np.float32)
print("Initial Centroids")
print(C)

# n_clusters takes the number of clusters; init chooses random data points for
# the initial centroids. By default scikit-learn runs 10 initializations and
# keeps the best one; to avoid that, n_init is set to 1.
model = KMeans(n_clusters=k, init='random', n_init=1)
model.fit_transform(X)
centroids = model.cluster_centers_  # final centroids

rgb_colors = {0.: 'y',
              1.: 'c',
              2.: 'fuchsia',
              }
if k == 4:
    rgb_colors[3.] = 'lime'
if k == 6:
    rgb_colors[3.] = 'lime'
    rgb_colors[4.] = 'orange'
    rgb_colors[5.] = 'tomato'

new_labels = pd.Series(model.labels_.astype(float))  # labels predicted by kmeans

plot2 = plt.figure(2)
plt.scatter(x1, y1, c=new_labels.map(rgb_colors), s=20)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='black', s=200)
plt.xlabel('Final Cluster Centers\n Iteration Count=' + str(model.n_iter_) +
           '\n Objective Function Value: ' + str(model.inertia_))
plt.ylabel('y')
plt.title("k-Means")
plt.show()

# save the model to disk
filename = 'clusteredmatrix.sav'
pickle.dump(model, open(filename, 'wb'))
Your problem is not very clear, but if you want to inspect the behavior of the clusters, I recommend a tool like Weka, so that you can cluster them freely and draw meaningful inferences before going into the more complex coding stuff!
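That said, if you do want to feed the clustering into an SVM, a common pattern is to use the cluster assignment of each point, or its distances to the centroids, as extra features. A minimal sketch, assuming X and the fitted model from the code above; the label vector y is hypothetical and must come from your own data (e.g. a crime category per record):
import numpy as np
from sklearn.svm import SVC

# Option 1: the cluster id of each point, as one extra feature
cluster_ids = model.predict(X).reshape(-1, 1)

# Option 2: the distance of each point to every centroid (k extra features)
centroid_dists = model.transform(X)

# Append either (or both) to the original coordinates
X_aug = np.hstack([X, cluster_ids, centroid_dists])

svm = SVC()
svm.fit(X_aug, y)  # y: hypothetical class labels, one per row of X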

Python ValueError: don't understand the error or how to fix it

I am following the tutorial here: https://www.analyticsvidhya.com/blog/2018/10/predicting-stock-price-machine-learningnd-deep-learning-techniques-python/#comment-155692
Instead of using the provided dataset, I am using one needed for my assignment.
The code used is:
# import packages
import pandas as pd
import numpy as np

# to plot within notebook
import matplotlib.pyplot as plt
%matplotlib inline

# setting figure size
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 10

# for normalizing data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

# read the file
df = pd.read_csv('C:/Users/Usert/Downloads/stock-20050101-to-20171231/stock-20050101-to-20171231/IBM_2006-01-01_to_2018-01-01.csv')

# print the head
df.head()

# setting index as date
df['Date'] = pd.to_datetime(df.Date, format='%Y-%m-%d')
df.index = df['Date']

# plot
plt.figure(figsize=(16, 8))
plt.plot(df['Close'], label='Close Price history')

# creating dataframe with date and the target variable
data = df.sort_index(ascending=True, axis=0)
new_data = pd.DataFrame(index=range(0, len(df)), columns=['Date', 'Close'])
for i in range(0, len(data)):
    new_data['Date'][i] = data['Date'][i]
    new_data['Close'][i] = data['Close'][i]

# splitting into train and validation
train = new_data[:987]
valid = new_data[987:]

new_data.shape, train.shape, valid.shape
# ((1235, 2), (987, 2), (248, 2))

train['Date'].min(), train['Date'].max(), valid['Date'].min(), valid['Date'].max()

# make predictions
preds = []
for i in range(0, 248):
    a = train['Close'][len(train) - 248 + i:].sum() + sum(preds)
    b = a / 248
    preds.append(b)

# calculate rmse
rms = np.sqrt(np.mean(np.power((np.array(valid['Close']) - preds), 2)))
rms

# plot
valid['Predictions'] = 0
valid['Predictions'] = preds
plt.plot(train['Close'])
plt.plot(valid[['Close', 'Predictions']])
This runs fine until "# calculate rmse", where it hits the error:
File "<ipython-input-92-1256d885493e>", line 65, in <module>
rms=np.sqrt(np.mean(np.power((np.array(valid['Close'])-preds),2)))
ValueError: operands could not be broadcast together with shapes (2033,) (248,)
Using "print(valid.shape)" and "print(len(preds))" as requested returns "(604, 3)" and "248".
Any idea how I change the numbers to fit my dataset as each time I change the numbers I create more errors?
Just FYI;
The dataset I am using has 7 columns named "Date, Open, High, Low, Close, Volume and Name" with 3021 rows of data including headers.
Whilst the one in the tutorial has 8 columns being "date, open, high, low, last, close, total_trade_quantity, and turnover" with 1236 rows including headers.
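For what it's worth, the hard-coded 987/248 split only matches the tutorial's 1,235-row file; with a larger file, valid ends up with far more rows than the 248 predictions, so the subtraction in the RMSE line cannot broadcast. A hedged sketch of one fix (assuming new_data is built as above and the validation set is smaller than the training set): derive both the split point and the prediction count from the data.
# derive the split from the data instead of hard-coding 987/248
split = int(len(new_data) * 0.8)  # e.g. an 80/20 split
train = new_data[:split]
valid = new_data[split:]

# make one prediction per validation row
preds = []
n_valid = len(valid)
for i in range(n_valid):
    a = train['Close'][len(train) - n_valid + i:].sum() + sum(preds)
    b = a / n_valid
    preds.append(b)

# valid['Close'] and preds now have the same length
rms = np.sqrt(np.mean(np.power((np.array(valid['Close']) - preds), 2)))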

Splitting coef_ into arrays applicable for multi-class

I use this function to plot the best and worst features (coef) for each label.
def plot_coefficients(classifier, feature_names, top_features=20):
    coef = classifier.coef_.ravel()
    for i in np.split(coef, 6):
        top_positive_coefficients = np.argsort(i)[-top_features:]
        top_negative_coefficients = np.argsort(i)[:top_features]
        top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    # create plot
    plt.figure(figsize=(15, 5))
    colors = ["red" if c < 0 else "blue" for c in i[top_coefficients]]
    plt.bar(np.arange(2 * top_features), i[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[top_coefficients], rotation=60, ha="right")
    plt.show()
Applying it to sklearn.LinearSVC:
if (name == "LinearSVC"):
    print(clf.coef_)
    print(clf.intercept_)
    plot_coefficients(clf, cv.get_feature_names())
The CountVectorizer used has a dimension of (15258, 26728).
It's a multi-class decision problem with 6 labels. Using .ravel() returns a flat array with a length of 6*26728 = 160368, meaning that all indices higher than 26728 are out of bounds for axis 1. Here are the top and bottom indices for one label:
i[ 0. 0. 0.07465654 ... -0.02112607 0. -0.13656274]
Top [39336 35593 29445 29715 36418 28631 28332 40843 34760 35887 48455 27753
33291 54136 36067 33961 34644 38816 36407 35781]
i[ 0. 0. 0.07465654 ... -0.02112607 0. -0.13656274]
Bot [39397 40215 34521 39392 34586 32206 36526 42766 48373 31783 35404 30296
33165 29964 50325 53620 34805 32596 34807 40895]
The first entry in the "top" list has the index 39336. This is equal to entry 39336 - 26728 = 12608 in the vocabulary. What would I need to change in the code to make this applicable?
EDIT:
X_train = sparse.hstack([training_sentences,entities1train,predictionstraining_entity1,entities2train,predictionstraining_entity2,graphpath_training,graphpathlength_training])
y_train = DFTrain["R"]
X_test = sparse.hstack([testing_sentences,entities1test,predictionstest_entity1,entities2test,predictionstest_entity2,graphpath_testing,graphpathlength_testing])
y_test = DFTest["R"]
Dimensions:
(15258, 26728)
(15258, 26728)
(0, 0) 1
...
(15257, 0) 1
(15258, 26728)
(0, 0) 1
...
(15257, 0) 1
(15258, 26728)
(15258L, 1L)
File "TwoFeat.py", line 708, in plot_coefficients
colors = ["red" if c < 0 else "blue" for c in i[top_coefficients]]
MemoryError
First, is it necessary to use ravel()?
LinearSVC (or in fact any other classifier which has coef_) gives out coef_ with shape:
coef_ : array, shape = [n_features] if n_classes == 2 else [n_classes, n_features]
Weights assigned to the features (coefficients in the primal problem).
So it has one row per class and one column per feature. For each class, you just need to access the right row. The order of the classes is available from the classifier.classes_ attribute.
Secondly, the indentation of your code is wrong. The plotting code should be inside the for loop, so that it plots for each class. Currently it is outside the scope of the loop, so it only plots for the last class.
Correcting these two things, here's a sample reproducible code to plot the top and bottom features for each class.
def plot_coefficients(classifier, feature_names, top_features=20):
    # Access the coefficients from the classifier
    coef = classifier.coef_
    # Access the classes
    classes = classifier.classes_
    # Iterate over the classes
    for i in range(len(classes)):
        print(classes[i])
        # Access the row containing the coefficients for this class
        class_coef = coef[i]
        # Below this, I have just replaced 'i' in your code with 'class_coef'
        # Pass this to get top and bottom features
        top_positive_coefficients = np.argsort(class_coef)[-top_features:]
        top_negative_coefficients = np.argsort(class_coef)[:top_features]
        # Concatenate the above two
        top_coefficients = np.hstack([top_negative_coefficients,
                                      top_positive_coefficients])
        # create plot
        plt.figure(figsize=(10, 3))
        colors = ["red" if c < 0 else "blue" for c in class_coef[top_coefficients]]
        plt.bar(np.arange(2 * top_features), class_coef[top_coefficients], color=colors)
        feature_names = np.array(feature_names)
        # Start the ticks at 0, one per bar
        # (your code started at 1, which shifted the labels)
        plt.xticks(np.arange(2 * top_features),
                   feature_names[top_coefficients], rotation=60, ha="right")
        plt.show()
Now just use this method as you like:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space']

dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)
vectorizer = CountVectorizer()

# Just to replace classes from integers to their actual labels;
# you can use anything you like in y
y = []
mapping_dict = dict(enumerate(dataset.target_names))
for i in dataset.target:
    y.append(mapping_dict[i])

# Learn the words from data
X = vectorizer.fit_transform(dataset.data)

clf = LinearSVC(random_state=42)
clf.fit(X, y)

plot_coefficients(clf, vectorizer.get_feature_names())
Output from the above code (each class name printed, followed by its bar plot):
'alt.atheism'
'comp.graphics'
'sci.space'
'talk.religion.misc'

Classifier training takes a long time due to the size of the data

I have a problem training my classifier.
I have 10 different music genres, each genre with 100 songs; after computing the MFCCs I have a NumPy array of shape (1293, 20) per song.
Stacking them all together with np.vstack gives an array of (1293000, 20), and another one for the labels.
When I run model.fit(features, labels), it takes a very long time.
I have also tried:
from sklearn.manifold import TSNE
X_embedded = TSNE(n_components=2).fit_transform(features)
X_embedded.shape
I've tried reducing the songs from 1000 to 100, but it still takes a long time.
Any idea how I can classify songs with arrays holding so much data?
I put some code:
scaler = sklearn.preprocessing.StandardScaler()
y, sr = librosa.load('EXAMPLE1')
mfcc = librosa.feature.mfcc(y, sr=sr, n_mfcc=20).T
mfcc_scaled = scaler.fit_transform(mfcc)
mfcc_scaled.shape # (1293, 20)
y, sr = librosa.load('/Users/josetorronteras/AnacondaProjects/Neural-Networks/genres/pop/pop.00044.au')
mfcc2 = librosa.feature.mfcc(y, sr=sr, n_mfcc=20).T
mfcc_scaled2 = scaler.fit_transform(mfcc2)
mfcc_scaled2.shape # (1293, 20)
tmp_arr = []
tmp_arr.append(mfcc_scaled)
tmp_arr.append(mfcc_scaled2)
mafcc_list = np.vstack(tmp_arr)
mafcc_list.shape # (2586, 20)
a0 = np.zeros(len(mfcc_scaled))
a1 = np.ones(len(mfcc_scaled2))
labels = np.concatenate((a0, a1))
labels.shape # (2586,)
Thanks
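One common way out, offered as a sketch rather than a definitive fix: instead of treating every MFCC frame as its own sample (1,293 rows per song), summarize each song into a single feature vector, e.g. the per-coefficient mean and standard deviation. A thousand songs then become a (1000, 40) matrix, which most scikit-learn classifiers fit in seconds. Assuming per-song arrays shaped like mfcc_scaled above:
import numpy as np

def song_vector(mfcc):
    # mfcc has shape (n_frames, 20); return one (40,) vector per song
    return np.concatenate([mfcc.mean(axis=0), mfcc.std(axis=0)])

# one row per song instead of 1293 rows per song
features = np.vstack([song_vector(m) for m in (mfcc_scaled, mfcc_scaled2)])
labels = np.array([0, 1])  # one genre id per song

# model.fit(features, labels) now scales with the number of songs, not frames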
