Classifier training takes a long time due to the data size - python

I have a problem training my classifier.
I have 10 different music genres, each genre with 100 songs. After computing the MFCCs for one song I get a numpy array of shape (1293, 20).
Stacking everything together with np.vstack gives an array of shape (1293000, 20), plus another array for the labels.
When I run model.fit(features, labels), it takes a very long time.
I have also tried:
from sklearn.manifold import TSNE
X_embedded = TSNE(n_components=2).fit_transform(features)
X_embedded.shape
I've also tried reducing the number of songs from 1000 to 100, but it still takes a long time.
Any idea how I can classify songs when the arrays hold this much data?
Here is some code:
import numpy as np
import librosa
import sklearn.preprocessing

scaler = sklearn.preprocessing.StandardScaler()
y, sr = librosa.load('EXAMPLE1')
mfcc = librosa.feature.mfcc(y, sr=sr, n_mfcc=20).T
mfcc_scaled = scaler.fit_transform(mfcc)
mfcc_scaled.shape # (1293, 20)
y, sr = librosa.load('/Users/josetorronteras/AnacondaProjects/Neural-Networks/genres/pop/pop.00044.au')
mfcc2 = librosa.feature.mfcc(y, sr=sr, n_mfcc=20).T
mfcc_scaled2 = scaler.fit_transform(mfcc2)
mfcc_scaled2.shape # (1293, 20)
tmp_arr = []
tmp_arr.append(mfcc_scaled)
tmp_arr.append(mfcc_scaled2)
mafcc_list = np.vstack(tmp_arr)
mafcc_list.shape # (2586, 20)
a0 = np.zeros(len(mfcc_scaled))
a1 = np.ones(len(mfcc_scaled2))
labels = np.concatenate((a0, a1))
labels.shape # (2586,)
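For context, here is a rough sketch of how the full feature and label arrays for all 10 genres get stacked the same way (the 'genres/*' folder layout and the .au file pattern are placeholders, not the exact paths used):
import glob

genre_dirs = sorted(glob.glob('genres/*'))        # placeholder: one folder per genre
all_feats, all_labels = [], []
for label, genre_dir in enumerate(genre_dirs):
    for path in sorted(glob.glob(genre_dir + '/*.au')):
        y, sr = librosa.load(path)
        mfcc = librosa.feature.mfcc(y, sr=sr, n_mfcc=20).T      # (1293, 20) per song
        all_feats.append(scaler.fit_transform(mfcc))
        all_labels.append(np.full(len(mfcc), label))
features = np.vstack(all_feats)       # (1293000, 20) for 1000 songs
labels = np.concatenate(all_labels)   # (1293000,)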
Thanks

Related

Overlapping chunks in Xarray dataset for Kernel operations

I'm trying to run a 9x9 pixel kernel across a large satellite image with a custom filter. One satellite scene is ~40 GB, so to fit it into my RAM I'm using xarray's option to chunk my dataset with dask.
My filter includes a check whether the kernel is complete (i.e. no missing data at the edge of the image). In that case a NaN is returned to prevent a potential bias (and I don't really care about the edges). I now realized that this introduces NaNs not only at the edges of the image (expected behaviour), but also along the edges of each chunk, because the chunks don't overlap. dask provides options to create chunks with an overlap, but are there any comparable capabilities in xarray? I found this issue, but it doesn't seem like there has been any progress in this regard.
Some sample code (shortened version of my original code):
import numpy as np
import numba
import math
import xarray as xr

#numba.jit("f4[:,:](f4[:,:],i4)", nopython = True)
def water_anomaly_filter(input_arr, window_size = 9):
    # check if window size is odd
    if window_size%2 == 0:
        raise ValueError("Window size must be odd!")
    # prepare an output array with NaNs and the same dtype as the input
    output_arr = np.zeros_like(input_arr)
    output_arr[:] = np.nan
    # calculate how many pixels in x and y direction around the center pixel
    # are in the kernel
    pix_dist = math.floor(window_size/2-0.5)
    # create a dummy weight matrix
    weights = np.ones((window_size, window_size))
    # get the shape of the input array
    xn,yn = input_arr.shape
    # iterate over the x axis
    for x in range(xn):
        # determine limits of the kernel in x direction
        xmin = max(0, x - pix_dist)
        xmax = min(xn, x + pix_dist+1)
        # iterate over the y axis
        for y in range(yn):
            # determine limits of the kernel in y direction
            ymin = max(0, y - pix_dist)
            ymax = min(yn, y + pix_dist+1)
            # extract data values inside the kernel
            kernel = input_arr[xmin:xmax, ymin:ymax]
            # if the kernel is complete (i.e. not at image edge...) and it
            # is not all NaN
            if kernel.shape == weights.shape and not np.isnan(kernel).all():
                # apply the filter. In this example simply keep the original
                # value
                output_arr[x,y] = input_arr[x,y]
    return output_arr

def run_water_anomaly_filter_xr(xds, var_prefix = "band",
                                window_size = 9):
    variables = [x for x in list(xds.variables) if x.startswith(var_prefix)]
    for var in variables[:2]:
        xds[var].values = water_anomaly_filter(xds[var].values,
                                               window_size = window_size)
    return xds

def create_test_nc():
    data = np.random.randn(1000, 1000).astype(np.float32)
    rows = np.arange(54, 55, 0.001)
    cols = np.arange(10, 11, 0.001)
    ds = xr.Dataset(
        data_vars=dict(
            band_1=(["x", "y"], data)
        ),
        coords=dict(
            lon=(["x"], rows),
            lat=(["y"], cols),
        ),
        attrs=dict(description="Testdata"),
    )
    ds.to_netcdf("test.nc")

if __name__ == "__main__":
    # if required, create test data
    create_test_nc()
    # import data
    with xr.open_dataset("test.nc",
                         chunks = {"x": 50,
                                   "y": 50},
                         ) as xds:
        xds_2 = xr.map_blocks(run_water_anomaly_filter_xr,
                              xds,
                              template = xds).compute()
        xds_2["band_1"][:200,:200].plot()
This yields:
You can clearly see the rows and columns of NaNs along the edges of each chunk.
I'm happy for any suggestions. I would love to get the overlapping chunks (or any other solution) within xarray, but I'm also open to other solutions.
You can use Dask's map_overlap as follows:
import dask.array

arr = dask.array.map_overlap(
    water_anomaly_filter, xds.band_1.data, dtype='f4', depth=4, window_size=9
).compute()
da = xr.DataArray(arr, dims=xds.band_1.dims, coords=xds.band_1.coords)
Note that you will likely want to tune depth and window_size for your specific application.
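If more than one band needs filtering, the same pattern can be looped over the band_* variables and reassembled into a Dataset. A minimal sketch, assuming all band variables share the same dims and coords (depth=4 corresponds to window_size // 2, the number of pixels the kernel extends beyond the center):
filtered = {}
for var in [v for v in xds.data_vars if v.startswith("band")]:
    arr = dask.array.map_overlap(
        water_anomaly_filter, xds[var].data,
        dtype='f4', depth=4, window_size=9,
    )
    filtered[var] = xr.DataArray(arr, dims=xds[var].dims, coords=xds[var].coords)
xds_filtered = xr.Dataset(filtered).compute()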

Sklearn logistic regression shape error, but x, y shapes are consistent

I get a ValueError: Found input variables with inconsistent numbers of samples: [20000, 1] when I run the following, even though the numbers of rows in x and y are consistent. I load the RCV1 dataset, get the indices of the categories with the top x documents, create a list of tuples with an equal number of randomly selected positives and negatives for each category, and then finally attempt to run a logistic regression on one of the categories.
import numpy as np
import sklearn.datasets
from sklearn import model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from scipy import sparse

rcv1 = sklearn.datasets.fetch_rcv1()

def get_top_cat_indices(target_matrix, num_cats):
    cat_counts = target_matrix.sum(axis=0)
    #cat_counts = cat_counts.reshape((1,103)).tolist()[0]
    cat_counts = cat_counts.reshape((103,))
    #b = sorted(cat_counts, reverse=True)
    ind_temp = np.argsort(cat_counts)[::-1].tolist()[0]
    ind = [ind_temp[i] for i in range(5)]
    return ind

def prepare_data(x, y, top_cat_indices, sample_size):
    res_lst = []
    for i in top_cat_indices:
        # get column of indices with relevant cat
        temp = y.tocsc()[:, i]
        # all docs with labeled category
        cat_present = x.tocsr()[np.where(temp.sum(axis=1)>0)[0],:]
        # all docs other than labelled category
        cat_notpresent = x.tocsr()[np.where(temp.sum(axis=1)==0)[0],:]
        # get indices equal to 1/2 of sample size
        idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
        idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
        # concatenate the ids
        sampled_x_pos = cat_present.tocsr()[idx_cat,:]
        sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
        sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
        sampled_y_pos = temp.tocsr()[idx_cat,:]
        sampled_y_neg = temp.tocsr()[idx_nocat,:]
        sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
        res_lst.append((sampled_x, sampled_y))
    return res_lst

ind = get_top_cat_indices(rcv1.target, 5)
test_res = prepare_data(train_x, train_y, ind, 20000)
x, y = test_res[0]
print(x.shape)
print(y.shape)
LogisticRegression().fit(x, y)
Could it be an issue with the sparse matrices, or a problem with dimensionality (there are 20K samples and 47K features)?
When I run your code, I get the following error:
AttributeError: 'bool' object has no attribute 'any'
That's because the y for LogisticRegression needs to be a numpy array. So, I changed the last line to:
LogisticRegression().fit(x, y.A.flatten())
Then I get the following error:
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0
This is because your sampling code has a bug. You need to subset the y array to the rows having that category before applying the sampling indices. See the code below:
def prepare_data(x, y, top_cat_indices, sample_size):
    res_lst = []
    for i in top_cat_indices:
        # get column of indices with relevant cat
        temp = y.tocsc()[:, i]
        # all docs with labeled category
        c1 = np.where(temp.sum(axis=1)>0)[0]
        c2 = np.where(temp.sum(axis=1)==0)[0]
        cat_present = x.tocsr()[c1,:]
        # all docs other than labelled category
        cat_notpresent = x.tocsr()[c2,:]
        # get indices equal to 1/2 of sample size
        idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
        idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
        # concatenate the ids
        sampled_x_pos = cat_present.tocsr()[idx_cat,:]
        sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
        sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
        sampled_y_pos = temp.tocsr()[c1][idx_cat,:]
        print(sampled_y_pos.nnz)
        sampled_y_neg = temp.tocsr()[c2][idx_nocat,:]
        print(sampled_y_neg.nnz)
        sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
        res_lst.append((sampled_x, sampled_y))
    return res_lst
Now everything works like a charm.
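For completeness, a sketch of how the fixed function is then used end to end (assuming the train_x and train_y from the question stand for rcv1.data and rcv1.target, and flattening the sparse label column as above before fitting):
ind = get_top_cat_indices(rcv1.target, 5)
test_res = prepare_data(rcv1.data, rcv1.target, ind, 20000)   # assumed stand-ins for train_x, train_y
x, y = test_res[0]
LogisticRegression().fit(x, y.A.flatten())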

Visualizing Manifold Learning MNIST digit data fails

I am doing some exercises with the MNIST digits data, but it fails when I try to visualize it. The exercise is from a book, BTW. So I import the dataset
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784')
mnist.data.shape
then I just plot part of the data
import matplotlib.pyplot as plt

fig, ax = plt.subplots(6, 8, subplot_kw=dict(xticks=[], yticks=[]))
for i, axi in enumerate(ax.flat):
    axi.imshow(mnist.data[1250 * i].reshape(28, 28), cmap='gray_r')
then I perform my analysis on 1/30th of the data
# use only 1/30 of the data: full dataset takes a long time!
data = mnist.data[::30]
target = mnist.target[::30]
model = Isomap(n_components=2)
proj = model.fit_transform(data)
plt.scatter(proj[:, 0], proj[:, 1], c=target.astype(int),
            cmap=plt.cm.get_cmap('jet', 10)) # need to convert target into int
plt.colorbar(ticks=range(10))
plt.clim(-0.5, 9.5);
I am only interested in the 1s from the dataset and I want to see those, and this is where I get the error. Here is what I run:
from sklearn.manifold import Isomap
# Choose 1/4 of the "1" digits to project
data = mnist.data[mnist.target == 1][::4]
fig, ax = plt.subplots(figsize=(10, 10))
model = Isomap(n_neighbors=5, n_components=2, eigen_solver='dense')
plot_components(data, model, images=data.reshape((-1, 28, 28)),
                ax=ax, thumb_frac=0.05, cmap='gray_r')
this results in a
ValueError: Found array with 0 sample(s) (shape=(0, 784)) while a minimum of 1 is required.
I don't understand why the array is empty.
Target values for mnist data are strings and not integers.
Just change this line:
data = mnist.data[mnist.target == 1][::4]
to:
data = mnist.data[mnist.target == '1'][::4]
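Alternatively, the string labels can be cast to integers once, up front. A small sketch, assuming mnist was loaded with fetch_openml as in the question:
import numpy as np

target = np.asarray(mnist.target, dtype=int)   # fetch_openml returns the labels as strings
data = mnist.data[target == 1][::4]
print(data.shape)                              # no longer (0, 784)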

feature extraction using librosa

I am using the following code obtained from GitHub. This code extracts mfccs, chroma, melspectrogram, tonnetz, and spectral contrast features and saves the output as feat.npy. I want to extract some other features like rmse and zero crossing, but when I add the relevant code I get an error during concatenation.
# coding= UTF-8
#
# Author: Fing
# Date : 2017-12-03
#

import glob
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import soundfile as sf

def extract_feature(file_name):
    X, sample_rate = sf.read(file_name, dtype='float32')
    if X.ndim > 1:
        X = X[:,0]
    X = X.T

    # short term fourier transform
    stft = np.abs(librosa.stft(X))
    # mfcc
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    # chroma
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    # melspectrogram
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    # spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    return mfccs,chroma,mel,contrast,tonnetz

def parse_audio_files(parent_dir,sub_dirs,file_ext='*.wav'):
    features, labels = np.empty((0,193)), np.empty(0)
    for label, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            try:
                mfccs, chroma, mel, contrast,tonnetz = extract_feature(fn)
            except Exception as e:
                print("[Error] extract feature error. %s" % (e))
                continue
            ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
            print(ext_features)
            print(features)
            features = np.vstack([features,ext_features])
            # labels = np.append(labels, fn.split('/')[1])
            labels = np.append(labels, label)
        print("extract %s features done" % (sub_dir))
    return np.array(features), np.array(labels, dtype = np.int)

def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels,n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode

# Get features and labels
r = os.listdir("data/")
r.sort()
features, labels = parse_audio_files('data', r)
np.save('feat.npy', features)
np.save('label.npy', labels)
This code works fine, but I also want to extract other features like rmse and zero crossing rate. When I add
#rmse=np.mean(librosa.feature.rmse(y=X).T,axis=0)
I get the following error:
File "C:\Users\HP\Anaconda2\lib\site-packages\numpy\core\shape_base.py", line 237, in vstack
return _nx.concatenate([atleast_2d(_m) for _m in tup], 0)
ValueError: all the input array dimensions except for the concatenation axis must match exactly
How can I extract these other features and concatenate them as well?
The shapes of STFT and RMSE are different from those of MFCC and the other features. STFT and MFCC are 2-dimensional, while the others are 1-dimensional features.
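As an illustration only (not the original repository code), the extra features can be averaged over frames exactly like the existing ones so that everything stays 1-dimensional; the preallocated width in parse_audio_files then has to grow to match the longer feature vector. Note that in recent librosa versions rmse is named rms:
def extract_feature_extended(file_name):
    X, sample_rate = sf.read(file_name, dtype='float32')
    if X.ndim > 1:
        X = X[:, 0]
    X = X.T
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    # the new features, averaged over frames like the others, each adding one value
    rms = np.mean(librosa.feature.rms(y=X).T, axis=0)
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=X).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz, rms, zcr

# parse_audio_files must then preallocate np.empty((0, 195)) instead of (0, 193)
# and hstack all seven returned arrays so the row width matches for vstack.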

Storing different types of data into a 2D numpy array

I would like to know how I can store different kinds of data in a numpy array, in order to feed it to a machine learning SVC algorithm.
My goal is to get a dataframe of size (samples * features), like this:
With:
Feature 1, in gray, containing a list of size n
Feature 2, in red, containing a 2D numpy array of shape (i, k)
Feature ..., something else (an array for a pwelch spectrum, integers, floats, ...)
Feature n, in blue, containing an integer.
How can I do that in Python? Is this going to be OK for sklearn?
Here is the current error from the code below:
ValueError: setting an array element with a sequence.
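For reference, here is a minimal reproduction of this error: a single cell of a 2-D float array can only hold one number, not a whole array, which is what the assignment df[k, n] = data_to_compute[:,:,k] attempts further down.
import numpy as np

df = np.empty((2, 3))            # each cell holds exactly one float
df[0, 0] = np.zeros((4, 5))      # ValueError: setting an array element with a sequence.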
Code:
# -*- coding: utf-8 -*-
"""----------------------------------------------------------------------------
-------------------------------- Imports --------------------------------------
----------------------------------------------------------------------------"""
import os
import pandas as pd
import numpy as np
from scipy import io as sio
from scipy import signal
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

"""----------------------------------------------------------------------------
------------------------------ Parameters -------------------------------------
----------------------------------------------------------------------------"""
# Path to the clean EEG .mat files
EEG_path = "data"

# Listing of the .mat files
EEG = list()
for elt in os.listdir(EEG_path):
    if os.path.isfile(os.path.join(EEG_path, elt)):
        if '.mat' in elt[len(elt)-4:]:
            EEG.append(elt)

# Spectrum used
spectrum = ['all', (1,45), (8,12)]
nb_features = 3

"""----------------------------------------------------------------------------
------------------------------ Functions --------------------------------------
----------------------------------------------------------------------------"""
# Function on 1 channel
# Input: All points from one channel, for one epoch
def filter(x, n, fs, fc1, fc2):
    b, a = signal.butter(n, [fc1/(fs/2), fc2/(fs/2)], 'bandpass')
    y = signal.filtfilt(b, a, x)
    return y

def haming(x, L):
    # Symetric L-points hamming window
    window = signal.hamming(L)
    y = x * window.T # Element wise multiplication
    return y

# Function on one epoch
# Input is a matrix of size (channel * length)
def amp_mean(x):
    size = x.shape
    y = list()
    for i in range(size[0]):
        y.append(np.mean(x[i,:]))
    return y

def amp_max(x):
    size = x.shape
    y = list()
    for i in range(size[0]):
        y.append(np.max(abs(x[i,:])))
    return y

"""----------------------------------------------------------------------------
-------------------------------- Script ---------------------------------------
----------------------------------------------------------------------------"""
# Load data
s_EEG = "{}/{}".format(EEG_path, EEG[4])

data = sio.loadmat(s_EEG)['s_EEG']['data'][0][0].astype(float) # data[i, j ,k]
labels = sio.loadmat(s_EEG)['s_EEG']['labels'][0][0][0] # labels[k]
fs = sio.loadmat(s_EEG)['s_EEG']['sampling_rate'][0][0][0][0] # 500 Hz

size = data.shape

# Creates an empty data frame of size (epoch * features)
df = np.empty(shape = (size[2], nb_features * len(spectrum)))

# Filling the dataframe with features
# for every epoch
for k in range(size[2]):
    for freq in spectrum:
        data_to_compute = np.empty(shape = size, dtype = float)

        # Apply hamming
        if freq == 'all':
            for i in range(size[0]):
                data_to_compute[i,:,k] = haming(data[i,:,k], size[1])

        # Apply hamming after filtering
        else:
            for i in range(size[0]):
                data_to_compute[i,:,k] = haming(filter(data[i,:,k],
                                                       15, fs, freq[0], freq[1]), size[1])

        # data_to_compute is ready to have feature extracted
        for n in range(0, df.shape[1], nb_features):
            df[k, n] = data_to_compute[:,:,k]
            df[k, n+1] = amp_mean(data_to_compute[:,:,k])
            df[k, n+2] = amp_max(data_to_compute[:,:,k])

# X signal / Y label
X_train, X_test, Y_train, Y_test = train_test_split(data,
                                                    list(labels),
                                                    test_size=0.15,
                                                    random_state=42)

clf = SVC()
clf.fit(X_train, Y_train)
Thanks!
