How to add data via directories for training images - python

I have been going through the git repository by flyyufelix (https://github.com/flyyufelix/cnn_finetune) to fine-tune an Inception v3 network. I want to train the network to detect a disease, so I have two sets of images: one with the disease and one without.
The repository's scripts call X_train, Y_train, X_valid, Y_valid = load_data(), which loads the CIFAR-10 dataset, and the README asks us to create our own load_data() function. The author's code is below:
import cv2
import numpy as np
from keras.datasets import cifar10
from keras import backend as K
from keras.utils import np_utils
# How many of the leading CIFAR-10 images load_cifar10_data keeps per split
nb_train_samples = 3000 # 3000 training samples
nb_valid_samples = 100 # 100 validation samples
# Number of label columns produced by to_categorical in load_cifar10_data
num_classes = 10
def load_cifar10_data(img_rows, img_cols):
    """Load a resized subset of CIFAR-10 with one-hot encoded labels.

    Only the first ``nb_train_samples`` training images and the first
    ``nb_valid_samples`` validation images are kept.

    Args:
        img_rows: Target image height in pixels.
        img_cols: Target image width in pixels.

    Returns:
        Tuple ``(X_train, Y_train, X_valid, Y_valid)``. The label arrays are
        one-hot encoded with ``num_classes`` columns.
    """
    # Load cifar10 training and validation sets
    (X_train, Y_train), (X_valid, Y_valid) = cifar10.load_data()

    # cv2.resize expects dsize as (width, height), i.e. (cols, rows). Passing
    # (rows, cols) silently swaps the axes whenever the target is not square.
    dsize = (img_cols, img_rows)

    # Resize training images
    if K.image_dim_ordering() == 'th':
        # Channels-first layout: move channels last for OpenCV, then restore.
        X_train = np.array([
            cv2.resize(img.transpose(1, 2, 0), dsize).transpose(2, 0, 1)
            for img in X_train[:nb_train_samples, :, :, :]
        ])
        X_valid = np.array([
            cv2.resize(img.transpose(1, 2, 0), dsize).transpose(2, 0, 1)
            for img in X_valid[:nb_valid_samples, :, :, :]
        ])
    else:
        X_train = np.array([
            cv2.resize(img, dsize)
            for img in X_train[:nb_train_samples, :, :, :]
        ])
        X_valid = np.array([
            cv2.resize(img, dsize)
            for img in X_valid[:nb_valid_samples, :, :, :]
        ])

    # Transform targets to keras compatible format
    Y_train = np_utils.to_categorical(Y_train[:nb_train_samples], num_classes)
    Y_valid = np_utils.to_categorical(Y_valid[:nb_valid_samples], num_classes)
    return X_train, Y_train, X_valid, Y_valid
How can I write a load_data() function so that X_train, Y_train, X_valid, Y_valid = load_data() works when my images are stored in directories on my PC?

Use Keras' ImageDataGenerator() class and call flow_from_directory() on it. The labels will be automatically inferred from the directory names. So if you have a directory titled "disease," then Keras would infer that all images within that directory are labeled as "disease," and the same thing would be true for another directory titled "no disease," for example.
I demonstrate how to prepare image data for training a CNN in Keras in this video. The first half of the video is about image organization on disk, and then the second half goes through the process described above.

Follow this tutorial once and all your doubts will get cleared.
https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html

Related

I have the same number of files, and still having different shape on them, ANN machine learning

I am trying to create neural network with python, it is kind of ANN network to use in classification problem. The purpose of the neural network is to classify who is speaking, whether it is me or someone else. I have the data in 2 folders.
folders image
one is called me, they are audios of me speaking, and another is called other, audios of other people speaking.
View of the wav files(audio data)
The problem is that it cannot train the network because the data is not the same length, even though there are exactly 18 files in each folder, not one more, not one less.
When I do
print(X.shape)
print(y.shape)
gives this.
Result of X, y shapes
The shapes are not the same even though there are 18 audio files in each folder.
model.py
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import numpy as np
from scipy.io import wavfile
from pathlib import Path
import os
### DATASET
# Lazily enumerate every .wav file under the "me" folder (recursive glob).
pathlist = Path(os.path.abspath('Voiceclassification/Data/me/')).rglob('*.wav')
# My voice data
# NOTE(review): `data` is rebound for every file that is read and nothing is
# accumulated, so after the loop it appears to hold only the last file's
# samples - confirm the loop extent, since the original indentation was lost.
for path in pathlist:
filename = str(path)
# convert audio to numpy array and then 2D to 1D np Array
samplerate, data = wavfile.read(filename)
#print(f"sample rate: {samplerate}")
data = data.flatten()
#print(f"data: {data}")
# Same enumeration for the "other" folder.
pathlist2 = Path(os.path.abspath('Voiceclassification/Data/other/')).rglob('*.wav')
# other voice data
# NOTE(review): same pattern as above - `data2` is overwritten per file.
for path2 in pathlist2:
filename2 = str(path2)
samplerate2, data2 = wavfile.read(filename2)
data2 = data2.flatten()
#print(data2)
### ADAPTING THE DATA FOR THE MODEL
# NOTE(review): X and y are both flattened audio sample arrays; no class-label
# array (me vs. other) is built anywhere in this script, and their lengths
# depend on the lengths of the source recordings.
X = data # My voice
y = data2 # Other data
#print(X.shape)
#print(y.shape)
### Training the model
# Hold out 10% of the rows for testing, with a fixed seed.
# NOTE(review): train_test_split needs X and y to have the same number of
# rows - presumably the source of the reported shape mismatch.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
# Performing feature scaling: fit the scaler on the training split only and
# reuse its statistics for the test split.
# NOTE(review): the arrays above are 1-D after flatten(); StandardScaler
# presumably expects 2-D (samples, features) input - confirm.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
### Creating the ANN
ann = tf.keras.models.Sequential()
# First hidden layer of the ann
ann.add(tf.keras.layers.Dense(units=6, activation="relu"))
# Second one
ann.add(tf.keras.layers.Dense(units=6, activation="relu"))
# Output layer
# NOTE(review): six sigmoid units are paired with binary_crossentropy below;
# a two-class (me/other) classifier would normally use a single unit - confirm.
ann.add(tf.keras.layers.Dense(units=6, activation="sigmoid"))
# Compile our neural network
ann.compile(optimizer="adam",
loss="binary_crossentropy",
metrics=['accuracy'])
# Fit ANN
ann.fit(x_train, y_train, batch_size=32, epochs=100)
# Persist the trained model under the path 'train_model.model'.
ann.save('train_model.model')
Any idea?
This is probably because your wav audio files have different lengths. They may all be nominally 10 seconds long, but if they differ by even a few milliseconds, the number of samples (and therefore the shape of your data) will differ. What you can do is trim your wav files so that all of them are exactly 10.00 seconds long, with no extra milliseconds.

How to convert a folder of images into X and Y batches with Keras?

Say I have a folder of images such as:
PetData
|
Dog - images
|
Cat - images
How would I transform it into (x_train, y_train),(x_test, y_test) format? I see this format used extensively with the MNIST dataset which goes like:
# Handle to Keras' built-in MNIST dataset module.
mnist = tf.keras.datasets.mnist
# load_data() is unpacked as a (train, test) pair of (images, labels) tuples.
(x_train, y_train),(x_test, y_test) = mnist.load_data()
However i'd like to do this with my own folder of images.
mnist.load_data() returns two tuples containing the images and the labels as uint8 arrays. You can build the same arrays by loading the images from your folders: use a module such as PIL.Image to load X, and take y from the set of labels given by the folder names.
PIL.Image use example:
from PIL import Image
import glob
# Visit every .jpg file in the current working directory.
for infile in glob.glob("*.jpg"):
# Each pass rebinds `im`; this minimal example does not accumulate the images.
im = Image.open(infile)
To split train/test you can use sklearn.model_selection.train_test_split:
# Hold out 33% of the samples (test_size=0.33) as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
Suppose your train or test images are in folder PetData each class in separate folder as Dog and Cat. You can use ImageDataGenerator to prepare your train/test data as below:
from keras import layers
from keras import models
# Placeholder network: add your layers and compile it before fitting.
model = models.Sequential()
# ... define your model here ...

# Stream the images straight from their class directories.
from keras.preprocessing.image import ImageDataGenerator

# Expected layout on disk:
#   PetData/Dog/ : dog images
#   PetData/Cat/ : cat images
train_dir = "PetData/"

# Multiply every pixel value by 1/255 as the images are read.
train_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(150, 150),
    batch_size=20,
)

# Fit the model using train_generator.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=100,
    epochs=30,
)
Hope this helps!
If you want to import images from a folder on your computer, you can load the images one by one from the folder and insert them into a list.
Your folder format is as you have shown:
PetData
|
Dog - images
|
Cat - images
Assume path is a variable storing the address of PetData folder. We will use OpenCV to import images but you can use other libraries as well.
data = []
label = []
Files = ['Dog', 'Cat']
# Each class gets the integer label equal to its position in Files, so adding
# a third class only requires extending the list.
for label_val, files in enumerate(Files):
    # Images are expected under <path>/<class name>/images.
    cpath = os.path.join(path, files, 'images')
    for img in os.listdir(cpath):
        image_array = cv2.imread(os.path.join(cpath, img), cv2.IMREAD_COLOR)
        # cv2.imread returns None instead of raising when a file cannot be
        # decoded; skip such entries so data and label stay aligned and
        # contain only real images.
        if image_array is None:
            continue
        data.append(image_array)
        label.append(label_val)
Convert the list to a numpy array.
# Turn the Python lists into numpy arrays.
# NOTE(review): this presumably assumes all images share one size; otherwise
# the result is not a regular N-dimensional array - confirm or resize first.
data = np.asarray(data)
label = np.asarray(label)
After importing the images you can use train_test_split to split the data for training and testing.
# Hold out 33% of the images for testing; random_state fixes the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.33, random_state=42)

Validation set with TensorFlow Dataset

From Train and evaluate with Keras:
The argument validation_split (generating a holdout set from the training data) is not supported when training from Dataset objects, since this feature requires the ability to index the samples of the datasets, which is not possible in general with the Dataset API.
Is there a workaround? How can I still use a validation set with TF datasets?
No, you can't use validation_split (as the documentation clearly describes), but you can pass validation_data instead and create the validation Dataset "manually".
You can see an example in the same tensorflow tutorial:
# Training pipeline: one element per sample, shuffled, then grouped in 64s.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024)
train_dataset = train_dataset.batch(64)

# Validation pipeline: batched only, no shuffling.
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(64)

# Pass the held-out Dataset explicitly through validation_data.
model.fit(train_dataset, epochs=3, validation_data=val_dataset)
You could create those two datasets from numpy arrays ((x_train, y_train) and (x_val, y_val)) using simple slicing as shown there:
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
# Reserve the last 10,000 training samples for validation ...
x_val, y_val = x_train[-10000:], y_train[-10000:]
# ... and keep everything before them for training.
x_train, y_train = x_train[:-10000], y_train[:-10000]
There are also other ways to create tf.data.Dataset objects, see tf.data.Dataset documentation and related tutorials/notebooks.

Getting gradient of a Keras model output w.r.t input, but with the last layer being an SVM

I have a CNN model built in Keras. I then took out its last layer as a feature and retrained an SVM with it.
Is it possible to now find the gradient of the SVMs output wrt the CNN model's input?
I know of this method (Getting gradient of model output w.r.t weights using Keras) and am able to use it to get the gradient wrt input for the layer i am pulling the features out of. I can also get the numerical gradient of the SVM wrt to its input, albeit at the moment its a bit of a mess. Would appreciate some input here as well actually.
But now I need to somehow combine these two to get the gradient of the SVM to the input of the entire CNN model.
"""
Main CNN script
"""
# Imports ##
# general
import matplotlib.pyplot as plt
import numpy as np
# ML libraries
from tensorflow.keras.datasets import mnist
# ML utilities
from tensorflow.keras.utils import to_categorical
# Python scripts used
import train_CNN
import load_CNN
import train_subSVMs
import load_subSVMs
import train_finalSVM
import load_finalSVM
import joblib
def save_array(array, name):
    """Write ``array`` to the file ``<name>.pkl`` with joblib.

    Args:
        array: Object to serialise.
        name: Target path without the ``.pkl`` suffix.
    """
    target = name + '.pkl'
    # compress=3 is joblib's compression level for the written file.
    joblib.dump(array, target, compress=3)
def load_array(array, name):
    """Load and return the object stored in ``<name>.pkl``.

    This mirrors ``save_array``, which writes to ``name + '.pkl'``. The
    previous body called ``joblib.load(array, name)``, which treated
    ``array`` as the filename and ``name`` as joblib's ``mmap_mode``
    argument, so it never read the file ``save_array`` produced.

    Args:
        array: Unused; kept so existing ``load_array(array, name)`` calls
            keep working.
        name: Path of the saved file without the ``.pkl`` suffix.

    Returns:
        The deserialised object.
    """
    # NOTE(review): joblib.load unpickles its input; only use it on files
    # this project wrote itself.
    return joblib.load(name + '.pkl')
def show_data_example(i, dataset):
    """Plot sample ``i`` of ``dataset`` with a grayscale colormap.

    Call it once per image you want to look at.

    Args:
        i: Index of the sample to display.
        dataset: Indexable collection of image arrays.
    """
    print('\nExample Image: %s from selected dataset' %i)
    # squeeze drops the length-1 axes introduced when the data was reshaped.
    sample = np.squeeze(dataset[i])
    gray = plt.get_cmap('gray')
    plt.imshow(sample, cmap=gray)
    plt.show()
def load_and_encode(target_shape):
    """Load MNIST, one-hot encode the labels and reshape the images.

    Pixel values are divided by 255.0 and every image is reshaped to the
    first three entries of ``target_shape``. One training image is displayed
    as a sanity check.

    Args:
        target_shape: Indexable with at least three ints, used as the
            per-image shape, e.g. ``(28, 28, 1)``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.
    """
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    print('Loaded Mnist dataset')
    print('Train: X=%s, y=%s' % (X_train.shape, y_train.shape))
    print('Test: X=%s, y=%s' % (X_test.shape, y_test.shape))

    # One-hot encode the targets.
    y_train, y_test = to_categorical(y_train), to_categorical(y_test)

    # Normalise the pixel values (X/255 -> [0, 1]).
    X_train, X_test = X_train / 255.0, X_test / 255.0

    # (m x 28 x 28) -> (m x rows x cols x channels) for convolution layers.
    rows, cols, channels = target_shape[0], target_shape[1], target_shape[2]
    X_train = X_train.reshape(X_train.shape[0], rows, cols, channels)
    X_test = X_test.reshape(X_test.shape[0], rows, cols, channels)

    # Show an arbitrary example image from the training set.
    show_data_example(12, X_train)
    return X_train, y_train, X_test, y_test
# Per-image shape handed to load_and_encode: 28 x 28 with one channel.
image_shape = (28,28,1)
# load and encode mnist data
X_train, y_train, X_test, y_test = load_and_encode(image_shape)
# hyper-parameters
learning_rate = 0.1
momentum = 0.9
dropout = 0.5
batch_size = 128
epochs = 50
decay = 1e-6
number_of_classes = 10
# store required data into a packet to send to various imports
# (the position of each entry is the contract with the project modules below)
packet = [learning_rate, momentum, dropout, batch_size, epochs, decay,
number_of_classes, image_shape,
X_train, y_train, X_test, y_test]
# Just the four data arrays, for the SVM helper modules.
data = [X_train, y_train, X_test, y_test]
# Alternative to loading: train the CNN from scratch and save it.
#CNN_model = train_CNN.train_model(packet, save_model = 'True')
CNN_model = load_CNN.load_model(packet) # keras sequential model
# Alternative to loading: train the three sub-SVMs on CNN features.
#subSVM1, subSVM2, subSVM3, features = train_subSVMs.train(CNN_model, data, c=0.1, save_model = 'True', get_accuracies= 'True')
# NOTE(review): get_accuracies is passed as the string 'False', which is
# truthy in Python - confirm how load_subSVMs.load interprets it.
subSVM1, subSVM2, subSVM3, features = load_subSVMs.load(CNN_model, data, c=0.1, get_accuracies='False')
subSVMs = [subSVM1, subSVM2, subSVM3]
# `features` is unpacked into train/test pairs for three feature sets.
feature1_train, feature1_test,\
feature2_train, feature2_test,\
feature3_train, feature3_test = features
final_SVM = joblib.load('saved_finalSVM.pkl') # sklearn svm trained from features
# Index of the training image inspected below.
NUMBER = 48
plt.imshow(np.squeeze(X_train[NUMBER,:,:,:]), cmap=plt.get_cmap('binary'))
# gradients of features wrt to input
import tensorflow.keras.backend as K
# NOTE(review): K.gradients builds a symbolic gradient and presumably needs
# graph (non-eager) execution - confirm against the TensorFlow version in use.
gradients = K.gradients(CNN_model.get_layer(name='feature1').output, CNN_model.input) # K.gradients(y,x) for dy/dx
# Callable that evaluates those gradients for a concrete input batch.
f = K.function([CNN_model.input], gradients)
# Add a leading batch axis so the single image matches the model input rank.
x = np.expand_dims(X_train[NUMBER,:,:,:],axis=0)
# `a` holds d(feature1)/d(input) evaluated at x.
a=f([x])

Improving the accuracy of image classification model

My aim is to build an image classification model for flowers. The data RAR file consists of a folder named train data which consists of about 16000 images labelled from 0- 16000. Similarly there is a folder for test data also.
Apart from this there are two csv workbooks. The first csv workbook consists of two attributes - label & flower class. There are 104 labels & flower classes. The second workbook also consists of two attributes - id & flower class. This csv is the link between the train images folder & flower classes. ID is the linking attribute. I.e for eg assume that image labelled 10 in train images folder is the image of a sunflower. Hence in the csv workbook the flower class entry corresponding to id = 10 is a sunflower.
For eg assume that image labelled 10 in train data folder is a sunflower. Hence in the (second) workbook the flower class entry corresponding to id =10 is a sunflower.
This is my code
# Import relavant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten
from sklearn.model_selection import train_test_split
from PIL import Image
# Load the csv files
# Workbook no.1: label <-> flower class lookup table
label_csv = pd.read_csv('/content/flowers_label.csv')
# Workbook no.2: image id -> flower class
train = pd.read_csv('/content/flowers_idx.csv')
# Sort the train csv id-wise from 0 - 16464.
# sort_values returns a new DataFrame rather than sorting in place, so the
# result has to be assigned back; otherwise the sort is silently discarded.
train = train.sort_values('id')
# Creating inputs and targets
X = []  # images
y = []  # labels
base = "/content/flower_tpu/flower_tpu/flowers_google/flowers_google//"
row = 0

# Build the flower-class -> label lookup once instead of filtering label_csv
# for every image. setdefault keeps the first label seen for a class, which
# matches the previous `.values[0]` lookup.
class_to_label = {}
for cls_name, cls_label in zip(label_csv['flower_class'], label_csv['label']):
    class_to_label.setdefault(cls_name, cls_label)

for flower in train.itertuples(index=False):
    # create flower path
    path = f"{base}{flower.id}.jpeg"
    # Load the image and convert it to numpy; the context manager closes the
    # file handle as soon as the pixel data has been copied out.
    with Image.open(path) as img:
        X.append(np.array(img))
    # A class missing from label_csv raises KeyError here.
    y.append(class_to_label[flower.flower_cls])
# Train / validation split: 20% held out, fixed seed.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# The model: flatten the image, two equal-width hidden layers, softmax output.
output_size = 104
hidden_layer_size = 150
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(224, 224, 3)))
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(output_size, activation='softmax'))

# Converting all data into ndarrays
X_train, y_train = np.asarray(X_train), np.asarray(y_train)
X_validation, y_validation = np.asarray(X_validation), np.asarray(y_validation)

# Compilation: integer class labels, hence the sparse loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fitting
model.fit(
    X_train,
    y_train,
    epochs=3,
    validation_data=(X_validation, y_validation),
    validation_steps=10,
    verbose=2,
)
The code runs, but the train and validation accuracy is as poor as 6%. :/
How can I improve this code?

Categories