Using spaCy, I convert my training sets into matrices containing the individual word vectors. This is done in the encode_sentences() function. Furthermore, I encode the corresponding labels using the label_encoding() function. These data are then used as training data for my model. However, when I predict a single sentence to get its label, I get a NumPy array as output. How can I turn that output into the correct label?
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import sqlite3
from sklearn.preprocessing import LabelEncoder
import spacy
from sklearn.model_selection import train_test_split
nlp = spacy.load('en_core_web_lg')
embedding_dim = nlp.vocab.vectors_length
def read_database(path):
    """Load the training patterns and their intents from a SQLite database.

    Args:
        path: Filesystem path of the SQLite database. The query joins an
            ``intents`` table (``id``, ``intent``) with a ``patterns``
            table (``intentid``, ``pattern``).

    Returns:
        A tuple ``(sentences, labels, intents)``. ``sentences`` and
        ``labels`` are parallel lists with one entry per joined row;
        ``intents`` lists every distinct intent once, in order of first
        appearance.
    """
    query = ('select intents.intent, patterns.pattern '
             'from intents, patterns where intents.id = patterns.intentid')
    connection = sqlite3.connect(path)
    try:
        db_rows = pd.read_sql(query, connection)
    finally:
        # Release the database handle even when the query fails.
        connection.close()
    labels = db_rows["intent"].tolist()
    sentences = db_rows["pattern"].tolist()
    # dict.fromkeys de-duplicates in O(n) while keeping first-seen order.
    intents = list(dict.fromkeys(labels))
    return sentences, labels, intents
def label_encoding(labels):
    """Integer-encode ``labels`` with a freshly fitted ``LabelEncoder``.

    Prints the number of entries in ``labels`` and the shape of the encoded
    array, then returns the encoded array. The encoder itself is local to
    this function and is not returned.
    """
    print('Number of labels :-', len(labels))
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(labels)
    print('Length of y :- ', encoded.shape)
    return encoded
def encode_sentences(sentences):
    """Encode sentences as a matrix of spaCy document vectors.

    Args:
        sentences: A sequence of sentence strings. A single ``str`` is also
            accepted and treated as one sentence; without this guard a bare
            string would be iterated character by character.

    Returns:
        A NumPy array of shape ``(n_sentences, embedding_dim)`` whose row
        ``i`` is ``nlp(sentences[i]).vector``.
    """
    if isinstance(sentences, str):
        sentences = [sentences]
    X = np.zeros((len(sentences), embedding_dim))
    for idx, sentence in enumerate(sentences):
        # The document's .vector attribute fills the corresponding row of X
        X[idx, :] = nlp(sentence).vector
    return X
# Load the raw patterns/intents, then turn them into model-ready arrays.
sentences_train, labels_train, all_intents = read_database('./database_x.db')
sentences_train = encode_sentences(sentences_train)
labels_train = label_encoding(labels_train)
x_train, x_test, y_train, y_test = train_test_split(sentences_train, labels_train, test_size=0.2)
# Small dense classifier with one softmax output unit per distinct intent.
model = keras.Sequential([keras.layers.Dense(16, activation='relu'),
                          keras.layers.Dense(16, activation='relu'),
                          keras.layers.Dense(len(all_intents), activation='softmax')])
# Sparse loss because the labels are integer class ids, not one-hot vectors.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01),
              loss=keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=16, epochs=100)
# encode_sentences() iterates over its argument, so a single sentence is
# wrapped in a list; a bare string would be encoded one character at a time.
prediction = model.predict(encode_sentences(["how can i test rf heating"]))
print("\n\n\n")
# `prediction` is the raw softmax output: one row per input sentence.
print(prediction)
The argmax of the predictions gives you the index of the best label candidate.
# `prediction` is the array returned by model.predict(); take the index of
# the largest probability along the last (class) axis.
pred_labels_idx = np.argmax(prediction, axis=-1)
The best practice is to make the LabelEncoder le accessible from outside of the function scope so you can inverse_transform the predictions to actually labels:
pred_labels = le.inverse_transform(pred_labels_idx)
Perhaps you can modify your label_encoding() function so that it also returns the fitted LabelEncoder (to serve as a label decoder) when it encodes the labels of the training data.
Related
I'm very new to programming and machine learning but I've been trying to create a prediction model to tag product reviews. I found the following model:
import numpy as np
import pandas as pd
# the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
# function to split the data for cross-validation
from sklearn.model_selection import train_test_split
# function for transforming documents into counts
from sklearn.feature_extraction.text import CountVectorizer
# function for encoding categories
from sklearn.preprocessing import LabelEncoder
# Training data; the code below reads its 'texto' and 'codigo' columns
dataset = pd.read_csv('dataset.csv')
def normalize_text(s):
    """Lower-case *s* and strip punctuation that is not word-internal.

    A non-word character directly after or directly before a whitespace
    character is replaced (together with that whitespace) by a single space,
    so hyphens and apostrophes inside words survive. Runs of whitespace are
    then collapsed to one space.

    Args:
        s: The text to normalize.

    Returns:
        The normalized string.
    """
    import re  # local import: the surrounding snippet never imports `re`

    s = s.lower()
    # remove punctuation that is not word-internal (e.g., hyphens, apostrophes)
    s = re.sub(r'\s\W', ' ', s)
    s = re.sub(r'\W\s', ' ', s)
    # make sure we didn't introduce any double spaces
    s = re.sub(r'\s+', ' ', s)
    return s
# Store a normalized copy of the 'texto' column in a new 'TEXT' column
dataset['TEXT'] = [normalize_text(s) for s in dataset['texto']]
# pull the data into vectors
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(dataset['TEXT'])
# Encode the 'codigo' category column as integer class ids
encoder = LabelEncoder()
y = encoder.fit_transform(dataset['codigo'])
# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# Fit a multinomial Naive Bayes classifier and predict the held-out 20%
nb = MultinomialNB()
nb.fit(x_train, y_train)
y_predicted = nb.predict(x_test)
So far so good. But then, I tried to use that trained model to predict another set of data like this:
#new data
test = pd.read_csv('testset.csv')
test['TEXT'] = [normalize_text(s) for s in test['respostas']]
# pull the data into vectors
# NOTE(review): this creates and fits a second CountVectorizer on the new
# data, so its columns follow the new data's vocabulary rather than the one
# `nb` was trained on -- the likely source of the reported "dimension
# mismatch". Reusing the already-fitted vectorizer with
# `vectorizer.transform(test['TEXT'])` should be confirmed as the fix.
vectorizer = CountVectorizer()
classes = vectorizer.fit_transform(test['TEXT'])
classificacao = nb.predict(classes)
However, I got a "ValueError: dimension mismatch"
I'm not sure how to do this second step, which is using the model to predict the category of a fresh data set.
Thanks in advance for your assistance.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
data = pd.read_csv('data/TrainingData_unsubscribe.csv')
# factorize() returns (codes, uniques); [0] keeps the integer codes
data['labels'] = data['Category'].factorize()[0]
#vectorize the features
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', stop_words='english')
x_vectors = tfidf.fit_transform(data.msgContent)
#split the data
x = x_vectors
y = data.labels
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)
# The split parts are converted from sparse matrices to dense arrays
x_train = x_train.toarray()
x_train.shape
x.shape
x_test = x_test.toarray()
# Dense copy of ALL rows (train and test); used for the final predictions
preds = x_vectors.toarray()
# early-stopping callback: stop once 'accuracy' has not improved for 10 epochs
stop = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)
#create the model
model = tf.keras.Sequential([
tf.keras.layers.Dense(100, activation='relu'),
tf.keras.layers.Dense(100, activation='relu'),
tf.keras.layers.Dense(100, activation='relu'),
tf.keras.layers.Dense(len(pd.unique(y)), activation='softmax')
])
#compile the model
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
optimizer = tf.keras.optimizers.Adam(),
metrics=['accuracy'])
#fit the model
model.fit(x_train,y_train, epochs=500, verbose=0, callbacks=[stop])
# evaluation
print('\nEvaluation: ')
model.evaluate(x_test,y_test)
predictions = model.predict(preds)
len(pd.unique(y))
# argmax over the class axis gives the predicted integer label per row
data["Prediction"] = predictions.argmax(axis=1)
output = data.drop(["labels"], axis=1)
# One row per distinct (Category, labels) pair: the code-to-name lookup table
category_ids = data[["Category", "labels"]].drop_duplicates()
# NOTE(review): the statement below is incomplete in the source; the
# surrounding text asks how to map the integer predictions back to Category
# names using `category_ids` (presumably via Series.map -- to be confirmed).
output['Prediction'] =
Originally converted string labels to numeric ones using factorize():
data['labels'] = data['Category'].factorize()[0]
Now I'm trying to convert the labels back to their initial string variables. I've created a DF with the mapped values
Category
labels
HardBounce
0
SoftBounce
1
etc...
Is it possible to map a DataFrame column using another DataFrame as the reference for the mapping?
I've been unable to find any docs that show how to do this.
I am using a multi-class U-Net for segmentation, and I encounter a ValueError while training my model. The model distinguishes 4 classes.
Code for training model:
from simple_multi_unet_model import multi_unet_model #Uses softmax
from tensorflow.keras.utils import normalize
import os
import glob
import cv2
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Target size that every image and mask is resized to
SIZE_X = 128
SIZE_Y = 128
n_classes = 4  # Number of classes for segmentation

# Training images: every regular *.png file in the image directory, in sorted
# order, read as grayscale (flag 0), resized, and stacked into one array.
directory_path = '/home/Documents/Multiclass/images/'
list_of_files = sorted(
    p for p in glob.glob(directory_path + '*.png', recursive=True)
    if os.path.isfile(p)
)
train_images = np.array([
    cv2.resize(cv2.imread(img_path, 0), (SIZE_Y, SIZE_X))
    for img_path in list_of_files
])

# Masks/labels: same procedure, but resized with nearest-neighbour
# interpolation, otherwise ground truth changes due to interpolation.
labels_path = '/home/Documents/Multiclass/labels/'
list_of_labels = sorted(
    p for p in glob.glob(labels_path + '*.png', recursive=True)
    if os.path.isfile(p)
)
train_masks = np.array([
    cv2.resize(cv2.imread(mask_path, 0), (SIZE_Y, SIZE_X),
               interpolation=cv2.INTER_NEAREST)
    for mask_path in list_of_labels
])
###############################################
#Encode labels... but multi dim array so need to flatten, encode and reshape
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
n, h, w = train_masks.shape
train_masks_reshaped = train_masks.reshape(-1,1)
train_masks_reshaped_encoded = labelencoder.fit_transform(train_masks_reshaped)
# Restore the (n, h, w) layout after encoding the flattened pixel values
train_masks_encoded_original_shape = train_masks_reshaped_encoded.reshape(n, h, w)
np.unique(train_masks_encoded_original_shape)
#################################################
# Add a trailing channel axis to images and masks
train_images = np.expand_dims(train_images, axis=3)
train_images = normalize(train_images, axis=1)
train_masks_input = np.expand_dims(train_masks_encoded_original_shape, axis=3)
#Create a subset of data for quick testing
#Picking 10% for testing and remaining for training
from sklearn.model_selection import train_test_split
x_train, X_test, y_train, y_test = train_test_split(train_images, train_masks_input, test_size = 0.10, random_state = 0)
print("Class values in the dataset are ... ", np.unique(y_train)) # 0 is the background/few unlabeled
# One-hot encode the masks and reshape to (samples, dim1, dim2, n_classes)
from tensorflow.keras.utils import to_categorical
train_masks_cat = to_categorical(y_train, num_classes=n_classes)
y_train_cat = train_masks_cat.reshape((y_train.shape[0], y_train.shape[1], y_train.shape[2], n_classes))
test_masks_cat = to_categorical(y_test, num_classes=n_classes)
y_test_cat = test_masks_cat.reshape((y_test.shape[0], y_test.shape[1], y_test.shape[2], n_classes))
###############################################################
# 'balanced' class weights computed over every encoded mask pixel.
# NOTE(review): the result is an array, and it is later passed to
# model.fit(class_weight=...); the traceback quoted with this code fails on
# `if class_weight:` -- presumably Keras expects a dict here, confirm.
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced',
np.unique(train_masks_reshaped_encoded),
train_masks_reshaped_encoded)
print("Class weights are...:", class_weights)
# Model input geometry is read from axes 1-3 of the training images
IMG_HEIGHT = x_train.shape[1]
IMG_WIDTH = x_train.shape[2]
IMG_CHANNELS = x_train.shape[3]
def get_model():
    """Build the multi-class U-Net for the current image geometry.

    Returns whatever ``multi_unet_model`` returns when called with the
    module-level ``n_classes``, ``IMG_HEIGHT``, ``IMG_WIDTH`` and
    ``IMG_CHANNELS``.
    """
    unet = multi_unet_model(
        n_classes=n_classes,
        IMG_HEIGHT=IMG_HEIGHT,
        IMG_WIDTH=IMG_WIDTH,
        IMG_CHANNELS=IMG_CHANNELS,
    )
    return unet
model = get_model()
# categorical_crossentropy matches the one-hot encoded masks (y_train_cat)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
#If starting with pre-trained weights.
#model.load_weights('???.hdf5')
# NOTE(review): `class_weights` is the array returned by
# compute_class_weight; the reported ValueError is raised by Keras on
# `if class_weight:` -- presumably a {class_index: weight} dict is expected.
history = model.fit(x_train, y_train_cat,
batch_size = 16,
verbose=1,
epochs=100,
validation_data=(X_test, y_test_cat),
class_weight=class_weights,
shuffle=False)
I used following approach to define class weights:
from sklearn.utils import class_weight
# 'balanced' weights for the classes found in the flattened, encoded masks
class_weights = class_weight.compute_class_weight('balanced',
np.unique(train_masks_reshaped_encoded),
train_masks_reshaped_encoded)
print("Class weights are...:", class_weights)
The result is class_weights : 0.276965 ,13.5112 ,5.80929,6.97915.
I am encountering ValueError when I train my model. How can I possibly resolve it? Please suggest a better approach of using class weights if you think my approach is not viable.
File "/home/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/data_adapter.py", line 1185, in _configure_dataset_and_inferred_steps
if class_weight:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
I had the same problem but solved it with this!!!
You have to zip it together as a dictionary
Try The below code:
from sklearn.utils import class_weight
# Keras evaluates `if class_weight:` internally, which is ambiguous for a
# NumPy array, so the weights are handed over as {class_index: weight}.
classes = np.unique(train_masks_reshaped_encoded)
# `classes` and `y` are passed by keyword: current scikit-learn releases
# declare them keyword-only, and older releases accept the keywords as well.
weights = class_weight.compute_class_weight('balanced',
                                            classes=classes,
                                            y=train_masks_reshaped_encoded)
class_weights = dict(zip(classes, weights))
My understanding of this error message is that numpy does not know whether to evaluate an array as True if any element is true, or as True only if all elements are true. Hence, a ValueError is returned because the boolean evaluation is ambiguous in this regard.
Therefore, when evaluating an array, you should use a.any() or a.all(), as indicated in the error message attached.
The error is likely to be occurring from somewhere else in your code (not shared?) when you try to evaluate the class weights in a boolean context.
The image of the dataset
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
# Loading the data set using pandas as data frame format
import pandas as pd
df = pd.read_csv(r"E:\50_Startups.csv")
df.drop(['State'],axis = 1, inplace = True)
# Scale every remaining column to [0, 1]
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
df.iloc[:,:] = mm.fit_transform(df.iloc[:,:])
info = df.describe()
# The last column is the regression target; all others are features
x = df.iloc[:,:-1].values
y = df.iloc[:,-1].values
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split( x,y, test_size=0.2, random_state=42)
# Initializing the model
model = Sequential()
model.add(Dense(40,input_dim =3,activation="relu",kernel_initializer='he_normal'))
model.add(Dense(30,activation="relu"))
model.add(Dense(1))
# NOTE(review): "accuracy" is kept because it is the subject of the question;
# the target is continuous, so a regression metric is presumably what is
# wanted here -- confirm.
model.compile(loss="mean_squared_error",optimizer="adam",metrics=["accuracy"])
# fitting model on train data
model.fit(x=x_train,y=y_train,epochs=150, batch_size=32,verbose=1)
# Evaluating the model on test data
eval_score_test = model.evaluate(x_test,y_test,verbose = 1)
I am getting zero accuracy.
The problem is that accuracy is a metric for discrete values (classification).
you should use:
r2 score
mape
smape
instead.
e.g:
model.compile(loss="mean_squared_error",optimizer="adam",metrics=["mean_absolute_percentage_error"])
Adding to the answer of @GuintherKovalski: accuracy is not meant for regression, but if you still want to use it, you can combine it with a threshold using the following steps:
Set a threshold such that if the absolute difference in the predicted value and the actual value is less than equal to the threshold then you consider that value as correct, otherwise false.
Ex -> predicted values = [0.3, 0.7, 0.8, 0.2], original values = [0.2, 0.8, 0.5, 0.4].
Now abs diff -> [0.1, 0.1, 0.3, 0.2] and let's take a threshold of 0.2. So with this threshold the correct -> [1, 1, 0, 1] and your accuracy will be correct.sum()/len(correct) and that is 3/4 -> 0.75.
This could be implemented in TensorFlow like this
import numpy as np
import tensorflow as tf
from sklearn.datasets import make_regression
# Synthetic regression problem; data[0] and data[1] are used below as the
# inputs and targets for model.fit
data = make_regression(10000)
# A single linear unit over 100 input features
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(100,))])
def custom_metric(a, b):
    """Thresholded "accuracy" for regression.

    A prediction counts as correct when its absolute error is less than or
    equal to ``threshold``.

    Args:
        a: Target values (Keras passes ``y_true`` as the first argument).
        b: Predicted values (``y_pred``).

    Returns:
        A scalar tensor: the fraction of predictions that are within the
        threshold.
    """
    threshold = 1  # Choose accordingly
    abs_diff = tf.abs(b - a)
    # Correct means "close enough": error <= threshold (not >=)
    correct = abs_diff <= threshold
    correct = tf.cast(correct, dtype=tf.float16)
    res = tf.math.reduce_mean(correct)
    return res
# Optimizer 'adam', loss 'mae'; custom_metric is reported alongside the loss
model.compile('adam', 'mae', metrics=[custom_metric])
model.fit(data[0], data[1], epochs=30, batch_size=32)
Just want to say Thank you to everyone who took their precious time to help me. I am posting this code as this worked for me. I hope it helps everyone who is stuck somewhere looking for answers. I got this code after consulting with my friend.
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
import pandas as pd
from sklearn.model_selection import train_test_split
# Loading the data set using pandas as data frame format
startups = pd.read_csv(r"E:\0Assignments\DL_assign\50_Startups.csv")
startups = startups.drop("State", axis =1)
# 80/20 split; no random_state is given, so the split differs between runs
train, test = train_test_split(startups, test_size = 0.2)
# The first three columns are the features, the Profit column is the target
x_train = train.iloc[:,0:3].values.astype("float32")
x_test = test.iloc[:,0:3].values.astype("float32")
y_train = train.Profit.values.astype("float32")
y_test = test.Profit.values.astype("float32")
def norm_func(i):
    """Min-max scale *i* using its own overall minimum and maximum.

    Returns ``(i - i.min()) / (i.max() - i.min())``.
    """
    lowest = i.min()
    span = i.max() - lowest
    return (i - lowest) / span
# Each array is scaled independently with its own min and max
x_train = norm_func(x_train)
x_test = norm_func(x_test)
y_train = norm_func(y_train)
y_test = norm_func(y_test)
# one hot encoding outputs for both train and test data sets
# NOTE(review): the targets are continuous values scaled to [0, 1];
# to_categorical presumably truncates them to integer class ids first, which
# would leave only classes 0 and 1 -- confirm this is intended.
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)
# Storing the number of classes into the variable num_of_classes
num_of_classes = y_test.shape[1]
x_train.shape
y_train.shape
x_test.shape
y_test.shape
# User-defined factory for the network that is trained below
def design_mlp():
    """Build and compile the dense network.

    The stack is 500 (relu) -> 200 -> 100 -> 50 (tanh) -> ``num_of_classes``
    (linear) units over 3 input features, compiled with mean squared error,
    the Adam optimizer and the accuracy metric.
    """
    layer_stack = [
        Dense(500, input_dim=3, activation="relu"),
        Dense(200, activation="tanh"),
        Dense(100, activation="tanh"),
        Dense(50, activation="tanh"),
        Dense(num_of_classes, activation="linear"),
    ]
    model = Sequential(layer_stack)
    model.compile(loss="mean_squared_error", optimizer="adam",
                  metrics=["accuracy"])
    return model
# building the MLP (dense layers only) returned by design_mlp()
model = design_mlp()
# fitting model on train data
model.fit(x=x_train,y=y_train,batch_size=100,epochs=10)
# Evaluating the model on test data
eval_score_test = model.evaluate(x_test,y_test,verbose = 1)
# index 1 of the evaluate() result is the accuracy metric (index 0 is the loss)
print ("Accuracy: %.3f%%" %(eval_score_test[1]*100))
# accuracy score on train data
eval_score_train = model.evaluate(x_train,y_train,verbose=0)
print ("Accuracy: %.3f%%" %(eval_score_train[1]*100))
I am working on a sample data set from a link below.
https://www.kaggle.com/enirtium/gender-voice/data
I am trying to open .csv file(maybe I am opening it wrongly) and trying to create fully connected neural layers. Then, I am trying to train them but unfortunately, I am getting input shape not fitting problem.
"ValueError: Error when checking input: expected dense_1_input to have shape (None, 2800) but got array with shape (3168, 1)"
My codes like these:
import csv
import numpy
import string
from keras.models import Sequential
from sklearn.model_selection import train_test_split
import numpy as np
from keras import models
from keras import layers
path = r'/Users/username/Desktop/voice.csv'
# One list per column of interest, filled row by row from the CSV
meanfreq = []
sd = []
median = []
label = []
# newline='' is what the csv module documents for files handed to csv.reader
with open(path, 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)  # skip the header row
    for line in csv_reader:
        # csv.reader yields strings; convert so the model gets numeric input
        meanfreq.append(float(line[0]))
        sd.append(float(line[1]))
        median.append(float(line[2]))
        # Column 20 holds the label text: 1 for "female", 0 otherwise
        label.append(1 if line[20] == "female" else 0)
network = models.Sequential()
# NOTE(review): input_shape=(2800,) declares 2800 features per example,
# while the reported error shows the data arriving with shape (3168, 1);
# input_shape is presumably meant to be the feature count -- confirm.
network.add(layers.Dense(512, activation='relu', input_shape=(2800,)))
network.add(layers.Dense(1, activation='sigmoid'))
# NOTE(review): a single sigmoid output is paired with
# 'categorical_crossentropy'; verify the loss matches the 0/1 labels.
network.compile(optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=['accuracy'])
network.fit(meanfreq, label, epochs=5, batch_size=128)
# Evaluated on the same data that was used for fitting
scores = network.evaluate(meanfreq, label)
print("\n%s: %.2f%%" % (network.metrics_names[1], scores[1]*100))
I suppose that maybe I am not reading the .csv file correctly (it is read into plain Python lists), or there is some other problem. Unfortunately, I am new to neural networks and Python. I want to open this CSV file and use 70% of its data for training and 30% for testing.
Yes,
It works as these;
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# get data ready
data = pd.read_csv('voice.csv')
data.shape
# split out features and label
# every column except the last is a feature; the last column is the label
X = data.iloc[:, :-1].values
y = data.iloc[:, -1]
# map category to binary
y = np.where(y == 'male', 1, 0)
enc = OneHotEncoder()
# reshape y to be column vector
y_ = enc.fit_transform(y.reshape(-1, 1)).toarray()
# 80% of the rows go to training, the rest to testing
X_train, X_test, y_train, y_test = train_test_split(
X, y_, train_size=0.80, random_state=42)
# `models` and `layers` are not imported in this snippet; they are presumably
# the keras modules imported in the question's code
network = models.Sequential()
# input_shape is the number of features per example (20), not the row count
network.add(layers.Dense(512, activation='relu', input_shape=(20,)))
# two output units, one per one-hot encoded class
network.add(layers.Dense(2, activation='sigmoid'))
network.compile(optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=['accuracy'])
network.fit(X_train, y_train, epochs=100, batch_size=128)
network.evaluate(X_test, y_test)
Reading in the data seems to be fine.
I Imagine you have a data set that looks like:
mean_freq, label
.12 0
.45 1
And you want to train a classifier. Currently the model is expecting
a training example to have 2800 features. input shape=(2800,) but you only want 1 feature: the mean_freq
The mistake here is that you are trying to tell Keras how many training examples to use while declaring the model. You don't do that here; you do that later when you're fitting the model.
So the input_shape to keras's Dense Layer should be (1, ) for the single feature. If you're going to use mean and median freq then you would want two features (2, ) and so on.
# note change from 2800 to 1: input_shape is the number of features per
# example, not the number of training examples
network.add(layers.Dense(512, activation='relu', input_shape=(1,)))
And you can split your training and test sets in multiple ways. My suggestion is to do something like this:
train_size = 2800
# `meanfreq` and `label` are the lists filled while reading the CSV
X_train = meanfreq[:train_size]
y_train = label[:train_size]
X_test = meanfreq[train_size:]
# Test labels must come from the same tail slice as the test features
y_test = label[train_size:]
Then fit the model with the training set and score with the test set.
# Fit on the training slice, then score on the held-out slice
network.fit(X_train, y_train, epochs=5, batch_size=128)
scores = network.evaluate(X_test, y_test)
Edit to reflect comments:
Well, if your training data has 20 features, then
you tell keras that with:
# note change from 2800 to 20: one input per feature column
network.add(layers.Dense(512, activation='relu', input_shape=(20,)))
You have to do the work necessary to get the data into the shape you need for training and testing, but the template above is how you would fit and evaluate the model.
I would also note that there are better ways read in csv data if you're going to do modeling (as you are). Look at using a pandas dataframe.
Also better (more standard ways) of creating train and test split: look into sklearn's train_test_split
Edit 2: A quick model of the voice data
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# The functional-API Model class is exposed by `keras.models` (plural)
from keras.models import Model
from keras.layers import Dense, Input
# get data ready
data = pd.read_csv('voice.csv')
data.shape
# split out features and label
# every column except the last is a feature; the last column is the label
X = data.iloc[:, :-1].values
y = data.iloc[:, -1]
# map category to binary
y = np.where(y == 'male', 1, 0)
enc = OneHotEncoder()
# reshape y to be column vector
y_ = enc.fit_transform(y.reshape(-1, 1)).toarray()
# 80% of the rows go to training, the rest to testing
X_train, X_test, y_train, y_test = train_test_split(
X, y_, train_size=0.80, random_state=42)
# model using keras functional style
# 20 input features -> Dense(128) with no activation given -> 2 sigmoid outputs
inp = Input(shape =(20, ))
dense = Dense(128)(inp)
out = Dense(2, activation='sigmoid')(dense)
model = Model(inputs=[inp], outputs=[out])
model.compile(loss='binary_crossentropy', optimizer='adam',
metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=128)
model.evaluate(X_test, y_test)