I'm trying to train a text classifier but keep getting a data cardinality error. I have checked previous posts to this same question, but I still can't work out where I am going wrong and will appreciate any suggestions. My code is as follows below:
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import LSTM, Dropout, Activation
def define_model(vocab_size, max_length):
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss= 'binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
return model
header = ['Text', 'Incident']
dataset ='/Data/CSV files/Preprocessed_Tweets&Label_Dataset.csv'
df = pd.read_csv(dataset, names=header)
features = ['Text']
X = df.loc[:, features]
y = df.loc[:, ['Incident']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size= .80)
print(X_train.shape)
print(y_train.shape)
vocab_size = 50000
max_length = 50
tokenizer = Tokenizer(num_words= vocab_size)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
data = pad_sequences(sequences, maxlen=50)
model = define_model(vocab_size, max_length)
model.fit(data, y_train, epochs=10, verbose=2)
The code seem to fail when I try to fit the model, however the shape of X_train and y_train is same before the data is passed to the network. Below is the output and error:
(39958, 1)
(39958, 1)
ValueError: Data cardinality is ambiguous:
x sizes: 1
y sizes: 39958
Make sure all arrays contain the same number of samples.
Related
I am encountering an issue for a school project.
I have to predict on a test set, based on textual data, the age and gender of a person. My training dataset has 4 features (ID, keywords, age, sex).
I created a neural network (please see the code below) but when fitting the latter, my loss values are extremely negative.
Could you please tell me how to alleviate this issue?
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
#Load the datasets
chunk_train = pd.read_csv('/Users/romeoleon/Downloads/train.csv',chunksize=10**6)
data_train = pd.concat(chunk_train)
#Map the values for sex columns
data_train.sex = data_train.sex.map({'M':0,'F':1})
#Remove the rows with missing data
print('Missing rows represent {} percent of the dataframe'.format(data_train['keywords'].isna().sum()/len(data_train.keywords)*100))
#Drop the missing values
data_train.dropna(inplace=True)
#Plot the distribution of numerical variables
sns.histplot(data_train.age,bins=85)
plt.show()
sns.countplot(x='sex',data=data_train)
plt.show()
#Prepare the data to feed it to the NN
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
x_train, x_test, y_train, y_test = train_test_split(data_train['keywords'],data_train[["age","sex"]],test_size=0.2)
#Choose parameters
vocab_size = 1000
maxlen = 300
batch_size = 32
embedding_dims = 100
hidden_dims = 5
filters = 250
kernel_size = 3
epochs = 10
#Tokenize the words
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(x_train)
X_train = tokenizer.texts_to_matrix(x_train)
X_test = tokenizer.texts_to_matrix(x_test)
#Pad sequencing : Ensure all sequences have the same length
from tensorflow.keras.preprocessing import sequence
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, MaxPooling1D
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras.layers import Conv1D, Flatten
#Create the model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dims, input_length=maxlen, trainable=True))
model.add(Dropout(0.5))
model.add(Conv1D(filters,
kernel_size,
padding='valid',
activation='relu'))
#model.add(MaxPooling1D(pool_size=4))
model.add(Flatten())
model.add(Dense(hidden_dims, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='sigmoid'))
# Compile neural model
model.compile(loss='binary_crossentropy', # Cross-entropy
optimizer='adam', # Root Mean Square Propagation
metrics=['accuracy']) # Accuracy performance metric
model.summary()
#Fit the model
model.fit(X_train, y_train,
batch_size=batch_size,
epochs=1,
validation_data=(X_test, y_test), verbose=1)
You can find below a screenshot of the structure of my training dataset:
When using 'binary_crossentropy' as the loss function, dense at the output end should have only 1 unit rather than 2. (1 unit have 2 states, which is 1 or 0)
Using this instead:
model.add(Dense(1, activation='sigmoid'))
I have written the following code for a neural network to perform regression on a dataset, but I am getting a ValueError. I have looked up to different answers and they suggested to use df = df.values to get a numpy array. I tried it but it still produced the same error. How to fix this?
CODE
from keras import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
#Define Features and Label
features = ['posted_by', 'under_construction', 'rera', 'bhk_no.', 'bhk_or_rk',
'square_ft', 'ready_to_move', 'resale', 'longitude',
'latitude']
X=train[features].values
y=train['target(price_in_lacs)'].values
#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 23, shuffle = True)
#Model
model = Sequential()
model.add(Dense(10, activation='relu', kernel_initializer='random_normal', input_dim = 10))
model.add(Dense(1, activation = 'relu', kernel_initializer='random_normal'))
#Compiling the neural network
model.compile(optimizer = Adam(learning_rate=0.1) ,loss='mean_squared_logarithmic_error', metrics =['mse'])
#Fitting the data to the training dataset
model.fit(X_train,y_train, batch_size=256, epochs=100, verbose=0)
ERROR
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).
I have data with about 3600 rows and 27 columns. In one of these columns is a label from 1 to 10 which I want to predict from the rest.
Model from scratch:
import tensorflow as tf
sess = tf.Session()
import keras
import pandas
import sklearn
import matplotlib
import pandas as pd
df = pd.read_csv('data.csv')
dataset = df.values
X = dataset[:,0:27]
Y = dataset[:, 8] ///I want column 8 to be my label column
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
X_scale = min_max_scaler.fit_transform(X)
from sklearn.model_selection import train_test_split
X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(X_scale, Y, test_size=0.3)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5)
from keras.models import Sequential
from keras.layers import Dense
model = Sequential([
Dense(32, activation='relu', input_shape=(27,)),
Dense(32, activation='relu'),
Dense(1, activation='softmax'),])
model.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'])
Ytest = keras.utils.to_categorical(Y_train,)
print('The one hot label is:', Y_train[5])
hist = model.fit(X_train, Ytest,
batch_size=32, epochs=20,
validation_data=(X_val, Y_val))
Error:
ValueError: Error when checking target: expected dense_84 to have shape (1,) but got array with shape (11,)
I'm at a complete loss at what's wrong here. Could use a nudge in the right direction.
Two things:
1 -
It looks like you forgot to one hot encode your Y_train, the error states your last layer is expecting a tensor of shape [batch_size, 11].
or
2 -
Your last Dense layer should have 11 nodes not 1
so I'm making a project where basically i have to predict whether or not a house price is above or below its median price and to do that, I'm using this dataset from Kaggle(https://drive.google.com/file/d/1GfvKA0qznNVknghV4botnNxyH-KvODOC/view). 1 means "Above Median" and 0 means "Below Median". I wrote this code to train a neural network and save it as a .h5 file:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
import h5py
df = pd.read_csv('housepricedata.csv')
dataset = df.values
X = dataset[:,0:10]
Y = dataset[:,10]
min_max_scaler = preprocessing.MinMaxScaler()
X_scale = min_max_scaler.fit_transform(X)
X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(X_scale, Y, test_size=0.3)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5)
model = Sequential([
Dense(32, activation='relu', input_shape=(10,)),
Dense(32, activation='relu'),
Dense(1, activation='sigmoid'),
])
model.compile(optimizer='sgd',
loss='binary_crossentropy',
metrics=['accuracy'])
hist = model.fit(X_train, Y_train,
batch_size=32, epochs=100,
validation_data=(X_val, Y_val))
model.save("house_price.h5")
After running it, it successfully saves the .h5 file to my directory. What I want to do now is use my trained model to make predictions on a new .csv file and determine whether or not each of those are above or below median price. This is an image of the csv file in VSCode that i want it to make predictions on:
csv file image As you can see, this file doesn't contain a 1(above median) or 0(below median) because that's what I want it to predict. This is the code I wrote to do that:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.models import load_model
import h5py
df = pd.read_csv('data.csv')
dataset = df.values
X = dataset[:,0:10]
Y = dataset[:,10]
min_max_scaler = preprocessing.MinMaxScaler()
X_scale = min_max_scaler.fit_transform(X)
X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(X_scale, Y, test_size=0.3)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5)
model = load_model("house_price.h5")
y_pred = model.predict(X_test)
print(y_pred)
It's output is [[0.00101464]] I have no clue what that is and why it's only returning one value even though the csv file has 4 rows. Does anyone know how I can fix that and be able to predict either a 1 or a 0 for each row in the csv file?
Thank You!
As much I understand what you want!
Let's Try ! This code work for me
import tensorflow
model = tensorflow.keras.models.load_model("house_price.h5")
y_pred=model.predict(X_test)
still you are not able to get visit following site
1:answer1
2:answer2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('C:\\Users\\acer\\Downloads\\housepricedata.csv')
dataset.head()
X=dataset.iloc[:,0:10]
y=dataset.iloc[:,10]
X.head()
from sklearn.preprocessing import StandardScaler
obj=StandardScaler()
X=obj.fit_transform(X)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split
(X,y,random_state=2020,test_size=0.25)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
classifier = Sequential()
# Adding the input layer and the first hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation =
'relu', input_dim = 10))
# classifier.add(Dropout(p = 0.1))
# Adding the second hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation
= 'relu'))
# classifier.add(Dropout(p = 0.1))
# Adding the output layer
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation
= 'sigmoid'))
# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics
= ['accuracy'])
classifier.fit(X_train, y_train, batch_size = 10, epochs = 100)
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)
print(y_pred)
classifier.save("house_price.h5")
import tensorflow
model = tensorflow.keras.models.load_model("house_price.h5")
y_pred=model.predict(X_test)
y_pred = (y_pred > 0.5)
print(y_pred)
Both y_pred produce same output for me
Here one thing you not y_pred not contain 0 and 1 because you use sigmoid function which determine predication in probability
so if(y_pred>0.5) it mean value is one
#True rep one
#false rep zero
#you can use replace function or map function of pandas to get convert true
into 1
I'm writing a code for doing a multiclass classification. I have custom datasets with 7 columns (6 features and 1 label), the training dataset has 2 types of label (1 and 2), and the testing dataset has 3 types of labels (1, 2, and 3). The aim of the model is to see how well the model predicting the label '3'.
As of now, I'm trying the MLP algorithm, the code is as follows:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from keras.models import load_model
from sklearn.externals import joblib
from joblib import dump, load
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
#from keras.layers import Dense, Embedding, LSTM, GRU
#from keras.layers.embeddings import Embedding
#Load the test dataset
df1 = pd.read_csv("/home/user/Desktop/FinalTestSet.csv")
test = df1
le = LabelEncoder()
test['Average_packets_per_flow'] = le.fit_transform(test['Average_packets_per_flow'])
test['Average_PktSize_per_flow'] = le.fit_transform(test['Average_PktSize_per_flow'])
test['Avg_pkts_per_sec'] = le.fit_transform(test['Avg_pkts_per_sec'])
test['Avg_bytes_per_sec'] = le.fit_transform(test['Avg_bytes_per_sec'])
test['N_pkts_per_flow'] = le.fit_transform(test['N_pkts_per_flow'])
test['N_pkts_size_per_flow'] = le.fit_transform(test['N_pkts_size_per_flow'])
#Select the x and y columns from dataset
xtest_Val = test.iloc[:,0:6].values
Ytest = test.iloc[:,6].values
#print Ytest
#MinMax Scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
Xtest = scaler.fit_transform(xtest_Val)
#print Xtest
#Load the train dataset
df2 = pd.read_csv("/home/user/Desktop/FinalTrainingSet.csv")
train = df2
le = LabelEncoder()
test['Average_packets_per_flow'] = le.fit_transform(test['Average_packets_per_flow'])
test['Average_PktSize_per_flow'] = le.fit_transform(test['Average_PktSize_per_flow'])
test['Avg_pkts_per_sec'] = le.fit_transform(test['Avg_pkts_per_sec'])
test['Avg_bytes_per_sec'] = le.fit_transform(test['Avg_bytes_per_sec'])
test['N_pkts_per_flow'] = le.fit_transform(test['N_pkts_per_flow'])
test['N_pkts_size_per_flow'] = le.fit_transform(test['N_pkts_size_per_flow'])
#Select the x and y columns from dataset
xtrain_Val = train.iloc[:,0:6].values
Ytrain = train.iloc[:,6].values
#print Ytrain
#MinMax Scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
# Fit the model
Xtrain = scaler.fit_transform(xtrain_Val)
#Reshape data for CNN
Xtrain = Xtrain.reshape((Xtrain.shape[0], 1, 6, 1))
print(Xtrain)
#Xtest = Xtest.reshape((Xtest.shape[0], 1, 6, 1))
#print Xtrain.shape
max_length=70
EMBEDDING_DIM=100
vocab_size=100
num_labels=2
#Define model
def init_model():
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=Xtrain.shape[0]))
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(64, activation='softmax'))
model.add(Flatten())
#adam optimizer
adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(optimizer = adam, loss='categorical_crossentropy', metrics=['accuracy'])
return model
print('Train...')
model = init_model()
#To avoid overfitting
callbacks = [EarlyStopping('val_loss', patience=3)]
hist = model.fit(Xtrain, Ytrain, epochs=50, batch_size=50, validation_split=0.20, callbacks=callbacks, verbose=1)
#Evaluate model and print results
score, acc = model.evaluate(Xtest, Ytest, batch_size=50)
print('Test score:', score)
print('Test accuracy:', acc)
However, I'm getting the following error:
ValueError: Input 0 is incompatible with layer flatten_1: expected min_ndim=3, found ndim=2
I tried to remove the flatten layers, but getting different error:
ValueError: Error when checking input: expected dense_1_input to have shape (424686,) but got array with shape (6,)
424686 is the number of rows in dataset and 6 is the number of features.
I appreciate any suggestion. Thank you.
Based on Omarfoq suggestion, now I used three labels for both the training and testing datasets. The code and error remains unchanged.
Can anyone please suggest me the solution? Thank you.
I would say that what you are trying is not logical, your model will never predict class "3" if it doesn't exist in the training set. What you are trying have no sense. Try to reformulate your problem.