def CreateData(self, n_samples, seed_in=5,
train_prop=0.9, bound_limit=6., n_std_devs=1.96,**kwargs):
np.random.seed(seed_in)
scale_c=1.0 # default
shift_c=1.0
# for ideal boundary
X_ideal = np.linspace(start=-bound_limit,stop=bound_limit, num=50000)
y_ideal_U = np.ones_like(X_ideal)+1. # default
y_ideal_L = np.ones_like(X_ideal)-1.
y_ideal_mean = np.ones_like(X_ideal)+0.5
if self.type_in[:1] == '~':
if self.type_in=="~boston":
path = 'boston_housing_data.csv'
data = np.loadtxt(path,skiprows=0)
elif self.type_in=="~concrete":
path = 'Concrete_Data.csv'
data = np.loadtxt(path, delimiter=',',skiprows=1)
elif self.type_in=="~wind":
path = '/content/Deep_Learning_Prediction_Intervals/code/canada_CSV.csv'
data = np.loadtxt(path,delimiter=',',skiprows=1,usecols = (1,2)) ## CHECK WHTHER TO HAVE LOADTXT OR ANYTHING ELSE PARUL
# work out normalisation constants (need when unnormalising later)
scale_c = np.std(data[:,-1])
shift_c = np.mean(data[:,-1])
# normalise data for ALL COLUMNS
for i in range(0,data.shape[1]): ## i varies from 0 to number of columns ,means it reads one by one the columns
# avoid zero variance features (exist one or two)
# nonlocal X_train, y_train, X_val, y_val ## ADDED BY PARUL
sdev_norm = np.std(data[:,i])
sdev_norm = 0.001 if sdev_norm == 0 else sdev_norm
data[:,i] = (data[:,i] - np.mean(data[:,i]) )/sdev_norm
# split into train/test
perm = np.random.permutation(data.shape[0]) ## DO THE DATA PERMUTATION OF ALL THE ROWS (shuffle)
train_size = int(round(train_prop*data.shape[0]))
train = data[perm[:train_size],:]
test = data[perm[train_size:],:]
y_train = train[:,-1].reshape(-1,1) ## LAST COLUMN IS CONSIDERED AS THE TARGET AND RESHAPED IN BETWEEN -1,1
X_train = train[:,:-1] ## INPUTS ARE ALL EXCEPT LAST COLUMN
y_val = test[:,-1].reshape(-1,1)
X_val = test[:,:-1]
# save important stuff
self.X_train = X_train
self.y_train = y_train
self.X_val = X_val
self.y_val = y_val
self.X_ideal = X_ideal
self.y_ideal_U = y_ideal_U
self.y_ideal_L = y_ideal_L
self.y_ideal_mean = y_ideal_mean
self.scale_c = scale_c
self.shift_c = shift_c
return X_train, y_train, X_val, y_val
It gives me an error
'UnboundLocalError: local variable 'X_train' referenced before assignment'
Any help will be appreciated. I am stuck at this. I have tried initialising X_train with X_train=[] and also tried making them global, but didn't get any result. Please help me so that I could move forward.
Related
Is it possible to look at this code in LSTM? I want to train the data with the shape which I put here but I receive an error regarding the size of the batch I think so. I do not know which size of the batch. currently, the size of a batch that I choose is 64. should I put another size for the batch or the error is not related to the size of the batch?
should I choose for this code: the shape of X (7311, 17, 124) and shape of Y(7311, 1)
InvalidArgumentError: Incompatible shapes: [16] vs. [64]
[[node gradient_tape/binary_crossentropy/weighted_loss/Mul (defined at <ipython-input-74-f95f7e276c58>:1) ]] [Op:__inference_train_function_138498]
df = pd.read_csv("train_data.csv")
timestep = 17 #from 1 to 23 (17 with the current NaN strategy)
threshold_for_classification = -8
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()
fill_X = -0.01
seed = 11
#RNN hiperparameter
epochs = 75
batch = 64
val_split = 0.25
test_split = 0.25
lr = 0.0001
adam = optimizers.Nadam() #(lr)
class_weight = {True: 5.,
False: 1.}
verbose = 1
#Dropping first the empty column and then rows with NaNs
df = df.drop("c_rcs_estimate", axis=1)
df = df.dropna(how='any')
#Filtering events with len=1 or min_tca > 2 or max_tca < 2
def conditions(event):
x = event["time_to_tca"].values
return ((x.min()<2.0) & (x.max()>2.0) & (x.shape[0]>1))
df = df.groupby('event_id').filter(conditions)
#OHE for c_object_type (5 categories) -> 5 new features
df["mission_id"] = df["mission_id"].astype('category')
df["c_object_type"] = df["c_object_type"].astype('category')
df = pd.get_dummies(df)
#Getting y as 1D-array
y = df.groupby(["event_id"])["risk"].apply(lambda x: x.iloc[-1]).values.reshape(-1, 1)
#Scaling y
_ = y_scaler.fit(df["risk"].values.reshape(-1, 1)) #using the whole risk feature to scale the target 'y'
y = y_scaler.transform(y)
#Getting X as df (dropping rows with tca < 2)
df = df.loc[df["time_to_tca"]>2]
#Adding feature 'event_length' for counting how many instances each event has
df["event_length"] = df.groupby('event_id')['event_id'].transform(lambda x: x.value_counts().idxmax())
#Scaling X
df = pd.DataFrame(X_scaler.fit_transform(df), columns=df.columns)
#Transforming X into a 3D-array
events = df["event_id"].nunique() #rows
features = len(df.columns) #columns
X = np.zeros((events,timestep,features))
X.fill(fill_X)
i = 0
def df_to_3darray(event):
global X, i
#Transforming an event to time series (1,timesteps, columns)
row = event.values.reshape(1,event.shape[0],event.shape[1])
#Condition is needed to slice arrays correctly
#Condition -> is timestep greater than the event's time series length?
if(timestep>=row.shape[1]):
X[i:i+1,-row.shape[1]:,:] = row
else:
X[i:i+1,:,:] = row[:,-timestep:,:]
#index to iterate over X array
i = i + 1
#dataframe remains intact, while X array has been filled.
return event
df.groupby("event_id").apply(df_to_3darray)
#Dropping event_id to remove noise
X = X[:,:,1:]
#TODO: Padding with specific values column-wise instead of zeros.
#TODO: Separating time dependent and independent feature in 2 X arrays
print(X.shape, y.shape)
#computing scaled threshold
th = np.array([threshold_for_classification]).reshape(-1,1)
th = y_scaler.transform(th)
threshold_scaled = th[0,0]
#Splitting arrays
y_boolean = (y > threshold_scaled).reshape(-1,1)
X_train, X_test, y_train_numeric, y_test_numeric = train_test_split(X, y,
stratify=y_boolean,
shuffle=True,
random_state=seed,
test_size = test_split
)
y_train_boolean = (y_train_numeric > threshold_scaled).reshape(-1,1)
X_train, X_val, y_train_numeric, y_val_numeric = train_test_split(X_train, y_train_numeric,
stratify=y_train_boolean,
shuffle=True,
random_state=seed,
test_size = val_split
)
#transforming it into a classification task -> y_train, y_test boolean
y_train = (y_train_numeric > threshold_scaled).reshape(-1,1)
y_val = (y_val_numeric > threshold_scaled).reshape(-1,1)
y_test = (y_test_numeric > threshold_scaled).reshape(-1,1)
X_train = tf.convert_to_tensor(X_train,dtype=tf.int64)
X_test = tf.convert_to_tensor( X_test,dtype=tf.int64)
y_train_numeric = tf.convert_to_tensor(y_train_numeric,dtype=tf.int64)
y_test_numeric = tf.convert_to_tensor(y_test_numeric,dtype=tf.int64)
y_train_boolean = tf.convert_to_tensor(y_train_boolean,dtype=tf.int64)
X_val = tf.convert_to_tensor(X_val,dtype=tf.int64)
y_val_numeric = tf.convert_to_tensor(y_val_numeric,dtype=tf.int64)
y_train = tf.convert_to_tensor(y_train,dtype=tf.int64)
y_val = tf.convert_to_tensor(y_val,dtype=tf.int64)
y_test = tf.convert_to_tensor(y_test,dtype=tf.int64)
y_boolean = tf.convert_to_tensor(y_boolean,dtype=tf.int64)
#Percentage of high risks in train
print("TRAIN {:0.1f}, {:0.1f}, {:0.3f}".format(np.sum(y_train), y_train.shape[0], np.sum(y_train)/y_train.shape[0]))
#Percentage of high risks in val
print("VAL {:0.1f}, {:0.1f}, {:0.3f}".format(np.sum(y_val), y_val.shape[0], np.sum(y_val)/y_val.shape[0]))
#Percentage of high risks in test
print("TEST {:0.1f}, {:0.1f}, {:0.3f}".format(np.sum(y_test), y_test.shape[0], np.sum(y_test)/y_test.shape[0]))
# Model activation selu
input_tensor = Input(batch_shape=(batch, timestep, X_train.shape[2]))
rnn_1 = LSTM(32, stateful=False, dropout=0.15, recurrent_dropout=0.3, return_sequences=True, kernel_regularizer=L1L2(l1=0.0, l2=0.01))(input_tensor)
batch_1 = BatchNormalization()(rnn_1)
rnn_2 = LSTM(16, stateful=False, dropout=0.15, recurrent_dropout=0.3, return_sequences=True, kernel_regularizer=L1L2(l1=0.0, l2=0.01))(batch_1)
batch_2 = BatchNormalization()(rnn_2)
rnn_3 = LSTM(8, stateful=False, dropout=0.15, recurrent_dropout=0.3, return_sequences=False, kernel_regularizer=L1L2(l1=0.0, l2=0.01))(batch_2)
batch_3 = BatchNormalization()(rnn_3)
output_tensor = Dense(units = 1, activation='sigmoid')(batch_3)
model = Model(inputs=input_tensor,
outputs= output_tensor)
model.compile(loss='binary_crossentropy',
optimizer=adam,
metrics=['accuracy'])
model.summary()
model_history = model.fit(X_train, y_train,
epochs=epochs,
batch_size=batch,
#shuffle=True, #OJO
validation_data=(X_val, y_val),
verbose=verbose,
class_weight=class_weight
).history
I would suggest changing this line
input_tensor = Input(batch_shape=(batch, timestep, X_train.shape[2]))
to
input_tensor = tf.keras.layers.Input(shape=(timestep, X_train.shape[2]))
and then defining your batch_size in model.fit and make sure X_train and y_train have the same number of samples.
I have one problem concerning lgb. When I write
lgb.train(.......)
it finishes in less than milisecond. (for (10 000,25) ) shape dataset.
and when I write predict, all the output variables have same value.
train = pd.read_csv('data/train.csv', dtype = dtypes)
test = pd.read_csv('data/test.csv')
test.head()
X = train.iloc[:10000, 3:-1].values
y = train.iloc[:10000, -1].values
sc = StandardScaler()
X = sc.fit_transform(X)
#pca = PCA(0.95)
#X = pca.fit_transform(X)
d_train = lgb.Dataset(X, label=y)
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 10
params['min_data'] = 50
params['max_depth'] = 10
num_round = 10
clf = lgb.train(params, d_train, num_round, verbose_eval=1000)
X_test = sc.transform(test.iloc[:100,3:].values)
pred = clf.predict(X_test, num_iteration = clf.best_iteration)
when I print pred, all the values are (0.49)
It's my first time using lightgbm module. Do I have some error in the code? or I should look for some mismatches in dataset.
Your num_round is too small, it just starts to learn and stops there. Other than that, make your verbose_eval smaller, so see the results visually upon training. My suggestion for you to try the lgb.train code as below:
clf = lgb.train(params, d_train, num_boost_round=5000, verbose_eval=10, early_stopping_rounds = 3500)
Always use early_stopping_rounds since the model should stop if there is no evident learning or the model starts to overfit.
Do not hesitate to ask more. Have fun.
I am using Keras with a TensorFlow backend. I am using the ImageDataGenerator with the validation_split argument to split my data into train set and validation set. As such, I use flow_from_directory with the subset set to "training" and "testing" like so:
total_gen = ImageDataGenerator(validation_split=0.3)
train_gen = data_generator.flow_from_directory(my_dir, target_size=(input_size, input_size), shuffle=False, seed=13,
class_mode='categorical', batch_size=BATCH_SIZE, subset="training")
valid_gen = data_generator.flow_from_directory(my_dir, target_size=(input_size, input_size), shuffle=False, seed=13,
class_mode='categorical', batch_size=32, subset="validation")
This is amazingly convenient, as it allows me to use only one directory instead of two (one for training and one for validation). Now I wonder if it is possible to expand this process in order to generating a third set i.e. test set?
This is not possible out of the box. You should be able to do it with some minor modifications to the source code of ImageDataGenerator:
if subset is not None:
if subset not in {'training', 'validation'}: # add a third subset here
raise ValueError('Invalid subset name:', subset,
'; expected "training" or "validation".') # adjust message
split_idx = int(len(x) * image_data_generator._validation_split)
# you'll need two split indices here
if subset == 'validation':
x = x[:split_idx]
x_misc = [np.asarray(xx[:split_idx]) for xx in x_misc]
if y is not None:
y = y[:split_idx]
elif subset == '...' # add extra case here
else:
x = x[split_idx:]
x_misc = [np.asarray(xx[split_idx:]) for xx in x_misc] # change slicing
if y is not None:
y = y[split_idx:] # change slicing
Edit: this is how you could modify the code:
if subset is not None:
if subset not in {'training', 'validation', 'test'}:
raise ValueError('Invalid subset name:', subset,
'; expected "training" or "validation" or "test".')
split_idxs = (int(len(x) * v) for v in image_data_generator._validation_split)
if subset == 'validation':
x = x[:split_idxs[0]]
x_misc = [np.asarray(xx[:split_idxs[0]]) for xx in x_misc]
if y is not None:
y = y[:split_idxs[0]]
elif subset == 'test':
x = x[split_idxs[0]:split_idxs[1]]
x_misc = [np.asarray(xx[split_idxs[0]:split_idxs[1]]) for xx in x_misc]
if y is not None:
y = y[split_idxs[0]:split_idxs[1]]
else:
x = x[split_idxs[1]:]
x_misc = [np.asarray(xx[split_idxs[1]:]) for xx in x_misc]
if y is not None:
y = y[split_idxs[1]:]
Basically, validation_split is now expected to be a tuple of two floats instead of a single float. The validation data will be the fraction of data between 0 and validation_split[0], test data between validation_split[0] and validation_split[1] and training data between validation_split[1] and 1. This is how you can use it:
import keras
# keras_custom_preprocessing is how i named my directory
from keras_custom_preprocessing.image import ImageDataGenerator
generator = ImageDataGenerator(validation_split=(0.1, 0.5))
# First 10%: validation data - next 40% test data - rest: training data
gen = generator.flow_from_directory(directory='./data/', subset='test')
# Finds 40% of the images in the dir
You will need to modify the file in two or three additional lines (there is a typecheck you will have to change), but that's it and that should work. I have the modified file, let me know if you are interested, I can host it on my github.
I'm attempting to use a Keras Model in a Genetic Algorithm for text classification, however i'm getting an error with pad_sequences where it claims that:
TypeError: pad_sequences() got multiple values for argument 'maxlen'
The actual pad_sequences variable assignment is:
data = self.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
Which is found within:
def get_data(self):
"""Retrieve the dataset and process the data."""
batch_size = 128
VALIDATION_SPLIT = 0.2
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
csv = 'VocabCSV.csv'
my_df = self.pd.read_csv(csv,index_col=0,encoding = 'latin-1')
my_df.dropna(inplace=True)
my_df.reset_index(drop=True,inplace=True)
print(my_df.info())
texts = my_df.Text # list of text samples
labellist = my_df.Target # list of labels
label_vals = [] # label values list
labels_index = {} # dictionary mapping label name to numeric id
labels = [] # list of label ids
for label in labellist:
if label not in label_vals:
label_vals.append(label)
for idx, text in enumerate(texts):
for label in label_vals:
if label == labellist[idx]:
label_id = label_vals.index(label)
labels_index[text] = label_id
labels.append(label_id)
print("labels index {}".format(len(labels_index)))
print("labels size: %s " % len(labels))
print("found %s texts." % len(texts))
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = self.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = self.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(self.np.asarray(labels).shape)
labels = self.to_categorical(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
# split the data into a training set and a validation set
indices = self.np.arange(data.shape[0])
self.np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_test = data[-num_validation_samples:]
y_test = labels[-num_validation_samples:]
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(len(x_test))
print(len(y_test))
input_shape = MAX_SEQUENCE_LENGTH
print(input_shape)
nb_classes = len(label_vals)
return (nb_classes, batch_size, input_shape, x_train, x_test, y_train, y_test, word_index)
The error appears to be occurring whenever get_data is called by another function, however I can't identify what's actually causing it.
The issue is that you have self.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH). The pad_sequences method doesn't belong to your class, but is from keras.preprocessing.sequence.
So if you want it to work correctly, do the import like this:
from keras.preprocessing import sequence
And then call the pad_sequences like this:
sequences = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
For some reason the features of this dataset is being interpreted as rows, "Model n_features is 16 and input n_features is 18189" Where 18189 is the number of rows and 16 is the correct feature list.
The suspect code is here:
for var in cat_cols:
num = LabelEncoder()
train[var] = num.fit_transform(train[var].astype('str'))
train['output'] = num.fit_transform(train['output'].astype('str'))
for var in cat_cols:
num = LabelEncoder()
test[var] = num.fit_transform(test[var].astype('str'))
test['output'] = num.fit_transform(test['output'].astype('str'))
clf = RandomForestClassifier(n_estimators = 10)
xTrain = train[list(features)].values
yTrain = train["output"].values
xTest = test[list(features)].values
xTest = test["output"].values
clf.fit(xTrain,yTrain)
clfProbs = clf.predict(xTest)#Error happens here.
Anyone got any ideas?
Sample training date csv
tr4,42,"JobCat4","divorced","tertiary","yes",2,"yes","no","unknown",5,"may",0,1,-1,0,"unknown","TypeA"
Sample test data csv
tst2,47,"JobCat3","married","unknown","no",1506,"yes","no","unknown",5,"may",0,1,-1,0,"unknown",?
You have a small typo - you created the variable xTest and then are immediately overwriting to something incorrect. Change the offending lines to:
xTest = test[list(features)].values
yTest = test["output"].values