I am new to machine learning, and this is my first time creating a linear regression model on a dataset (which is a big step for me). I have created my reference rows and reshaped them. The only problem is that it is too slow. Is there any code or a better approach that I can use? It would be great if you could review my code.
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
db = pd.read_csv("Melbourne_housing_FULL.csv")
db2 = pd.read_csv("MELBOURNE_HOUSE_PRICES_LESS.csv")
"""['Suburb', 'Address',
'Rooms', 'Type', 'Price',
'Method', 'SellerG',
'Date', 'Distance',
'Postcode', 'Bedroom2', 'Bathroom', 'Car',
'Landsize', 'BuildingArea',
'YearBuilt', 'CouncilArea',
'Regionname', 'Propertycount']
column names for Full.csv"""
rooms_properties = db[["Rooms","Landsize","Bathroom","Car","YearBuilt"]].copy()
rooms_price = db[["Price"]].copy()
room_array_properties = rooms_properties.to_numpy()
room_array_price = rooms_price.to_numpy()
##Splitted list with percentage
def indice_splitter(array_prop, x=0.2, seed=None):
    """Randomly split the row indices of ``array_prop`` into two groups.

    Args:
        array_prop: Any sized object; only ``len(array_prop)`` is used.
        x: Fraction of the indices (``0 <= x <= 1``) that goes into the
            second returned group (the validation share).
        seed: Optional seed for a reproducible shuffle.  ``None`` keeps the
            original behaviour of drawing from NumPy's global random state.

    Returns:
        ``(train_indices, validation_indices)``: two disjoint integer arrays
        that together cover ``range(len(array_prop))``.

    Raises:
        ValueError: If ``x`` is outside ``[0, 1]``.
    """
    if not 0 <= x <= 1:
        raise ValueError("x must be a fraction between 0 and 1, got {!r}".format(x))
    n_rows = len(array_prop)
    if seed is None:
        shuffled = np.random.permutation(n_rows)
    else:
        # A private generator leaves the global random state untouched.
        shuffled = np.random.RandomState(seed).permutation(n_rows)
    n_validation = int(n_rows * x)
    return shuffled[n_validation:], shuffled[:n_validation]
## Converted df as tensor
train_indices,validation_indices = indice_splitter(room_array_price)
train_data,targets1 = room_array_properties[train_indices], room_array_price[train_indices]
validation_data ,targets2 = torch.from_numpy(room_array_properties[validation_indices]).float(), torch.from_numpy(room_array_price[validation_indices]).float()
t_data, tar1 = torch.tensor(train_data,requires_grad=True).float(), torch.tensor(targets1,requires_grad=True).float()
# rooms_price = rooms_price[rooms_price.notnull()]
# r_nonull = [rooms_properties.loc[rooms_properties[i].notnull()] for i in rooms_properties.columns]
# r_nonull = r_nonull[len(r_nonull)-1]
# r_array = r_nonull.to_numpy()
# weight = torch.rand(5,1, dtype=float,requires_grad=True)
# bias = torch.randn(len(train_data),dtype=float,requires_grad=True)
## my model and result
model = nn.Linear(5,1)
weight, bias = model.parameters()
train_ds = TensorDataset(t_data,tar1)
batch_size = 10
train_dl = DataLoader(train_ds,batch_size,shuffle=True)
preds = model(t_data)
loss_fn = F.mse_loss
opt = torch.optim.SGD(model.parameters(), lr= 1e-5)
def fit(num_epochs, model, loss_fn, opt, data_loader=None):
    """Train ``model`` with mini-batch gradient descent.

    Args:
        num_epochs: Number of full passes over the data.
        model: Callable module mapping an input batch to predictions.
        loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar tensor.
        opt: Optimizer that updates the parameters of ``model``.
        data_loader: Iterable of ``(inputs, targets)`` batches.  Defaults to
            the module-level ``train_dl`` that the original code relied on.
    """
    if data_loader is None:
        data_loader = train_dl
    # NOTE(review): a `nan` loss usually originates in the data (missing
    # values, unscaled features) and not in this loop; confirm the inputs
    # are cleaned before training.
    for epochy in range(num_epochs):
        loss = None
        for xb, yb in data_loader:
            # Gradients accumulate across backward() calls, so clear them
            # before each step.
            opt.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            # Backpropagate and apply the update; without these two calls
            # the parameters never change and the loss cannot decrease.
            loss.backward()
            opt.step()
        if (epochy + 1) % 10 == 0 and loss is not None:
            print("{}/{}, Loss:{:.4f}".format(epochy+1,num_epochs,loss.item()))
fit(10,model, loss_fn,opt)
Output is = 10/10, Loss:nan
My expected output should decrease my loss function value every time.
I want to iterate this regression at least 1000 times.
I have 1660Ti , i7 9th Gen, 16gb ram laptop
I made my first Korean chatbot program with Python, PyTorch, and PyCharm. It works in my local environment but is very slow, so I want to move my code to Google Colab to make it faster. However, I get a runtime error saying that tensors on two devices (cuda and cpu) are being used in the same operation. I looked up this error and found that I should move everything to the GPU for it to work correctly. I have added `.to(device)` / `.cuda()` in several places, but it still does not work. Please help me. Below is my whole training code, Trainer.py; the problem occurs when I import it from another script (`import Trainer`).
import aboutDataSets
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm # 학습 진행률 시각화 1
from time import sleep # 학습 진행률 시각화 2
import re # 정규식 계산
import os
import urllib.request # url로 csv파일 받아오기
from torch.utils.data import DataLoader, Dataset
from transformers.optimization import AdamW # optimizer
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
Q_TKN = "<usr>"
A_TKN = "<sys>"
BOS = '</s>'
EOS = '</s>'
MASK = '<unused0>'
SENT = '<unused1>'
PAD = '<pad>'
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
ChatData = pd.read_csv("ChatBotDataMain.csv")
ChatData = ChatData[:300]
# print(ChatData.head())
#dataset 만들기
dataset = aboutDataSets.ChatDataset(ChatData)
batch_size = 32
num_workers = 0
def collate_batch(batch):
    """Collate ``(token_ids, mask, label)`` samples into three LongTensors.

    Each element of ``batch`` is indexed at positions 0, 1 and 2; the values
    found at each position are gathered across the batch and converted with
    ``torch.LongTensor``.
    """
    token_rows, mask_rows, label_rows = [], [], []
    for sample in batch:
        token_rows.append(sample[0])
        mask_rows.append(sample[1])
        label_rows.append(sample[2])
    return (
        torch.LongTensor(token_rows),
        torch.LongTensor(mask_rows),
        torch.LongTensor(label_rows),
    )
# 아래 collate_batch 변수때문에 여기 한번 더 호출.
#dataloader 선언
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_set = aboutDataSets.ChatDataset(ChatData, max_len=40)
train_dataLoader = DataLoader(train_set,
lr = 3e-5
criterion = torch.nn.CrossEntropyLoss(reduction='none')
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
epoch = 10
sneg = -1e18
# 학습 시작
# The model must live on the same device as the batches; mixing CPU and CUDA
# tensors in one operation is what raises the device-mismatch RuntimeError.
model.to(device)
model.train()
for epoch in tqdm(range(epoch)):  # tqdm draws a progress bar over the epochs
    for batch_idx, samples in enumerate(train_dataLoader):
        # The DataLoader yields CPU tensors; move each one to `device`.
        token_ids, mask, label = [t.to(device) for t in samples]
        optimizer.zero_grad()
        out = model(token_ids)
        out = out.logits  # indexed below as a rank-3 tensor (dim 2 is repeated over)
        # Expand the mask along dim 2 so it lines up with every logit.
        mask_3d = mask.unsqueeze(dim=2).repeat_interleave(repeats=out.shape[2], dim=2)
        # Positions where the mask is not 1 are replaced by a very negative value.
        mask_out = torch.where(mask_3d == 1, out, sneg * torch.ones_like(out))
        loss = criterion(mask_out.transpose(2, 1), label)
        avg_loss = loss.sum() / mask.sum()  # normalise the summed loss by the mask total
        # Backpropagate and update; without these the weights never change.
        avg_loss.backward()
        optimizer.step()
# 학습 끝
Replace `token_ids, mask, label = samples` with `token_ids, mask, label = [t.to(device) for t in samples]`.
This is because the samples produced by the DataLoader are on the CPU by default, not on CUDA. You have to move them to CUDA before running the forward pass. The model itself must be on the same device as well (`model.to(device)`).
I have medical data collected on 30 patients over 30 times series. The response is categorical and over four categories. Modifying an example in Udemy, PyTorch for Deep Learning with Python Bootcamp, I trained a single LSTM time series model fitting the parameters over the 30 patients. This seems to be working well.
Now, I want to account for covariate information. The devices the patients were fitted with return physiological information at all the points in time and can help predict the response. There is also demographic information in the data set.
I tried with just one quantitative variable, 'mean_heart_X', in the data set, but I don't know how to make use of this information.
Can someone help me out? See my code below:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv("C:/Users/Jsmith/mydata.csv")
df = df.dropna(axis = 0, how ='any')
nsubj = 30

# One DataFrame per subject; rows are selected where 'Subject' equals 1..nsubj.
subjects = []
for i in range(nsubj):
    dfs = df[df['Subject'] == (i+1)]
    subjects.append(dfs)  # without this, subjects[i] below raises IndexError

# Use one category -> code mapping for everybody.  Building the categories
# per subject would assign different codes to the same response whenever a
# subject never shows one of the levels.
response_dtype = df['Response_var'].astype('category').dtype
subjects_ytrain = []
for i in range(nsubj):
    y = subjects[i]['Response_var']
    y = y.astype(response_dtype)
    y_train = torch.tensor(y.cat.codes.values).flatten().float()
    subjects_ytrain.append(y_train)

# How can include covariates as below in my model?
covs_train = []
for i in range(nsubj):
    x = subjects[i]['mean.Heart_X']
    x = torch.tensor(x.values, dtype = torch.float)
    covs_train.append(x)
def input_data(seq, ws):  # ws is the window size
    """Slice ``seq`` into ``(window, label)`` training pairs.

    Args:
        seq: Sliceable sequence (for example a 1-D tensor) of observations.
        ws: Window size, i.e. the number of consecutive items per window.

    Returns:
        A list of ``(seq[i:i+ws], seq[i+ws:i+ws+1])`` tuples, one for each
        start index ``i`` in ``range(len(seq) - ws)``.  The list is empty
        when ``len(seq) <= ws``.
    """
    # The original loop built each window and label but never stored them,
    # so the function always returned an empty list.
    return [(seq[i:i + ws], seq[i + ws:i + ws + 1]) for i in range(len(seq) - ws)]
window_size = 7
# Create the training dataset of sequence/label tuples, one list per subject:
subjects_traindata = []
for i in range(nsubj):
    train_data = input_data(subjects_ytrain[i],window_size)
    # Store the result; otherwise subjects_traindata stays empty.
    subjects_traindata.append(train_data)
class LSTM(nn.Module):
    """Single-layer LSTM followed by a linear read-out.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        out_size: Number of values produced per time step by the linear layer.

    The recurrent state is kept in ``self.hidden`` and is overwritten by every
    ``forward`` call; callers reset it between independent sequences.
    """

    def __init__(self, input_size=1, hidden_size=50, out_size=4):
        # nn.Module must be initialised before sub-modules can be assigned.
        super().__init__()
        self.hidden_size = hidden_size
        # Add an LSTM layer:
        self.lstm = nn.LSTM(input_size, hidden_size)
        # Add a fully-connected layer:
        self.linear = nn.Linear(hidden_size, out_size)
        # Initialize h0 and c0:
        self.hidden = (torch.zeros(1, 1, hidden_size),
                       torch.zeros(1, 1, hidden_size))

    def forward(self, seq):
        """Run ``seq`` through the LSTM and return the last step's output."""
        # The LSTM is fed (len(seq), 1, features): one sequence at a time.
        lstm_out, self.hidden = self.lstm(
            seq.view(len(seq), 1, -1), self.hidden)
        pred = self.linear(lstm_out.view(len(seq), -1))
        return pred[-1]  # we only care about the last prediction
model = LSTM()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
epochs = 10
for i in range(epochs):
    for j in range(nsubj):
        # tuple-unpack the train_data set
        for seq, y_train in subjects_traindata[j]:
            # reset the gradients and the hidden/cell state for every window
            optimizer.zero_grad()
            model.hidden = (torch.zeros(1, 1, model.hidden_size),
                            torch.zeros(1, 1, model.hidden_size))
            y_pred = model(seq)
            # CrossEntropyLoss takes (batch, classes) logits and integer class
            # indices, so add a batch axis and cast the float codes to long.
            loss = criterion(y_pred.view(1, -1), y_train.view(-1).long())
            # Backpropagate and update; without these the model never learns.
            loss.backward()
            optimizer.step()
    # print training result
    print(f'Epoch: {i+1:2} Loss: {loss.item():10.8f}')
I have three models in three different files: Model_A.py, Model_B.py, and Model_C.py. Model_A is the first one I created. When I run Model_A, everything works well. However, when I run Model_B or Model_C, Python still runs Model_A. I guessed it has to do with the session, but I am not sure, and I have not figured out how to fix it.
Here is Code for model A.
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import glob
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #this removes some warning comments. This warning comments express that this PC has a CPU able to
#compute much faster, and that tensorflow was not designed for it. For the moment, will keep it like this. If necessary, we'll use GPU
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, optimizers
from tensorflow.keras.callbacks import Callback
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import warnings
from math import sqrt
from keras import backend as K
#Early stopping based on loss
class EarlyStoppingByLossVal(Callback):
    """Stop training once a monitored metric drops below a threshold.

    Args:
        monitor: Key looked up in the ``logs`` dict at the end of each epoch.
        value: Threshold; training stops when the metric is strictly below it.
        verbose: When greater than 0, print a message on stopping.
    """

    def __init__(self, monitor='val_loss', value=0.00001, verbose=0):
        # super(Callback, self) would start the lookup *after* Callback and
        # therefore skip Callback.__init__ itself.
        super(EarlyStoppingByLossVal, self).__init__()
        self.monitor = monitor
        self.value = value
        self.verbose = verbose

    def on_epoch_end(self, epoch, logs=None):
        """Set ``self.model.stop_training`` when the metric is below ``value``."""
        logs = logs or {}  # avoid a shared mutable default argument
        current = logs.get(self.monitor)
        if current is None:
            warnings.warn("Early stopping requires %s available!" % self.monitor, RuntimeWarning)
            # Nothing to compare; `None < float` would raise TypeError.
            return
        if current < self.value:
            if self.verbose > 0:
                print("Epoch %05d: early stopping THR" % epoch)
            self.model.stop_training = True
NAME = "TensBo{}".format(int(time.time()))
tensorboard = tf.keras.callbacks.TensorBoard(log_dir='logs/{}'.format(NAME))
# # DATA MANAGER: Define a function that imports and defines data
# data_manager: load every CSV under paths[0] (test set) and paths[1]
# (training set) and return (test, inputs, targets, n_test, t_step).
#   paths    - paths[0] is the test folder, paths[1] the training folder.
#   col      - column labels dropped from the inputs; the outputs are the
#              label slice col[1]:col[2].
#   row_drop - index label(s) dropped from the inputs.
#   inout    - inout[0] input features per step, inout[1] target features.
# NOTE(review): the original indentation of this block was lost; the loop
# bodies below have to be re-indented before the code can run.
def data_manager(paths, col, row_drop, inout):
test_files = glob.glob(paths[0] + "/*.csv") # keeping directories in a list
n_test = len(test_files) # number of files
q = 0
# One [input, output] pair per test file, filled in by position q.
test = [None]*n_test
for csv in test_files:
pred_data = pd.read_csv(csv, sep=';', encoding='cp1252')
# t_step is assigned only here, so the code further down needs at least one
# test file to have been read.
t_step = pred_data.shape[0] # as all data has the same shape, we can keep these values for later use
#((1 t_step-1 inout[0]) (batch_sz t_step-1 inout[0]) (batch_sz t_step-1 inout[0]))
pred_input = np.array(np.reshape(pred_data.drop(columns=col, index=row_drop).values,
(1, t_step-1, inout[0])), dtype='float') #Remove selected columns and indexes. Reshape data
# NOTE(review): '1' is a string label slice on the row index; confirm this
# matches the index type produced by read_csv in the pandas version in use.
pred_output = np.array(pred_data.loc['1':, col[1]: col[2]], dtype='float')
test[q] = [pred_input, pred_output]
q = q + 1
# Collect the training files and count them
train_files = glob.glob(paths[1] + "/*.csv") #keeping directories in a list
n_files = len(train_files) #number of files
#To check encoding of a file just print its path: with open(r'I:\05_Basanta Franco\Python\Data02\Data1574095060.csv') as f:
# Flat buffers holding (t_step-1) rows for each training file.
inputs = np.zeros([n_files*(t_step-1), inout[0]])
targets = np.zeros([n_files * (t_step-1), inout[1]])
# [j, i) is the row range of the current file inside the flat buffers.
i = t_step-1
j = 0
# Read every training CSV and copy its rows into the buffers
for csv in train_files:
matrix = pd.read_csv(csv, sep=';', encoding='cp1252')
data_in = matrix.drop(columns=col, index=row_drop).values
data_out = matrix.loc['1':, col[1]: col[2]].values
inputs[j:i, :] = data_in
targets[j:i, :] = data_out
i = i + t_step-1
j = j + t_step-1
batch_sz = n_files
# Reshape to (batch, timestep, features): one batch entry per training file
inputs = np.reshape(inputs, (batch_sz, t_step-1, inout[0])) #input selection
targets = np.reshape(targets, (batch_sz, t_step-1, inout[1])) #target selection
return test, inputs, targets, n_test, t_step
test_path = r'I:\05_Basanta Franco\Python\Test'
train_path = r'I:\05_Basanta Franco\Python\Data02'
model_path = r'I:\05_Basanta Franco\Python\model\model01\model{}.h5'
paths = [test_path, train_path, model_path]
col = ['All calculations', 'MSNS-Trafo', 'MSNS-Trafo.1']
row_drop = 0
inout = [11, 2]
test, inputs, targets, n_test, t_step = data_manager(paths, col, row_drop, inout) #test is a list with test inputs and outputs.
# Creating a model, which is a linear stack of layers
model = Sequential()
LSTM layer of n nodes. Shape of the input is the columns of inputs. activation function is rectifier linear function.
Return sequencies = true basically tells the layer to output a sequence. If we were to have another Recurrent layer, this is necessary. Else not, as it would not understand it
Time distribute is important. That basically relates every input step in the input sequence with its corresponding output.
Other way we would just be considering the last value of the sequence
l1 = model.add(layers.LSTM(inout[0], input_shape=(t_step-1, inout[0]), activation='relu', return_sequences=True)) #adding a RNN layer
l3 = model.add(layers.LSTM(30, activation='relu', return_sequences=True)) #adding a RNN layer
l4 = model.add(layers.LSTM(10, activation='relu', return_sequences=True)) #adding a RNN layer
l5 = model.add(layers.Dense(2)) #fully connected layer. What i would understand as a normal layer
opt = optimizers.Adam(lr=1e-03) #how fast the learning rate decays. this helps finding the miminum better
callbacks = [EarlyStoppingByLossVal('val_loss', value=0.002),
ModelCheckpoint(filepath=model_path.format(int(time.time())), save_best_only=True)]
#compiling the model. Defining some of the features for the fit like the type of loss function, the optimizer and metrics that are interesting for us
metrics=['mse', 'mae']) # accuracy only valid for clasiffication tasks
history = model.fit(inputs, targets, epochs=50, validation_split=0.25, callbacks=callbacks)
# Evaluate the model
scores = model.evaluate(inputs, targets, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
# print a summary of the outputs of every layer
#The model is saved by modelcheckpoint in a folder. Here, we are saving the models arquitecture in a json file
#model_json = model.to_json()
#with open("model/model01/model.json", "w") as json_file:
# json_file.write(model_json)
t = 1
fig1 = plt.figure()
for prediction in test:
NN_pred = model.predict(prediction[0])
#reshape the prediction for plotting
NN_pred = np.reshape(NN_pred, (prediction[1].shape[0], inout[1]))
prediction[0] = np.reshape(prediction[0], (t_step-1, inout[0]))
#plots: top, predicted and desired test output. down, test inputs
plt.subplot(n_test, 1, t)
plt.title('Test0' + np.str(t))
plt.legend(['I_real_pred', 'I_im_pred', 'Ir', 'Ii'])
# mean squared error
rmse = sqrt(mean_squared_error(prediction[1], NN_pred))
print('Test RMSE: %.3f' % rmse)
t = t + 1
fig2 = plt.figure()
# plot loss during training
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
# plot mse during training
plt.title('Mean Squared Error')
plt.plot(history.history['mse'], label='train')
plt.plot(history.history['val_mse'], label='test')
# print inputs yes or no
printin = input('Print inputs as well? [y/n]: ')
m = True
while m == True:
if printin == 'y':
t = 1
fig3 = plt.figure()
for prediction in test:
plt.title('Inputs: V, P, Q')
plt.subplot(n_test, 1, t)
t = t + 1
m = False
elif printin == 'n':
m = False
printin = input('Answer not valid. Print inputs? [y/n]: ')
And here is codel model B.
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import glob
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #this removes some warning comments. This warning comments express that this PC has a CPU able to
#compute much faster, and that tensorflow was not designed for it. For the moment, will keep it like this. If necessary, we'll use GPU
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, optimizers
from tensorflow import Graph
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from keras import backend as K
from Model_A import EarlyStoppingByLossVal
# data_manager_1: load every CSV under paths[0] (test set) and paths[1]
# (training set) and return (test, inputs, targets, n_test, t_step).
# Here every row becomes its own sample: the arrays are reshaped to
# (rows, 1, features).
#   paths    - paths[0] is the test folder, paths[1] the training folder.
#   col      - column labels dropped from the inputs; the outputs are the
#              label slice col[1]:col[2].
#   row_drop - index label(s) dropped from the inputs.
#   inout    - inout[0] input features per step, inout[1] target features.
# NOTE(review): the original indentation of this block was lost; the loop
# bodies below have to be re-indented before the code can run.
def data_manager_1(paths, col, row_drop, inout):
test_files = glob.glob(paths[0] + "/*.csv") # keeping directories in a list
n_test = len(test_files) # number of files
q = 0
# One [input, output] pair per test file, filled in by position q.
test = [None]*n_test
for csv in test_files:
pred_data = pd.read_csv(csv, sep=';', encoding='cp1252')
# t_step is assigned only here, so the code further down needs at least one
# test file to have been read.
t_step = pred_data.shape[0] # as all data has the same shape, we can keep these values for later use
pred_input = np.array(np.reshape(pred_data.drop(columns=col, index=row_drop).values,
(t_step-1, 1, inout[0])), dtype='float') #Remove selected columns and indexes. Reshape data
# NOTE(review): '1' is a string label slice on the row index; confirm this
# matches the index type produced by read_csv in the pandas version in use.
pred_output = np.array(pred_data.loc['1':, col[1]: col[2]], dtype='float')
test[q] = [pred_input, pred_output]
q = q + 1
# Collect the training files and count them
train_files = glob.glob(paths[1] + "/*.csv") #keeping directories in a list
n_files = len(train_files) #number of files
#To check encoding of a file just print its path: with open(r'I:\05_Basanta Franco\Python\Data02\Data1574095060.csv') as f:
# Flat buffers holding (t_step-1) rows for each training file.
inputs = np.zeros([n_files*(t_step-1), inout[0]])
targets = np.zeros([n_files * (t_step-1), inout[1]])
# [j, i) is the row range of the current file inside the flat buffers.
i = t_step-1
j = 0
# Read every training CSV and copy its rows into the buffers
for csv in train_files:
matrix = pd.read_csv(csv, sep=';', encoding='cp1252')
data_in = matrix.drop(columns=col, index=row_drop).values
data_out = matrix.loc['1':, col[1]: col[2]].values
inputs[j:i, :] = data_in
targets[j:i, :] = data_out
i = i + t_step-1
j = j + t_step-1
# batch_sz is assigned but not used in the reshapes below.
batch_sz = n_files
# Reshape to (rows, 1, features): every row is a sequence of length one
inputs = np.reshape(inputs, (inputs.shape[0], 1, inout[0])) #input selection
targets = np.reshape(targets, (targets.shape[0], 1, inout[1])) #target selection
return test, inputs, targets, n_test, t_step
test_path = r'I:\05_Basanta Franco\Python\Test'
train_path = r'I:\05_Basanta Franco\Python\Data02'
model_path = r'I:\05_Basanta Franco\Python\model\model02\model{}.h5'
paths = [test_path, train_path, model_path]
col = ['All calculations', 'MSNS-Trafo', 'MSNS-Trafo.1']
row_drop = 0
inout = [11, 2]
test, inputs, targets, n_test, t_step = data_manager_1(paths, col, row_drop, inout) #test is a list with test inputs and outputs.
model02 = Sequential()
l1 = model02.add(layers.LSTM(inout[0], input_shape=(1, inout[0]), activation='relu', return_sequences=True)) #adding a RNN layer
l3 = model02.add(layers.LSTM(5, activation='relu', return_sequences=True)) #adding a RNN layer
l5 = model02.add(layers.Dense(2)) #fully connected layer. What i would understand as a normal layer
#compiling the model. Defining some of the features for the fit like the type of loss function, the optimizer and metrics that are interesting for us
opt = optimizers.Adam(lr=1e-03) #how fast the learning rate decays. this helps finding the miminum better
callbacks = [EarlyStopping('val_loss', patience=20),
ModelCheckpoint(filepath=model_path.format(int(time.time())), save_best_only=True)]
metrics=['mse', 'mae']) # accuracy only valid for clasiffication tasks
# train model and save history
history = model02.fit(inputs, targets, epochs=20, validation_split=0.25, callbacks=callbacks)
# plot loss during training
def train_plots(history):
fig2 = plt.figure()
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
# plot mse during training
plt.title('Mean Squared Error')
plt.plot(history.history['mse'], label='train')
plt.plot(history.history['val_mse'], label='test')
I tried to initialize graphs and sessions for the creation of the models, but it is not working.
Would it be possible(at least as a workaround) to kill each process after it finishes the task? Killing the process would ensure the release of memory of TensorFlow.
If the models need a communication channel/have intermediate results sent over, you could use queues or text files in order to solve this.
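Just fixed it. My problem was that I was unable to reset the TensorFlow session or clear the graphs in my session to create and train a different model. I found that the command `keras.backend.reset_uids()` does that. Thank you anyway! (Note also that `from Model_A import EarlyStoppingByLossVal` in Model_B executes all of Model_A's top-level code, including its training, unless that code is placed under an `if __name__ == "__main__":` guard.)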
I was trying to build a neural network with 4 input nodes/features and just one output feature (0/1). I wrote this code and it runs, but during training the model returns NaN. I debugged it too, and the weights and biases are fine until the data goes through the model.
From what I've searched so far, this could be a problem in the way I am passing the data.
My input data is : tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 1.5340e+00],
[1.5000e+01, 1.0000e-01, 2.4210e+00, 3.0000e+01],
[3.0000e+00, 2.2000e-01, 2.2000e-01, 4.5000e+01],
[1.0000e+00, 2.0000e-02, 2.0000e-02, 1.5000e+01],
[6.0000e+00, 2.0000e-01, 2.0000e-01, 1.5000e+01],
[1.7000e+01, 5.2400e-01, 5.2400e-01, 2.0000e+00]], dtype=torch.float64)
import torch
from torchvision import datasets, transforms
import pandas as pd
import numpy as np
from torch.autograd import Variable
# Import tensor dataset & data loader
from torch.utils.data import TensorDataset, DataLoader
from torch import nn, optim
import torch.nn.functional as F
file = pd.read_csv('ks-projects-201801.csv')
array = np.array(file.values)
# Label: 1.0 where column 9 equals 'successful', 0.0 otherwise.  The original
# loop had no `else`, so every label was overwritten with 0.
result = (array[:, 9] == 'successful').astype(float)
# Features: column 10 as is, columns 12-14 divided by 1000.
# NOTE(review): any NaN in these columns propagates straight into the loss;
# check the CSV and drop or fill such rows before training.
input_data = np.empty((len(array), 4))
input_data[:, 0] = array[:, 10]
input_data[:, 1:] = array[:, 12:15] / 1000
input_node = Variable(torch.from_numpy(input_data))
output = torch.from_numpy(result)
train_ds = TensorDataset(input_node.squeeze(), output.squeeze())
batch_size = 5
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
This is the actual model and training
model = nn.Linear(4, 1)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.003)
epochs = 5
# The dataset tensors are float64, so the layer is converted to match.
model = model.double()
for e in range(epochs):
    running_loss = 0
    for xb, yb in train_dl:
        optimizer.zero_grad()
        # (batch, 1) -> (batch,) so the predictions line up with yb and are
        # not broadcast against it.
        res = model(xb).squeeze(1)
        loss = criterion(res, yb)
        # Backpropagate and update; without these the weights never change.
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"model : {loss}")
This prints out model: nan for every epoch and terminates. I am very new to pytorch and I'm not sure how to handle this problem.
If you see NaNs in the loss, try gradient clipping and data normalisation. Normalising the data is a must (i.e. normalise the input data so that mean = 0 and variance = 1).
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#reproducible random seed
seed = 1
#Import and normalize the data
df = pd.read_csv('creditcard.csv')
#Exploring the data
# print df.head()
# print df.describe()
# print df.isnull().sum()
# count_class = pd.value_counts(df['Class'])
# count_class.plot(kind = 'bar')
# plt.title('Fraud class histogram')
# plt.xlabel('class')
# plt.ylabel('Frequency')
# plt.show()
# print('Clearly the data is totally unbalanced!')
#to normalize the amount column
# data['normAmount'] = StandardScaler().fit_transform(data['Amount'].reshape(-1, 1))
df['normAmount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))
df = df.drop(['Time','V28','V27','V26','V25','V24','V23','V22','V20','V15','V13','V8','Amount'], axis =1)
X = df.iloc[:,df.columns!='Class']
Y = df.iloc[:,df.columns=='Class']
# number of records in the minority class
number_record_fraud = len(df[df.Class==1])
fraud_indices = np.array(df[df.Class==1].index)
#picking normal class
normal_indices = np.array(df[df.Class==0].index)
#select random x(number_record_fraud) numbers from normal_indices
random_normal_indices = np.random.choice(normal_indices,number_record_fraud,replace=False)
random_normal_indices = np.array(random_normal_indices)
#under sample data
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])
under_sample_data = df.iloc[under_sample_indices,:]
X_undersample = under_sample_data.iloc[:,under_sample_data.columns!='Class']
Y_undersample = under_sample_data.iloc[:,under_sample_data.columns=='Class']
# split data into train and test dataset
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.3)
X_train_undersample,X_test_undersample,Y_train_undersample,Y_test_undersample = train_test_split(X_undersample,Y_undersample,test_size=0.3)
learning_rate = 0.05
training_epoch = 10
batch_size = 43
display_step = 1
#tf graph input
x = tf.placeholder(tf.float32,[None,18])
y = tf.placeholder(tf.float32,[None,1])
#set model weights
w = tf.Variable(tf.zeros([18,1]))
b = tf.Variable(tf.zeros([1]))
#construct model
# With a single output unit softmax is always exactly 1, which makes the
# cross entropy 0 and the model untrainable; a sigmoid is the right choice.
logits = tf.matmul(x,w) + b
pred = tf.nn.sigmoid(logits)
#minimize error using binary cross entropy, computed from the logits
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits))
#Adam optimizer
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
# Accuracy: threshold the probability at 0.5.  argmax over a single column
# is always 0, which reported 100% accuracy regardless of the model.  The
# ops are built once, not once per epoch.
correct_prediction = tf.equal(tf.round(pred), y)
accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
#initializing variables
init = tf.global_variables_initializer()
#launch the graph
with tf.Session() as sess:
    # Variables must be initialised before the first training step.
    sess.run(init)
    #training cycle
    for epoch in range(training_epoch):
        # Integer division so that range() accepts it on Python 3 as well.
        total_batch = len(X_train_undersample) // batch_size
        avg_cost = 0
        #loop over all the batches
        for batch in range(total_batch):
            # Train on the balanced (under-sampled) set that total_batch is
            # derived from, not on the unbalanced X_train.
            batch_xs = X_train_undersample.iloc[(batch)*batch_size:(batch+1) *batch_size]
            batch_ys = Y_train_undersample.iloc[(batch)*batch_size:(batch+1) *batch_size]
            # run optimizer and cost operation
            _,c= sess.run([optimizer,cost],feed_dict={x:batch_xs,y:batch_ys})
            avg_cost += c/total_batch
        #display log per epoch step
        if (epoch+1) % display_step == 0:
            train_accuracy, newCost = sess.run([accuracy, cost], feed_dict={x: X_test,y: Y_test})
            # Single-argument print() behaves the same on Python 2 and 3.
            print("test_set_accuracy: %s" % (accuracy.eval({x:X_test_undersample,y:Y_test_undersample})*100))
            print("whole_set_accuracy: %s" % (accuracy.eval({x:X,y:Y})*100))
            # print train_accuracy
            # print "cost",newCost
    print('optimization finished.')
Things I've tried to figure out what's causing it:
Tried changing train dataset length.
Dropped some not needed fields.
Tried putting validation blocks.
Dataset :link
There can be multiple reasons of why it is overfitting , and as well there can be multiple ways to debug it and to fix it. Its hard to tell just from the code, because it also depends on the data, but here are some common reaons as well as fixes:
Too small dataset, adding more data its a common overfitting fix
Too complex model, if you have many features, or complex polonomial features, try to reducing complexity using feature selection
Add regularization: i dont see regularization in your code, try to add it.