Multiple linear regression with gradient descent - Python

Hello,
I'm new to machine learning and Python, and I want to predict prices on the Kaggle House Sales in King County dataset with my own gradient descent implementation.
I split the data 70% training (15k rows) / 30% testing (6k rows) and chose 5 of the 19 features, but there is a performance issue: the algorithm ran for more than 11 hours, used 100% of the memory, and failed to finish.
This is my Gradient Descent class:
# imports used by the snippet
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

class GradientDescent:
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    lr = 0
    max_iter = 0
    theta = 0

    def __init__(self, X_train, Y_train, X_test, Y_test, lr=0.01, max_iter=100):
        self.X_train = X_train
        self.Y_train = Y_train
        self.X_test = X_test
        self.Y_test = Y_test
        self.lr = lr
        self.max_iter = max_iter
        self.theta = np.random.randn(X_train.shape[1], 1)
        print(self.theta)

    def costFunction(self, theta, X, y):
        """Cost: 1/(2m) * sum((h0 - y)**2)"""
        m = len(y)
        y_pred = X.dot(theta)
        cost = (1/2*m) * np.sum(np.square(y_pred - y))
        return cost

    def estimate(self):
        m = len(self.Y_train)
        mse_hist = np.zeros(self.max_iter)
        #theta_hist = np.zeros(max_iter)
        i = 0
        while i < self.max_iter or mse_hist[i] > 0.01:
            y_pred = np.dot(self.X_train, self.theta)
            error = y_pred - self.Y_train
            self.theta = self.theta - (1/m)*self.lr*(self.X_train.T.dot(error))
            mse_hist[i] = self.costFunction(self.theta, self.X_train, self.Y_train)
            #print(mse_hist[i])
            i += 1
        return (self.theta, mse_hist)

    def test(self):
        res = pd.DataFrame()
        for i, row in self.X_test.iterrows():
            price_pred = np.dot(row.values, self.theta)
            res = row
            res['price_actual'] = self.Y_test[i]
            res['price_predict'] = price_pred
            res['r2_score'] = r2_score(res['price_actual'].values, res['price_predict'])
        res.to_csv('output.csv')
Any advice to make it better?

In general the code seems fine, though I haven't tested it. The only error I could find is that you may not be incrementing i inside your while loop, so the loop never exits.
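For illustration only (this is a sketch, not the original poster's code), one way to guarantee the loop terminates is to iterate with a bounded for loop and break early once the cost is small enough:

# Sketch of a bounded training loop; assumes the same attributes as in the
# question (self.X_train, self.Y_train, self.theta, self.lr, self.max_iter).
def estimate(self):
    m = len(self.Y_train)
    mse_hist = np.zeros(self.max_iter)
    for i in range(self.max_iter):      # never runs more than max_iter steps
        y_pred = np.dot(self.X_train, self.theta)
        error = y_pred - self.Y_train
        self.theta = self.theta - (self.lr / m) * self.X_train.T.dot(error)
        mse_hist[i] = self.costFunction(self.theta, self.X_train, self.Y_train)
        if mse_hist[i] < 0.01:          # optional early stop on a small cost
            mse_hist = mse_hist[:i + 1]
            break
    return (self.theta, mse_hist)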

Related

Why does the simplest neural network work worse than logistic regression?

I'm new to neural networks, and I'm trying to write my first network. Here's the data I'm trying to train it on:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1024, n_features=2, n_informative=2, n_redundant=0, n_classes=2, random_state=2)
[plot of the generated data]
Here is my network:
import numpy as np

def f(x):
    return 1/(1 + np.exp(-x))

def df(x):
    return f(x)*(1 - f(x))

def init():
    np.random.seed(42)
    W1 = np.random.rand(2, 2)
    W2 = np.random.rand(1, 2)
    return W1, W2

def go_forward(inp, W1, W2):
    sum = np.dot(W1, inp)
    out = np.array([f(x) for x in sum])
    sum_1 = np.dot(W2, out)
    out_1 = f(sum_1)
    y = 1 if out_1 > 0.5 else 0
    return (y, out)

def train_one_epoch(X_train, y_train, W2, W1, lr):
    err = []
    for k in range(len(X_train)):
        ind = np.random.randint(0, len(X_train))
        x, y = X_train[ind], y_train[ind]
        pred, out = go_forward(x, W1, W2)
        e = pred - y
        delta = e*df(pred)
        W2 = W2 - lr*delta*out
        delta2 = W2*delta*df(out)
        W1 = W1 - np.array(x)*delta2*lr
        err.append(e)
    return W1, W2, np.mean(np.array(err))

epoch = 100
lr = 0.01
err = []
W1, W2 = init()
# X_train, y_train presumably come from a train/test split of X, y
for e in range(epoch):
    W1, W2, e = train_one_epoch(X_train, y_train, W2, W1, lr)
    err.append(e)
The error graph is bothering me:
[plot of the training error over epochs]
Here is an accuracy comparison:
My network: 0.751953125
sklearn.linear_model.LogisticRegression: 0.974609375
[plot of the test data]
[plot of my network's results]
[plot of the results of sklearn.linear_model.LogisticRegression]
I'm at a loss as to why the network can't solve such a simple problem. I have tried changing the number of epochs, but it doesn't lead to anything.

When testing a model written from scratch, how do you predict the new outputs?

Would I use the weights and biases that were saved by the trained model?
Below is the code. Do I just call predict with the new test data, i.e. .predict(X_test), where X is the input array?
I am trying to understand the math and how testing works for neural networks, instead of using TensorFlow or PyTorch.
Thanks!
import numpy as np

class AdalineSGD:
    def __init__(self, eta=0.01, n_iter=10, random_state=None, shuffle=True):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state
        self.w_initialized = False
        self.shuffle = shuffle

    def fit(self, X, y):
        self._initialize_weights(X.shape[1])
        self.cost_ = []
        for _ in range(self.n_iter):
            if self.shuffle:
                X, y = self._shuffle(X, y)
            cost = []
            for xi, target in zip(X, y):
                cost.append(self._update_weights(xi, target))
            avg_cost = sum(cost) / len(y)
            self.cost_.append(avg_cost)
        return self

    def partial_fit(self, X, y):
        if not self.w_initialized:
            self._initialize_weights(X.shape[1])
        if y.ravel().shape[0] > 1:
            for xi, target in zip(X, y):
                self._update_weights(xi, target)
        else:
            self._update_weights(X, y)
        return self

    def _shuffle(self, X, y):
        r = self.rgen.permutation(len(y))
        return (X[r], y[r])

    def _initialize_weights(self, m):
        self.rgen = np.random.RandomState(self.random_state)
        self.w_ = self.rgen.normal(loc=0.0, scale=0.01, size=1 + m)
        self.w_initialized = True

    def _update_weights(self, xi, target):
        output = self.activation(self.net_input(xi))
        error = (target - output)
        self.w_[1:] += self.eta * xi.dot(error)
        self.w_[0] += self.eta * error
        cost = 0.5 * error**2
        return cost

    def net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def activation(self, X):
        return X

    def predict(self, X):
        return np.where(self.net_input(X) >= 0, 1, 0)
Would I use the weights and biases that were saved by the trained model?
Yes. This happens when you call .fit().
Do I just call predict with the new test data, i.e. .predict(X_test)?
Again, yes.
I am trying to understand the math and how testing works for neural networks.
What happens when we train a neural network? We are finding the weights that best approximate a function minimising the loss for our problem.
Once you have the right weights, which you get by fitting, you can simply pass an input through those weights and get the corresponding output.
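As a minimal usage sketch (the dataset, split, and hyperparameters here are illustrative assumptions, not from the original post), the fit-then-predict flow looks like this:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Toy 0/1-labelled data; any binary-labelled numeric data works the same way
X, y = make_classification(n_samples=200, n_features=4, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

model = AdalineSGD(eta=0.01, n_iter=15, random_state=1)
model.fit(X_train, y_train)        # learns self.w_ (the bias is w_[0])

y_pred = model.predict(X_test)     # applies the learned weights to unseen data
print("Test accuracy:", np.mean(y_pred == y_test))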

RBF neural net can't classify the MNIST dataset

I have implemented an RBF neural network classifier.
I use my implementation to classify the MNIST dataset, but it is not learning and always predicts a single class. I would be very grateful if someone could help me identify the problem with my implementation.
I should note that the implementation is quite slow because it works example by example, but I don't know how to make it work batch by batch. (I am new to TensorFlow and Python in general.)
My implementation is as follows:
import tensorflow as tf  # TF 1.x-style API (placeholders and sessions)

class RBF_NN:
    def __init__(self, M, K, L, lr):
        # Layer sizes
        self.M = M  # input layer size - number of features
        self.K = K  # RBF layer size
        self.L = L  # output layer size - number of classes

        x = tf.placeholder(tf.float32, shape=[M])
        matrix = tf.reshape(tf.tile(x, multiples=[K]), shape=[K, M])
        prototypes_input = tf.placeholder(tf.float32, shape=[K, M])
        prototypes = tf.Variable(prototypes_input)  # prototypes - representatives of the data
        r = tf.reduce_sum(tf.square(prototypes - matrix), 1)
        s = tf.Variable(tf.random.uniform(shape=[K], maxval=1))  # scaling factors
        h = tf.exp(-r/(2*tf.pow(s, 2)))
        W = tf.Variable(tf.random.uniform(shape=[K, L], maxval=1))
        b = tf.Variable(tf.constant(0.1, shape=[L]))
        o = tf.matmul(tf.transpose(tf.expand_dims(h, 1)), W) + b
        pred_class = tf.argmax(o, 1)
        y = tf.placeholder(shape=[L], dtype=tf.float32)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=o, labels=y))
        optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

        self.x = x
        self.prototypes_input = prototypes_input
        self.prototypes = prototypes
        self.r = r
        self.s = s
        self.h = h
        self.W = W
        self.b = b
        self.o = o
        self.y = y
        self.loss = loss
        self.optimizer = optimizer
        self.pred_class = pred_class

    def fit(self, X, y, prototypes, epoch_count, print_step, sess):
        for epoch in range(epoch_count):
            epoch_loss = 0
            for xi, yi in zip(X, y):
                iter_loss, _ = sess.run((self.loss, self.optimizer),
                                        feed_dict={self.x: xi, self.y: yi,
                                                   self.prototypes_input: prototypes})
                epoch_loss = epoch_loss + iter_loss
            epoch_loss = epoch_loss/len(X)
            if epoch % print_step == 0:
                print("Epoch loss", (epoch+1), ":", epoch_loss)

    def predict(self, x, sess):
        return sess.run((self.pred_class), feed_dict={self.x: x})[0]

    def get_prototypes(self, sess):
        return sess.run((self.prototypes))
Usage:
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# to_one_hot is a helper (defined elsewhere) that one-hot encodes the labels
y_train = to_one_hot(y_train, 10)
y_test = to_one_hot(y_test, 10)
x_train = np.asarray([np.asarray(x).reshape(-1) for x in x_train])
x_test = np.asarray([np.asarray(x).reshape(-1) for x in x_test])

M = 784
K = 1000
L = 10
lr = 0.01
rbfnn = RBF_NN(M, K, L, lr)

# Selecting prototypes from the train set
idx = np.random.randint(len(x_train), size=K)
prototypes = x_train[idx, :]

init = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init, feed_dict={rbfnn.prototypes_input: prototypes})

rbfnn.fit(x_train, y_train, prototypes, epoch_count=1, print_step=1, sess=sess)

y_test_p = []
for xi, yi in zip(x_test, y_test):
    yp = rbfnn.predict(xi, sess=sess)
    y_test_p.append(yp)

y_test_t = [np.argmax(yi) for yi in y_test]
acc = accuracy_score(y_test_t, y_test_p)
precc = precision_score(y_test_t, y_test_p, average='macro')
recall = recall_score(y_test_t, y_test_p, average='macro')
f1 = f1_score(y_test_t, y_test_p, average='macro')
print("Accuracy:", acc)
print("Precision:", precc)
print("Recall:", recall)
print("F1 score:", f1)
sess.close()
The implementation is fine. However, it seems to be very sensitive to the data.
It will start learning just fine if the following lines are added:
x_train = (x_train - x_train.min()) / (x_train.max() - x_train.min())
x_test = (x_test - x_test.min()) / (x_test.max() - x_test.min())
This normalizes the data so that every feature lies in the interval from 0 to 1.
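For context, these lines belong in the usage code right after the images are flattened and before the prototypes are selected; for MNIST pixel values this min-max scaling amounts to dividing by 255.0:

# In the usage code above, right after reshaping the images:
x_train = np.asarray([np.asarray(x).reshape(-1) for x in x_train])
x_test = np.asarray([np.asarray(x).reshape(-1) for x in x_test])

# Min-max normalization so every pixel value lies in [0, 1]
x_train = (x_train - x_train.min()) / (x_train.max() - x_train.min())
x_test = (x_test - x_test.min()) / (x_test.max() - x_test.min())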

LightGBM predicts the same value

I have one problem concerning lgb. When I write
lgb.train(.......)
it finishes in less than a millisecond (for a dataset of shape (10000, 25)), and when I call predict, all the output values are the same.
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler

train = pd.read_csv('data/train.csv', dtype=dtypes)  # dtypes is defined elsewhere
test = pd.read_csv('data/test.csv')
test.head()

X = train.iloc[:10000, 3:-1].values
y = train.iloc[:10000, -1].values

sc = StandardScaler()
X = sc.fit_transform(X)
#pca = PCA(0.95)
#X = pca.fit_transform(X)

d_train = lgb.Dataset(X, label=y)
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 10
params['min_data'] = 50
params['max_depth'] = 10

num_round = 10
clf = lgb.train(params, d_train, num_round, verbose_eval=1000)

X_test = sc.transform(test.iloc[:100, 3:].values)
pred = clf.predict(X_test, num_iteration=clf.best_iteration)
When I print pred, all the values are around 0.49.
It's my first time using the lightgbm module. Do I have an error in the code, or should I look for some mismatch in the dataset?
Your num_round is too small; the model just starts to learn and then stops. Other than that, make your verbose_eval smaller so you can see the results during training. My suggestion is to try the lgb.train call below:
clf = lgb.train(params, d_train, num_boost_round=5000, verbose_eval=10, early_stopping_rounds=3500)
Always use early_stopping_rounds, since the model should stop if there is no evident learning or if it starts to overfit.
Do not hesitate to ask more. Have fun.
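One caveat worth adding (an assumption about intent, not part of the original answer): LightGBM's early stopping needs at least one validation set to monitor, so the call above would typically be paired with a hold-out split, roughly like this:

from sklearn.model_selection import train_test_split

# Hold out part of the training data so early stopping has a metric to watch
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
d_train = lgb.Dataset(X_tr, label=y_tr)
d_val = lgb.Dataset(X_val, label=y_val, reference=d_train)

clf = lgb.train(params, d_train,
                num_boost_round=5000,
                valid_sets=[d_val],
                verbose_eval=10,             # older LightGBM API; newer versions use the
                early_stopping_rounds=100)   # lgb.log_evaluation / lgb.early_stopping callbacks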

Why does the cost function for an MLP flatten?

I am very new to machine learning and am trying to implement an MLP; however, the cost function seems to be reaching a local minimum before reaching the global minimum. I plotted the cost as a function of iteration (including a 0 value so as not to be fooled by where the y-axis starts). Here is the code from my attempt:
import numpy as np

class NNet(object):
    def __init__(self, n_in, n_hidden, n_out):
        self.n_in = n_in
        self.n_hidden = n_hidden
        self.n_out = n_out
        self.W1 = np.random.randn(n_in, n_hidden)
        self.W2 = np.random.randn(n_hidden, n_out)
        self.b1 = np.random.randn(n_hidden,)
        self.b2 = np.random.randn(n_out,)

    def sigmoid(self, z):
        return 1/(1 + np.exp(-z))

    def sig_prime(self, z):
        return (np.exp(-z))/((1 + np.exp(-z))**2)

    def propagate_forward(self, X):
        self.z1 = np.dot(self.W1.T, X) + self.b1
        self.a1 = self.sigmoid(self.z1)
        self.z2 = np.dot(self.W2.T, self.a1) + self.b2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

    def cost(self, y, y_hat):
        return np.mean([np.sum((y[i] - y_hat[i])**2) for i in range(y.shape[0])])/2

    def cost_grad(self, X, y):
        y_hat = self.propagate_forward(X)
        d2 = np.multiply(self.sig_prime(self.z2), -(y - y_hat))
        gJ_W2 = np.matrix(np.multiply(self.a1.T, d2))
        d1 = np.dot(self.W2, d2)*self.sig_prime(self.z1)
        gJ_W1 = np.dot(np.matrix(X).T, np.matrix(d1))
        return [gJ_W1, d1, gJ_W2, d2]
m = 1000
n = 2  # two features per sample: (r*cos(theta), r*sin(theta))
X = np.zeros((m, n))
y = np.zeros((m, 1))

import random
import math

i = 0
for r, theta in zip(np.linspace(0, 5, num=m), np.linspace(0, 8 * math.pi, num=m)):
    r += random.random()
    X[i] = [r * math.cos(theta), r * math.sin(theta)]
    if i < 333:
        y[i] = 0
    elif i < 666:
        y[i] = 1
    else:
        y[i] = 2
    i += 1

nnet = NNet(n, 5, 1)
learning_rate = 0.2
improvement_threshold = 0.995
cost = np.inf
xs = []
ys = []
iter = 0

while cost > 0.2:
    cost = nnet.cost(y, [nnet.propagate_forward(x_train) for x_train in X])
    if iter % 100 == 0:
        xs.append(iter)
        ys.append(cost)
        print("Cost", cost)
    if iter >= 1000:
        print("Gradient descent is taking too long, giving up.")
        break
    cost_grads = [nnet.cost_grad(x_train, y_train) for x_train, y_train in zip(X, y)]
    gW1 = [grad[0] for grad in cost_grads]
    gb1 = [grad[1] for grad in cost_grads]
    gW2 = [grad[2] for grad in cost_grads]
    gb2 = [grad[3] for grad in cost_grads]
    nnet.W1 -= np.mean(gW1, axis=0)/2 * learning_rate
    nnet.b1 -= np.mean(gb1, axis=0)/2 * learning_rate
    nnet.W2 -= np.mean(gW2, axis=0).T/2 * learning_rate
    nnet.b2 -= np.mean(gb2, axis=0)/2 * learning_rate
    iter += 1
Why is the cost not improving after a certain point? Any other tips are also highly appreciated.
The generated toy dataset looks like this:
[plot of the generated toy dataset]
Your goal seems to be to predict which class in {0, 1, 2} the data belongs to.
The output of your net is a sigmoid (sigm(x) in [0, 1]) and you are training with mean squared error (MSE), so it is impossible for the model to predict a value above 1. It is therefore always wrong when the class to predict is 2.
The cost probably flattens because your sigmoid unit saturates (when trying to predict 2), and the gradient of a saturating sigmoid is 0.
For classification, neural nets normally end with a softmax layer and are trained using cross-entropy.
If you want to keep using MSE and sigmoid units for classification, you should consider predicting only two classes at a time in a one-vs-one or one-vs-all fashion.
In any case, if you only do two-class classification by rounding the output to 0 or 1, it seems to work: the cost decreases and accuracy rises (with quickly modified code, not shown here).
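Purely as an illustration of the softmax + cross-entropy suggestion (this is not the answerer's modified code; shapes and names are assumptions), the output layer could be changed along these lines:

import numpy as np

def softmax(z):
    # Numerically stable softmax over the class scores
    z = z - np.max(z)
    e = np.exp(z)
    return e / np.sum(e)

def cross_entropy(y_onehot, p):
    # y_onehot: one-hot target vector, p: predicted class probabilities
    return -np.sum(y_onehot * np.log(p + 1e-12))

# Sketch of the changed forward pass, with 3 output units instead of one sigmoid:
#   z2 = np.dot(W2.T, a1) + b2       # shape (3,)
#   p = softmax(z2)                  # class probabilities
#   loss = cross_entropy(y_onehot, p)
# The output-layer error term then becomes (p - y_onehot), which replaces
# the MSE/sigmoid delta d2 in the backward pass.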
