I'm trying to code a neural network with 3 input nodes, a hidden layer with 4 nodes, and 1 output node. (This will change in the final version, so I've turned them into variables.) Even though I've compared my code to the tutorials, and I can't see anything wrong with it, when I try to run it, it gives me a ValueError saying that it can't dot-multiply these arrays.
As this is my first project using NumPy, I'm at a loss as to what I'm supposed to do.
Here's my code:
import numpy as np
import math
# neurons
n_in = 3
n_hidden = 4
n_out = 1
batchsize = 60
def sigmoid(x, deriv=False):
if deriv:
return x*(1-x)
return 1/(1+np.exp(-x))
def error(expected, actual):
rawError = expected - actual
for cell in rawError:
cell = cell * cell
return rawError
# input data
X = np.array([
[0, 0, 1],
[1, 1, 1],
[1, 0, 1],
[0, 1, 1]
])
# answer data
Y = np.array([0, 1, 1, 0]).T
np.random.seed(0)
# synapses
syn0 = 2 * np.random.random((n_in, n_hidden)) - 1
syn1 = 2 * np.random.random((n_hidden, n_out)) - 1
# train
for j in range(60000):
# feed forward to hidden
l1 = sigmoid(np.dot(X, syn0))
# feed forward to out
l2 = sigmoid(np.dot(l1, syn1))
# calculate error in new array
l2_error = error(Y, l2)
if j % 10000 == 9999:
print(np.sum(l2_error))
# gradient descent:
# multiply the error by the input, then the gradient of sigmoid
l2_nudge = l2_error * sigmoid(l2, deriv=True)
l1_nudge = l2_nudge.dot(syn1.T) * sigmoid(l1, deriv=True)
syn1 += l1.T.dot(l2_nudge)
syn0 += l0.T.dot(l1_nudge)
print(l2)
I expected the program to at least run, but it gives me the following error:
Traceback (most recent call last):
File "neural-network.py", line 68, in <module>
l1_nudge = l2_nudge.dot(syn1.T) * sigmoid(l1, deriv=True)
ValueError: shapes (4,4) and (1,4) not aligned: 4 (dim 1) != 1 (dim 0)
I found my issue: I was doing the dot product for l1_error backwards!
l1_error = np.dot(l2_nudge, syn1.T)
Related
While building some code to train a tensorflow deep model, I am using tensorflow tf.map_fn and tf.py_function as a wrapper to apply a scipy python function as a loss function mapping each 2 rows of a batch of 2 probability vectors p and q of shape [batch_size,num_classes]. When using KL_divergence over this batch of vectors (p,q), the training works fine with this computation and there is no shape incompatibility issue:
tf.reduce_sum(p*(tf.log(p + 1e-16) - tf.log(q + 1e-16)), axis=1) #KL divergence
However, when I tried to use Wasserstein distance or the energy_distance functions from scipy, I get an error dealing with incompatible shapes [] and [5000]. 5000 is here the number of classes (p and q of shape [batch_size, 5000])
import tensorflow as tf
def compute_kld(p_logit, q_logit, divergence_type):
p = tf.nn.softmax(p_logit)
q = tf.nn.softmax(q_logit)
if divergence_type == "KL_divergence":
return tf.reduce_sum(p*(tf.log(p + 1e-16) - tf.log(q + 1e-16)), axis=1)
elif divergence_type == "Wasserstein_distance":
def wasserstein_distance(x,y):
import scipy
from scipy import stats
return stats.wasserstein_distance(x,y)
#tf.function
def func(p,q):
return tf.map_fn(lambda x: tf.py_function(func=wasserstein_distance, inp=[x[0], x[1]], Tout=tf.float32), (p, q), dtype=(tf.float32)) #, parallel_iterations=10)
return func(p, q)
elif divergence_type == "energy_distance": # The Cramer Distancedef energy_distance(x,y):
def energy_distance(x,y):
import scipy
from scipy import stats
return stats.energy_distance(x,y)
#tf.function
def func(p,q):
return tf.map_fn(lambda x: tf.py_function(func=energy_distance, inp=[x[0], x[1]], Tout=tf.float32), (p, q), dtype=(tf.float32)) #, parallel_iterations=10)
return func(p, q)
This is the code to test the loss functions with a batch of 5 and 3 classes, which all work fine individually:
import tensorflow as tf
p = tf.constant([[1, 2, 3], [1, 2, 3], [14, 50, 61], [71, 83, 79], [110,171,12]])
q = tf.constant([[1, 2, 3], [1.2, 2.3, 3.2], [4.2, 5.3, 6.4], [7.5, 8.6, 9.4], [11.2,10.1,13]])
p = tf.reshape(p, [-1,3])
q = tf.reshape(q, [-1,3])
p = tf.cast(p, tf.float32)
q = tf.cast(q, tf.float32)
with tf.Session() as sess:
divergence_type = "KL_divergence"
res = compute_kld(p, q, divergence_type = divergence_type)
divergence_type = "Wasserstein_distance"
res2 = compute_kld(p, q, divergence_type = divergence_type)
divergence_type = "energy_distance"
res3 = compute_kld(p, q, divergence_type = divergence_type)
print("############################## p")
print(sess.run(tf.print(p)))
print("##")
print(sess.run(tf.print(tf.shape(p))))
print("############################## KL_divergence")
print(sess.run(tf.print(res)))
print("##")
print(sess.run(tf.print(tf.shape(res))))
print("############################## Wasserstein_distance")
print(sess.run(tf.print(res2)))
print("##")
print(sess.run(tf.print(tf.shape(res2))))
print("############################## energy_distance")
print(sess.run(tf.print(res3)))
print("##")
print(sess.run(tf.print(tf.shape(res3))))
This is the output:
############################## p
[[1 2 3]
[1 2 3]
[14 50 61]
[71 83 79]
[110 171 12]]
None
##
[5 3]
None
############################## KL_divergence
[0 0.000939823687 0.367009342 1.1647588 3.09911442]
None
##
[5]
None
############################## Wasserstein_distance
[0 0.0126344115 0.204870835 0.237718046 0.120362818]
None
##
[5]
None
############################## energy_distance
[0 0.0917765796 0.41313991 0.438246906 0.316672504]
None
##
[5]
None
However, when using the wasserstein distance or the energy distance inside my training code, I get incompatible shape error:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Tried to set a tensor with incompatible shape at a list index. Item element shape: [] list shape: [5000]
[[{{node gradients/TensorArrayV2Read/TensorListGetItem_grad/TensorListSetItem}}]]
I am wondering if the dtype for tf.map_fn or tf.py_function I am using is wrong or if I have to specify/impose shape somewhere ?
Here is a link for the whole code where I tried to replace KL-divergence with Wasserstein distance in method "compute_kld": https://github.com/shenyuanyuan/IMSAT/blob/master/imsat_cluster.py
Thank you in advance for your kind help!
== UPDATE ==
I inspected all the provided batches and the shapes of p and q seem correct
shape(p)
(?, 5000)
shape(q)
(?, 5000)
However, the type of func's returned object is . Thus, I have tried to reshape it with:
return tf.reshape(func(p, q), [p.shape[0]])
However, this doesn't seem to change anything as the error is still the same. After providing the first batch, the code crashes before starting to process the second batch.
Without seeing your training code, what I can help is to fetch the docs and try to shed some light.
map_fn Transforms elems by applying fn to each element unstacked on axis 0.
If elems is a tuple (or nested structure) of tensors, then those tensors must all have the same outer-dimension size (num_elems); and fn is used to transform each tuple (or structure) of corresponding slices from elems. E.g., if elems is a tuple (t1, t2, t3), then fn is used to transform each tuple of slices (t1[i], t2[i], t3[i]) (where 0 <= i < num_elems).
energy_distance Computes the energy distance between two 1D distributions.
wasserstein_distance Computes the first Wasserstein distance between two 1D distributions.
To begin, you hould make sure you are passing only 2D p_logit and q_logit to compute_kld.
I keep getting the error IndexError: tuple index out of range and i am not sure what is happening. My code was working just fine however when i restarted the jupyter notebook i started receiving this error.
this is my code:
X = df.Tweet
y = df.target
from sklearn import linear_model
import pyswarms as ps
# Create an instance of the classifier
classifier = linear_model.LogisticRegression()
# Define objective function
def f_per_particle(m, alpha):
total_features = X.shape[1]
# Get the subset of the features from the binary mask
if np.count_nonzero(m) == 0:
X_subset = X
else:
X_subset = X[:,m==1]
# Perform classification and store performance in P
classifier.fit(X_subset, y)
P = (classifier.predict(X_subset) == y).mean()
# Compute for the objective function
j = (alpha * (1.0 - P)
+ (1.0 - alpha) * (1 - (X_subset.shape[1] / total_features)))
return j
[some more code]
options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}
# Call instance of PSO
dimensions = X.shape[1] # dimensions should be the number of features
optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=dimensions, options=options)
# Perform optimization
cost, pos = optimizer.optimize(f, iters=1000)
i received the following traceback:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-76-bea8cf064cd2> in <module>
2
3 # Call instance of PSO
----> 4 dimensions = X.shape[1] # dimensions should be the number of features
5 optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=dimensions, options=options)
6
IndexError: tuple index out of range
It is not absolutely clear, but it seems to me that your df variable might be a Pandas dataframe, and your df.Tweet may be a Pandas series.
In that case, being a series, your X will have only one dimension (so, only the first element of the tuple X.shape, X.shape[0]), instead of two dimensions - reason for the index out of range exception in your code. The two dimensions case occurs only when the variable is a dataframe.
More information: https://www.google.com/amp/s/www.geeksforgeeks.org/python-pandas-series-shape/amp/
I am following a book which has the following code:
import numpy as np
np.random.seed(1)
streetlights = np.array([[1, 0, 1], [0, 1, 1], [0, 0, 1], [1, 1, 1]])
walk_vs_stop = np.array([[1, 1, 0, 0]]).T
def relu(x):
return (x > 0) * x
def relu2deriv(output):
return output > 0
alpha = 0.2
hidden_layer_size = 4
# random weights from the first layer to the second
weights_0_1 = 2*np.random.random((3, hidden_layer_size)) -1
# random weights from the second layer to the output
weights_1_2 = 2*np.random.random((hidden_layer_size, 1)) -1
for iteration in range(60):
layer_2_error = 0
for i in range(len(streetlights)):
layer_0 = streetlights[i : i + 1]
layer_1 = relu(np.dot(layer_0, weights_0_1))
layer_2 = relu(np.dot(layer_1, weights_1_2))
layer_2_error += np.sum((layer_2 - walk_vs_stop[i : i + 1])) ** 2
layer_2_delta = layer_2 - walk_vs_stop[i : i + 1]
layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)
weights_1_2 -= alpha * layer_1.T.dot(layer_2_delta)
weights_0_1 -= alpha * layer_0.T.dot(layer_1_delta)
if iteration % 10 == 9:
print(f"Error: {layer_2_error}")
Which outputs:
# Error: 0.6342311598444467
# Error: 0.35838407676317513
# Error: 0.0830183113303298
# Error: 0.006467054957103705
# Error: 0.0003292669000750734
# Error: 1.5055622665134859e-05
I understand everything but this part is not explained and I am not sure why it is the way it is:
weights_0_1 = 2*np.random.random((3, hidden_layer_size)) -1
weights_1_2 = 2*np.random.random((hidden_layer_size, 1)) -1
I don't understand:
Why there is 2* the whole matrix and why is there a -1
If I change 2 to 3 my error becomes greatly lower # Error: 5.616513576418916e-13
I tried changing the 2 to many other numbers along with the change of -1 to many other numbers I get # Error: 2.0 most of the time or the Error is much worst than combination of 3 and -1.
I can't seem to grasp the relationship and the purpose of multiplying the random weights by a number and subracting a number afterwards.
P.S. The idea of the network is to understand a streetlight pattern when people should go and when they should stop depending what combination of the lights in streetlight is on / off.
There is a lot of ways to initialize neural network, and it's a current research subject as it can have a great impact on performance and training time. Some rules of thumb :
avoid having only one value for all weights, as they would all update the same
avoid having too large weights that could make your gradient too high
avoid having too small weights that could make your gradient vanish
In your case, the goal is just to have something between [-1;1] :
np.random.random gives you a float in [0;1]
multiply by 2 gives you something in [0;2]
substract 1 gives you a number in [-1;1]
2*np.random.random((3, 4)) -1 is a way to generated 3*4=12 random number from uniform distribution of half-open interval [-1, +1) i.e including -1 but excluding +1.
This is equivalent to more readable code
np.random.uniform(-1, 1, (3, 4))
I've been working on a simple neural network.
It takes in a data set with 3 columns, if the first column's value is a 1, then the output should be a 1.
I've provided comments so it is easier to follow.
Code is as follows:
import numpy as np
import random
def sigmoid_derivative(x):
return x * (1 - x)
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def think(weights, inputs):
sum = (weights[0] * inputs[0]) + (weights[1] * inputs[1]) + (weights[2] * inputs[2])
return sigmoid(sum)
if __name__ == "__main__":
# Assign random weights
weights = [-0.165, 0.440, -0.867]
# Training data for the network.
training_data = [
[0, 0, 1],
[1, 1, 1],
[1, 0, 1],
[0, 1, 1]
]
# The answers correspond to the training_data by place,
# so first element of training_answers is the answer to the first element of training_data
# NOTE: The pattern is if there's a 1 in the first place, the result should be a one
training_answers = [0, 1, 1, 0]
# Train the neural network
for iteration in range(50000):
# Pick a random piece of training_data
selected = random.randint(0, 3)
training_output = think(weights, training_data[selected])
# Calculate the error
error = training_output - training_answers[selected]
# Calculate the adjustments that need to be applied to the weights
adjustments = np.dot(training_data[selected], error * sigmoid_derivative(training_output))
# Apply adjustments, maybe something wrong is going here?
weights += adjustments
print("The Neural Network has been trained!")
# Result of print below should be close to 1
print(think(weights, [1, 0, 0]))
The result of the last print should be close to 1, however it is not?
I have a feeling that I'm not adjusting the weights correctly.
I want to use scipy.signal.fftconvolve in Tensorflow/Keras, is there any way to do that?
Right now I am using the following code :
window = np.tile(window, (1, 1, 1, 3))
tf.nn.conv2d(img1, window, strides=[1,1,1,1], padding='VALID')
Are these lines equivalent to :
signal.fftconvolve(img1, window, mode='valid')
Implementation
FFT convolution can be relatively easily implemented in tensorflow. The following follows scipy.signal.fftconvolve quite strictly
import tensorflow as tf
def _centered(arr, newshape):
# Return the center newshape portion of the array.
currshape = tf.shape(arr)[-2:]
startind = (currshape - newshape) // 2
endind = startind + newshape
return arr[..., startind[0]:endind[0], startind[1]:endind[1]]
def fftconv(in1, in2, mode="full"):
# Reorder channels to come second (needed for fft)
in1 = tf.transpose(in1, perm=[0, 3, 1, 2])
in2 = tf.transpose(in2, perm=[0, 3, 1, 2])
# Extract shapes
s1 = tf.convert_to_tensor(tf.shape(in1)[-2:])
s2 = tf.convert_to_tensor(tf.shape(in2)[-2:])
shape = s1 + s2 - 1
# Compute convolution in fourier space
sp1 = tf.spectral.rfft2d(in1, shape)
sp2 = tf.spectral.rfft2d(in2, shape)
ret = tf.spectral.irfft2d(sp1 * sp2, shape)
# Crop according to mode
if mode == "full":
cropped = ret
elif mode == "same":
cropped = _centered(ret, s1)
elif mode == "valid":
cropped = _centered(ret, s1 - s2 + 1)
else:
raise ValueError("Acceptable mode flags are 'valid',"
" 'same', or 'full'.")
# Reorder channels to last
result = tf.transpose(cropped, perm=[0, 2, 3, 1])
return result
Example
A quick example of applying a gaussian smoothing with width 20 pixels to the standard "face" image is as follows:
if __name__ == '__main__':
from scipy import misc
import matplotlib.pyplot as plt
from tensorflow.python.ops import array_ops, math_ops
session = tf.InteractiveSession()
# Create gaussian
std = 20
grid_x, grid_y = array_ops.meshgrid(math_ops.range(3 * std),
math_ops.range(3 * std))
grid_x = tf.cast(grid_x[None, ..., None], 'float32')
grid_y = tf.cast(grid_y[None, ..., None], 'float32')
gaussian = tf.exp(-((grid_x - 1.5 * std) ** 2 + (grid_y - 1.5 * std) ** 2) / std ** 2)
gaussian = gaussian / tf.reduce_sum(gaussian)
face = misc.face(gray=False)[None, ...].astype('float32')
# Apply convolution
result = fftconv(face, gaussian, 'same')
result_r = session.run(result)
# Show results
plt.figure('face')
plt.imshow(face[0, ...] / 256.0)
plt.figure('convolved')
plt.imshow(result_r[0, ...] / 256.0)
You want just a regular conv2d then...
If you want it somewhere in the model, add a Conv2D(...,name='myLayer') layer, and in the model use model.get_layer('myLayer').set_weights([filters,biases])
If you want it in a loss function, just create a loss function:
import keras.backend as K
def myLoss(y_true, y_pred):
#where y_true is the true training data and y_pred is the model's output
convResult = K.conv2d(y_pred, kernel = window, padding = 'same')
anotherResult = K.depthwise_conv2d(y_pred, kernel = window, padding='same')
The regular conv2D will assume each output channel in the filter will process and sum all input channels.
The depthwise convolution will keep input channels separate.
Beware of the window, though. I don't know the format in tensorflow or scipy, but the kernel in keras should have this shape: (height, width, numberOfInputChannels, numberOfOutputChannels)
I believe, if I understand it right, it should be window = np.reshape(_FSpecialGauss(size, sigma), (size, size, 1, 1)), considering that "size" is the size of the kernel and you have only 1 input and output channels.
I used padding='same' to get the result image the same size of the input. If you use padding='valid', you will lose the borders (although in your case, your filter seems to have size (1,1), which won't remove borders).
You can use any tensorflow function inside the loss function as well:
def customLoss(yTrue,yPred):
tf.anyFunction(yTrue)
tf.anyFunction(yPred)
Using keras backend will let your code be portable to other backends later.
When compiling the model, give it your loss function:
model.compile(loss=myLoss, optimizer =....)