Need help in optimising moving window entropy calculations - python

I am trying to calculate the entropy of 3D patches with sliding windows from a larger 3D array, but I can't seem to find a way of optimising the code to run at any reasonable speed.
My current working approach uses nested for loops, taking each coordinate of the larger array and calculating the entropy of the patch with that coordinate as its starting point.
I'd really like to be able to run this operation in parallel, but I can't get pool.apply() working. Is it possible to run this in parallel?
Here is my working code:
def get_entropy_of_block(data):
    value, counts = np.unique(data, return_counts=True)
    entropy_of_block = entropy(value, counts)
    if np.isnan(entropy_of_block):
        entropy_of_block = 0
    return entropy_of_block

def output_entropy_versions(mask, window=5):
    mask = np.pad(mask, (0, window - 2), 'edge')
    blocks = view_as_windows(mask, (window, window, window), step=1)
    entropy_mask = np.zeros(shape=(blocks.shape[0], blocks.shape[1], blocks.shape[2]))
    for x in range(blocks.shape[0]):
        print(x)
        for y in range(blocks.shape[1]):
            for z in range(blocks.shape[2]):
                entropy_mask[x, y, z] = get_entropy_of_block(blocks[x, y, z, :, :])
    return entropy_mask
And here is the parallel attempt:
def output_entropy_versions_parallel(mask, window=5):
    mask = np.pad(mask, (0, window - 2), 'edge')
    blocks = view_as_windows(mask, (window, window, window), step=1)
    entropy_mask = np.zeros(shape=(blocks.shape[0], blocks.shape[1], blocks.shape[2]))
    for x in range(blocks.shape[0]):
        print(x)
        for y in range(blocks.shape[1]):
            res = [pool.apply(get_entropy_of_block, args=(blocks[x, y, z, :, :])) for z in range(blocks.shape[2])]
            entropy_mask[x, y, :] = res
    return entropy_mask
Running this I get the following:
<ipython-input-10-8c3d4ca9d313> in output_entropy_versions(mask, window)
24 print(x)
25 for y in range(blocks.shape[1]):
---> 26 res = [pool.apply(get_entropy_of_block, args = (blocks[x,y,z,:,:])) for z in range(blocks.shape[2])]
27 entropy_mask[x,y,:] = res
28 return entropy_mask
> <ipython-input-10-8c3d4ca9d313> in get_entropy_of_block(data)
10 def get_entropy_of_block(data):
11 value,counts = np.unique(data, return_counts=True)
---> 12 entropy_of_block = entropy(value, counts)
13 if np.isnan(entropy_of_block):
14 entropy_of_block = 0
E:\Anaconda\lib\site-packages\scipy\stats\_distn_infrastructure.py in entropy(pk, qk, base)
2505 """
2506 pk = asarray(pk)
-> 2507 pk = 1.0*pk / np.sum(pk, axis=0)
2508 if qk is None:
2509 vec = entr(pk)
TypeError: unsupported operand type(s) for *: 'float' and 'generator'
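For reference, two things stand out here (my reading, a sketch rather than a confirmed fix). First, scipy.stats.entropy(value, counts) computes the relative entropy between the unique values and their counts; for the Shannon entropy of a patch you likely want entropy(counts) alone, since entropy() normalises its first argument itself. Second, pool.apply expects args to be a tuple (note the trailing comma), and apply blocks on every call anyway, so pool.map over the flattened windows is the usual parallel form:

import numpy as np
from multiprocessing import Pool
from scipy.stats import entropy
from skimage.util import view_as_windows

def get_entropy_of_block(block):
    # Shannon entropy of the patch; entropy() normalises the counts itself
    _, counts = np.unique(block, return_counts=True)
    return entropy(counts)

def output_entropy_parallel(mask, window=5, processes=4):
    mask = np.pad(mask, (0, window - 2), 'edge')
    blocks = view_as_windows(mask, (window, window, window), step=1)
    out_shape = blocks.shape[:3]
    # flatten the window grid so each worker receives one patch
    flat = blocks.reshape(-1, window, window, window)
    with Pool(processes) as pool:
        res = pool.map(get_entropy_of_block, flat)
    return np.asarray(res).reshape(out_shape)

Whether this is actually faster depends on the patch size: each patch is pickled over to a worker, so for small windows the serialisation overhead can dominate the entropy computation itself.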

Related

Tensorflow Gradient tape 'unknown value for unconnected gradients'

I'm trying to understand why I'm getting an error when using gradient tape to take the derivative of a function. I'm trying to take the derivative of Power with respect to T, defined in the code below:
import tensorflow as tf
import numpy as np
from scipy.fft import fft, fftfreq, fftn
import tensorflow.python.ops.numpy_ops.np_config as np_config
np_config.enable_numpy_behavior()

##### Initialize Values ######
s1 = np.array([[0, 1, 0],
               [1, 0, 1],
               [0, 1, 0]])
s2 = np.array([[0, -1j, 0],
               [1j, 0, -1j],
               [0, 1j, 0]])
s3 = np.array([[1, 0, 0],
               [0, 0, 0],
               [0, 0, -1]])
spin1 = (1/np.sqrt(2))*s1
spin2 = (1/np.sqrt(2))*s2
spin3 = (1/np.sqrt(2))*s3
spin1 = tf.constant(spin1)
spin2 = tf.constant(spin2)
spin3 = tf.constant(spin3)
a = tf.constant(1.0)
b = tf.constant(1.0)
c = tf.constant(1.0)
d = tf.constant(1.0)
v = tf.constant(1.0)  # ~N(0,sigma_v)
w = tf.constant(1.0)  # ~N(0,sigma_w)
c0_0 = tf.complex(tf.constant(1.0), tf.constant(0.0))
c1_0 = tf.complex(tf.constant(1.0), tf.constant(0.0))

###### Define Functions ########
def getDE(T):
    D = a*T + b + v
    E = c*T + d + w
    return D, E

def H(D, E):
    return D*(spin3**2 - 2/3) + E*(spin1**2 - spin2**2)

def psi(t, eigenvalues, eigenvec1, eigenvec2):
    c_0 = np.array(np.exp(-1j*(eigenvalues[0])*t)*c0_0)
    c_0.shape = (N, 1)
    c_1 = np.array(np.exp(-1j*(eigenvalues[1])*t)*c1_0)
    c_1.shape = (N, 1)
    return c_0*(eigenvec1.T) + c_1*(eigenvec2.T)

def forward(T):
    T = tf.Variable(T)
    with tf.GradientTape() as tape:
        D, E = getDE(T)
        H_tf = H(D, E)
        eigenvalues, eigenstates = tf.linalg.eig(H_tf)
        eigenvec1 = eigenstates[:, 0]
        eigenvec2 = eigenstates[:, 1]
        wave = psi(t, eigenvalues, eigenvec1, eigenvec2)
        a = np.abs(tf.signal.fft2d(wave))**2
        Power = np.full([100, 1], None)
        for i in range(N):
            Power[i, :] = a[i, :].conj().T @ a[i, :]
    return tape.gradient(Power, T)
Could someone tell me whether I'm doing this correctly, or whether there is a better way to do it? I am not very familiar with automatic differentiation in Python.
In the forward function, taking the derivative of wave with respect to T seems to work, but as soon as I do the FFT I get the following error:
WARNING:tensorflow:The dtype of the target tensor must be floating (e.g. tf.float32) when calling GradientTape.gradient, got dtype('O')
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_352/3452884380.py in <module>
----> 1 T_hat = forward(17.0)
2 print(T_hat)
~\AppData\Local\Temp/ipykernel_352/2053063608.py in forward(T)
13 Power[i,:] = a[i,:].conj().T @ a[i,:]
14
---> 15 return tape.gradient(Power,T)
~\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\eager\backprop.py in
gradient(self, target, sources, output_gradients, unconnected_gradients)
1072 for x in nest.flatten(output_gradients)]
1073
-> 1074 flat_grad = imperative_grad.imperative_grad(
1075 self._tape,
1076 flat_targets,
~\anaconda3\envs\tensorflow-gpu\lib\site-
packages\tensorflow\python\eager\imperative_grad.py in imperative_grad(tape, target,
sources, output_gradients, sources_raw, unconnected_gradients)
69 "Unknown value for unconnected_gradients: %r" % unconnected_gradients)
70
---> 71 return pywrap_tfe.TFE_Py_TapeGradient(
72 tape._tape, # pylint: disable=protected-access
73 target,
AttributeError: 'numpy.ndarray' object has no attribute '_id'
I hope you have already found an answer to your question, but if you haven't, maybe this will shed some light.
The problem you are seeing is that TensorFlow can't calculate the gradient of the overall forward function, so I would recommend you stop using NumPy methods.
As far as I can see, you can replace all of those NumPy methods with TensorFlow equivalents.
For example:
To calculate the magnitude of complex tensor
magnitude = tf.math.abs(complex_tensor)
To use the complex exponential
complex_tensor = tf.math.exp(tf.complex(0.0, -1.0)*tf.cast(phase, "complex64"))
To extract elements on a given dimension
elm1, elm2 = tf.unstack(x, num=2, axis = -1)
To calculate conjugate of a complex tensor
a_conj = tf.math.conj(a)
To transpose or permute the tensor dimensions
x_T = tf.transpose(x, perm = [1, 0])
To summarize: stop using NumPy methods and find the TensorFlow alternatives; that will solve your problem.
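As a concrete illustration of that advice (my own sketch, assuming wave is the complex tensor built in the question), the NumPy loop that fills Power can be expressed entirely in TensorFlow ops, so the tape can trace it and the gradient target is a real tensor rather than a dtype('O') NumPy array:

import tensorflow as tf

def power_tf(wave):
    # every op below is a TensorFlow op, so GradientTape can trace it
    spectrum = tf.signal.fft2d(wave)      # complex tensor
    a = tf.math.abs(spectrum) ** 2        # |FFT|^2, real dtype
    # row-wise inner product a[i,:] . a[i,:], replacing the np.full loop
    return tf.reduce_sum(a * a, axis=1)

With Power computed this way inside the tape context, tape.gradient(Power, T) receives a real, traced tensor, which avoids both the floating-dtype warning and the 'numpy.ndarray' object has no attribute '_id' error.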

Keras CNN training: Cannot use nested list as Input

(Edited to include dataset and model code)
I'm training a Keras CNN on a 2D matrix. I'm creating my own training dataset, in which each matrix cell has the shape [[list], int]. The cell's first item is a class label that I convert to a one-hot list (using tf.keras.utils.to_categorical):
cell[0] = to_categorical(rnd_type-1, num_classes=num_types)
The second is a simple int:
cell[1] = random.randint(0, max_val)
The dataset creation function:
def make_data(num_of_samples, num_types, max_height, grid_x, grid_y):
    grids_list = []
    target_list = []
    target = 0
    for _ in range(num_of_samples):
        # create empty grid
        grid = [[[[], 0] for i in range(grid_y)] for j in range(grid_x)]
        for i in range(grid_x):
            for j in range(grid_y):
                # get random class and convert to cat list
                rnd_type = random.randint(0, num_types)
                cat = to_categorical(rnd_type-1, num_classes=num_types)
                # get random height
                rnd_height = random.randint(0, max_height)
                # inject the two values into the cell
                grid[i][j] = [cat, rnd_height]
                # get some target value
                target += rnd_type * 5 + random.random()*5
        target_list.append(target)
        grids_list.append(grid)
    # make np arrays out of the lists
    t = np.array(target_list)
    g = np.array(grids_list)
    return t, g
My model is created using model = models.create_cnn(grid_size, grid_size, 2, regress=True), in which (I assumed) the Input depth is 2.
The model creation code:
num_types = 20
max_height = 50
num_of_samples = 10
grid_size = 10
epochs = 5000
# get n results of X x Y grid with target
targets_list, grids_list = datasets.make_data(
    num_of_samples, num_types, max_height, grid_size, grid_size)
split = train_test_split(targets_list, grids_list,
                         test_size=0.25, random_state=42)
(train_attr_X, test_attr_X, train_grids_X, test_grids_X) = split
# find the largest value in the training set and use it to
# scale values to the range [0, 1]
max_target = train_attr_X.max()
train_attr_Y = train_attr_X / max_target
test_attr_Y = test_attr_X / max_target
model = models.create_cnn(grid_size, grid_size, 2, regress=True)
However, I cannot train it; it fails with this error: ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
Answering my own question:
The model can only accept an int as depth, so each cell must be a flat list of ints rather than a nested 2D structure. The way to merge the class data with the continuous rnd_height field is therefore:
cat = to_categorical(rnd_type-1, num_classes=num_types)
cell = np.append(cat, [rnd_height])
This way, the rnd_height value is appended to the cat list, giving one flat vector per cell.
The whole dataset function now looks like this:
def make_data(num_of_samples, num_types, max_height, grid_x, grid_y):
    grids_list = []
    target_list = []
    target = 0
    for _ in range(num_of_samples):
        grid = [[[False, False] for i in range(grid_y)] for j in range(grid_x)]
        for i in range(grid_x):
            for j in range(grid_y):
                rnd_type = random.randint(0, num_types)
                cat = to_categorical(rnd_type-1, num_classes=num_types)
                rnd_height = random.randint(0, max_height)
                cell = np.append(cat, [rnd_height])
                grid[i][j] = cell
                # simulate simple objective function
                if rnd_type < num_types/5:
                    target += rnd_height * 5
        target_list.append(target)
        grids_list.append(grid)
    t = np.array(target_list)
    g = np.array(grids_list)
    # return grids and targets
    return g, t
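As a quick sanity check (my own sketch, using the constants from the model-creation code above), each cell is now a flat vector of length num_types + 1, so the grid array is 4D and its last axis can serve as the channel depth:

g, t = make_data(num_of_samples=10, num_types=20, max_height=50, grid_x=10, grid_y=10)
print(g.shape)  # expected: (10, 10, 10, 21), i.e. samples x grid_x x grid_y x (num_types + 1)
print(t.shape)  # expected: (10,)

Note that the depth argument to models.create_cnn should then match g.shape[-1] (num_types + 1), not the original 2.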

Why is my code generating a scalar array error despite providing reshape array command?

I am a new Python learner. While writing code from an online course, I am getting an error related to an array. I have reviewed the code multiple times but remain unable to find the error.
Here is the code:
# imports the snippet relies on
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

boston_dataset = load_boston()
data = pd.DataFrame(data=boston_dataset.data, columns=boston_dataset.feature_names)
features = data.drop(['INDUS', 'AGE'], axis=1)
log_prices = np.log(boston_dataset.target)
target = pd.DataFrame(log_prices, columns=['PRICE'])
property_stats = features.mean().values.reshape(1, 11)
regr = LinearRegression().fit(features, target)
regr.predict(features)
fitted_vals = regr.predict(features)
MSE = mean_squared_error(target, fitted_vals)
RMSE = np.sqrt(MSE)

CRIME_IDX = 0
ZN_IDX = 1
CHAS_IDX = 2
NOX_IDX = 3
RM_IDX = 4
DIS_IDX = 5
RAD_IDX = 6
TAX_IDX = 7
PTRATIO_IDX = 8
B_IDX = 9
LSTAT_IDX = 10

def get_log_estimate(nr_rooms,
                     students_per_classroom,
                     next_to_river=False,
                     high_confidence=True):
    property_stats[0][RM_IDX] = nr_rooms
    property_stats[0][PTRATIO_IDX] = students_per_classroom
    log_estimate = regr.predict(property_stats[0][0])
    if next_to_river:
        property_stats[0][CHAS_IDX] = 1
    else:
        property_stats[0][CHAS_IDX] = 0
    if high_confidence:
        upper_bound = log_estimate + 2*RMSE
        lower_bound = log_estimate - 2*RMSE
        interval = 95
    else:
        upper_bound = log_estimate + RMSE
        lower_bound = log_estimate - RMSE
        interval = 68
    return log_estimate, upper_bound, lower_bound, interval
While running these lines of code, I am getting this error:
ValueError: Expected 2D array, got scalar array instead:
array=3.6135235573122535.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
I called the line of code below:
get_log_estimate(5, 20)
But I am still getting the same error.
I am also new to data science, so I don't know the answer for sure. But at the line regr.predict(features) in your code, I would suggest you try this: regr.predict([features])
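For context, the traceback actually points at the predict call inside get_log_estimate: property_stats[0][0] is a single scalar (the mean of the CRIM column, 3.6135...), and scikit-learn's predict requires a 2D array. A minimal sketch of the likely fix, using the names from the question:

# property_stats already has shape (1, 11), so pass the whole row
log_estimate = regr.predict(property_stats)

# or, following the error message's own hint for a single sample
log_estimate = regr.predict(property_stats[0].reshape(1, -1))

Note that the CHAS assignment should also happen before the predict call if next_to_river is meant to affect the estimate.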

While loop repeat same output when it should be different

I don't understand why the following code outputs the same random values for simulated_returns_pr from the SECOND iteration onwards (the same goes for the two charts from the function). I removed some code, but all the subsequent variables that should differ are also identical from the SECOND iteration. I am missing something but can't see what. Any contribution would be appreciated.
My code:
logR = timeseries
i = 1
while i < 5:
    simulated_returns_pr = np.random.normal(loc=mean(logR)*30, scale=stdev(logR)*np.sqrt(30.), size=30)
    seed = 2
    N = 30

    def Brownian(seed, N):
        np.random.seed(seed)
        dt = 1./N  # time step
        b = simulated_returns_pr*np.sqrt(dt)
        W = np.cumsum(b)  # brownian path
        return W, b

    b = Brownian(seed, N)[1]
    W = Brownian(seed, N)[0]
    W = np.insert(W, 0, 0.)
    plt.rcParams['figure.figsize'] = (10, 8)
    xb = np.linspace(1, len(b), len(b))
    plt.plot(xb, b)
    plt.title('Brownian Increments')
    plt.show()
    xw = np.linspace(1, len(W), len(W))
    plt.plot(xw, W)
    plt.title('Brownian Motion')
    plt.show()
    i += 1
Output simulated_returns_pr:
[ 0.012191 1.16322303 -0.23225735 -0.12357125 0.35687974 1.02187274
0.25248517 0.74665974 0.54373161 0.43677913 0.69960184 -0.81226681
0.50380517 -0.25108897 0.47459444 0.49541601 0.79958083 -0.20233765
0.5142276 -0.31340253 0.46332258 0.48350956 0.06662023 0.53800548
-0.01440759 -0.23280276 -0.07377719 -0.29948791 0.15798112 0.10707121]
[-0.10796927 0.07350919 -0.97356921 0.9275805 -0.80101665 -0.32191758
0.35499571 -0.52506813 -0.43075947 -0.35577774 0.37944815 1.25577886
0.12274682 -0.4609512 0.37320789 -0.19828379 0.09220437 0.69335439
-0.27465829 0.10637854 -0.3402222 0.02308293 0.2309978 -0.3959363
-0.06873477 -0.01706476 -0.21917336 -0.49603296 -0.61363441 0.02456247]
[-0.10796927 0.07350919 -0.97356921 0.9275805 -0.80101665 -0.32191758
0.35499571 -0.52506813 -0.43075947 -0.35577774 0.37944815 1.25577886
0.12274682 -0.4609512 0.37320789 -0.19828379 0.09220437 0.69335439
-0.27465829 0.10637854 -0.3402222 0.02308293 0.2309978 -0.3959363
-0.06873477 -0.01706476 -0.21917336 -0.49603296 -0.61363441 0.02456247]
[-0.10796927 0.07350919 -0.97356921 0.9275805 -0.80101665 -0.32191758
0.35499571 -0.52506813 -0.43075947 -0.35577774 0.37944815 1.25577886
0.12274682 -0.4609512 0.37320789 -0.19828379 0.09220437 0.69335439
-0.27465829 0.10637854 -0.3402222 0.02308293 0.2309978 -0.3959363
-0.06873477 -0.01706476 -0.21917336 -0.49603296 -0.61363441 0.02456247]
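What jumps out (my diagnosis, offered as a sketch rather than an accepted answer): Brownian calls np.random.seed(seed) with the fixed seed = 2, which resets NumPy's global random state on every pass through the loop. The first iteration draws simulated_returns_pr before any seeding, but every later iteration draws from a freshly re-seeded generator, which is exactly why the output repeats from the SECOND loop onwards. The usual fix is to stop reseeding the global state inside the loop; here the seed call can simply be removed, since Brownian draws no random numbers itself:

import numpy as np

def Brownian(increments, N):
    # no np.random.seed here: reseeding the global RNG inside the loop
    # is what made every later np.random.normal draw identical
    dt = 1. / N                    # time step
    b = increments * np.sqrt(dt)   # scaled increments
    W = np.cumsum(b)               # brownian path
    return W, b

# inside the while loop, each iteration now gets fresh draws:
# simulated_returns_pr = np.random.normal(loc=..., scale=..., size=30)
# W, b = Brownian(simulated_returns_pr, N)

(The increments parameter is my rename; the question's version read simulated_returns_pr from the enclosing scope.)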

Where am I getting the 'Mean of Empty Slice' error and how do I fix it?

I've got a Python program that runs the K-Means algorithm on a set of data. It is for a homework assignment and has to be built without using the built-in KMeans from sklearn.
Basically, I set a value for qty, which sets the number of centroids/clusters. It then creates random x and y points to use as my centroids. Sometimes it runs without error, and other times it gives me this:
Warning (from warnings module):
File "C:\Python27\lib\site-packages\numpy\core\_methods.py", line 59
warnings.warn("Mean of empty slice.", RuntimeWarning)
RuntimeWarning: Mean of empty slice.
Warning (from warnings module):
File "C:\Python27\lib\site-packages\numpy\core\_methods.py", line 68
ret, rcount, out=ret, casting='unsafe', subok=False)
RuntimeWarning: invalid value encountered in true_divide
Here is my code:
import numpy as np
from pprint import pprint
import random
import sys

dataPoints = np.array([[2,4],[17,4],[45,2],[45,7],[16,32],[32,14],[20,56],[68,33],[54,36],[3,54],[23,5],[56,23],[10,81],[64,15],[23,18],[22,15],[35,19],[66,19],[1,99]])
rangeX = (0, 100)
rangeY = (0, 100)
qty = 5

randomCentroids = []
i = 0
while i < qty:
    x = random.randrange(*rangeX)
    y = random.randrange(*rangeY)
    randomCentroids.append((x, y))
    i += 1
centroids = np.asarray(randomCentroids)

def size(vector):
    return np.sqrt(sum(x**2 for x in vector))

def distance(vector1, vector2):
    return size(vector1 - vector2)

def distances(array1, array2):
    ConvergenceCounter = 1
    keepGoing = True
    StartingCentroids = np.copy(centroids)
    while keepGoing:
        # --------------Find the new means---------#
        t0 = StartingCentroids[None, :, :] - dataPoints[:, None, :]
        t1 = np.linalg.norm(t0, axis=-1)
        t2 = np.argmin(t1, axis=-1)
        cat = np.mean(dataPoints[t2 == 0], axis=0)
        # ------Push the new means to a new array for comparison---------#
        CentroidMeans = []
        for x in xrange(len(StartingCentroids)):
            CentroidMeans.append(np.mean(dataPoints[t2 == [x]], axis=0))
        # --------Convert to a numpy array--------#
        NewMeans = np.asarray(CentroidMeans)
        # ------Compare the new means with the starting means------#
        if np.array_equal(NewMeans, StartingCentroids):
            print('Convergence has been reached after {} moves'.format(ConvergenceCounter))
            print('Starting Centroids:\n{}'.format(centroids))
            print('Final Means:\n{}'.format(NewMeans))
            print('Final Cluster assignments: {}'.format(t2))
            for x in xrange(len(StartingCentroids)):
                print('Cluster {}:\n'.format(x)), dataPoints[t2 == [x]]
            for x in xrange(len(StartingCentroids)):
                print('Size of Cluster {}:'.format(x)), len(dataPoints[t2 == [x]])
            keepGoing = False
        else:
            ConvergenceCounter = ConvergenceCounter + 1
            StartingCentroids = np.copy(NewMeans)

distances(centroids, dataPoints)
Using a smaller number for the qty variable seems to work without error.
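For what it's worth (my diagnosis, not an accepted answer): the warning comes from np.mean(dataPoints[t2 == [x]], axis=0) whenever one of the randomly placed centroids captures no points, so the mean is taken over an empty slice and returns NaN. With qty = 5 random centroids over [0, 100) x [0, 100) and only 19 data points, empty clusters are common; a smaller qty makes them rarer, which matches the observation above. A minimal guard, using the question's own names, keeps the old centroid whenever its cluster is empty:

CentroidMeans = []
for x in xrange(len(StartingCentroids)):
    members = dataPoints[t2 == x]
    if len(members) == 0:
        # empty cluster: keep the previous centroid instead of averaging
        # an empty slice (the source of the RuntimeWarning and NaNs)
        CentroidMeans.append(StartingCentroids[x])
    else:
        CentroidMeans.append(np.mean(members, axis=0))

An alternative is to re-draw a fresh random centroid for the empty cluster; either way, NewMeans never contains NaN and the convergence check stays meaningful.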
