In this code I want to create a time series for each cowId. The result of the function split_df_sequence is a two-dimensional array X. How can I optimize the concatenation of all these 2D arrays into one 2D array?
steps = 12

def split_df_sequence(df, steps):
    X, y = list(), list()
    if len(df) <= steps:
        return -1
    for i in range(len(df)):
        end = steps + i
        if end > len(df) - 1:
            break
        seq_X = list(df['movement'][i:end])
        seq_y = df['target'][end]
        X.append(seq_X)
        y.append(seq_y)
    return X, y
ts_movement = []
ts_target = []
for ID in DF.cowId.unique():
    df = DF[DF.cowId == ID].reset_index()
    data = split_df_sequence(df, steps)
    if data == -1:
        continue
    X, y = data
    for x in X:
        ts_movement.append(x)
    ts_target.append(y)
ts_movement = np.array(ts_movement)
ts_target = np.array(ts_target)
This is the most significant part of the code, the part that should be optimized:
for x in X:
    ts_movement.append(x)
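One way to avoid appending row by row, sketched below under the assumption that split_df_sequence keeps returning equal-length Python lists, is to collect one 2D block per cow and concatenate all the blocks in a single call at the end:

import numpy as np

ts_movement_parts = []   # one (n_windows, steps) block per cow
ts_target_parts = []     # one (n_windows,) block per cow
for ID in DF.cowId.unique():
    df = DF[DF.cowId == ID].reset_index()
    data = split_df_sequence(df, steps)
    if data == -1:
        continue
    X, y = data
    ts_movement_parts.append(np.asarray(X))   # 2D block for this cow
    ts_target_parts.append(np.asarray(y))     # 1D block for this cow

# one concatenation instead of many per-row appends
ts_movement = np.concatenate(ts_movement_parts, axis=0)
ts_target = np.concatenate(ts_target_parts, axis=0)

Note that this also gives ts_target one entry per window, matching the rows of ts_movement; the original ts_target.append(y) kept one whole list per cow instead. A lighter drop-in change that keeps everything else as it is would be ts_movement.extend(X) in place of the inner for loop.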
I am having a problem updating my x0x1 array with new values in my while loop. When I print the array to check each generation of the iteration, it prints the same values every time (despite my getting different z vector values), and I'm not sure why.
I am getting the z-values for each iteration of the for loop, but for some reason the newly generated x0x1 arrays are not being updated from one generation of the while loop to the next.
import numpy as np
import numpy.random

def calculateFunctionValueMatrix(x0x1, N):
    functionValueArray = np.zeros((N-1, 1))
    for i in range(0, N-1):
        functionValueArray[i] = calculateFunctionValue(x0x1[i])
    return functionValueArray

def calculateFunctionValue(x):
    function = (x[0]-1)**2 + 5*((x[1]-x[0]**2)**2)
    return function

def x0x1Array(N):
    x0x1 = np.random.uniform(-2, 2, size=(N-1, 2))
    generateCandidateVector(x0x1)

def generateCandidateVector(x0x1):
    print(x0x1)
    K = 10
    F = 0.8
    N = 10
    generation = 1
    while generation <= K:
        for i in range(0, N-1):
            x0_ind, x1_ind, x2_ind = np.random.choice(len(x0x1), 3)
            x0 = x0x1[x0_ind]
            x1 = x0x1[x1_ind]
            x2 = x0x1[x2_ind]
            vectorZ = x0 + F*(x1-x2)
            print("this is vector", vectorZ)
            if(calculateFunctionValue(vectorZ) < calculateFunctionValue(x0x1[i])):
                vectorZ = x0x1[i]
            elif(calculateFunctionValue(vectorZ) > calculateFunctionValue(x0x1[i])):
                x0x1[i] = x0x1[i]
        print(x0x1)
        if(np.std(calculateFunctionValueMatrix(x0x1, N)) < 0.01):
            print("Optimal Solution Found")
        generation = generation + 1

def main():
    N = 50
    x0x1Array(N)

main()
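A guess at the cause, stated as an assumption rather than a certainty: neither branch of the comparison ever writes the candidate back into the population, and the first branch assigns in the wrong direction (vectorZ = x0x1[i] overwrites the candidate with the old member), so x0x1 can never change. A minimal sketch of the acceptance step for minimisation:

# accept the candidate only when it improves on the current member
if calculateFunctionValue(vectorZ) < calculateFunctionValue(x0x1[i]):
    x0x1[i] = vectorZ   # write the better candidate back into the population
# otherwise leave x0x1[i] unchanged; no assignment is needed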
I'm trying to calculate the mean value of a quantity (in the form of a 2D array) as a function of its distance from the center of a 2D grid. I understand that the idea is to identify all the array elements that are at a distance R from the center, add them up, and divide by the number of elements. However, I'm having trouble identifying an algorithm to actually do this.
I have attached a working example of the code that generates the 2D array below. The code calculates some quantities resulting from gravitational lensing, so the way the array is made is irrelevant to this problem, but I have attached the entire code so that you can create the output array for testing.
import numpy as np
import multiprocessing
import matplotlib.pyplot as plt

n = 100  # grid size
c = 3e8
G = 6.67e-11
M_sun = 1.989e30
pc = 3.086e16  # parsec
Dds = 625e6*pc
Ds = 1726e6*pc  # z=2
Dd = 1651e6*pc  # z=1
FOV_arcsec = 0.0001
FOV_arcmin = FOV_arcsec/60.
pix2rad = ((FOV_arcmin/60.)/float(n))*np.pi/180.
rad2pix = 1./pix2rad
Renorm = (4*G*M_sun/c**2)*(Dds/(Dd*Ds))
#stretch = [10, 2]

# To create a random distribution of points
def randdist(PDF, x, n):
    # Create a distribution following PDF(x). PDF and x
    # must be of the same length. n is the number of samples
    fp = np.random.rand(n,)
    CDF = np.cumsum(PDF)
    return np.interp(fp, CDF, x)

def get_alpha(args):
    zeta_list_part, M_list_part, X, Y = args
    alpha_x = 0
    alpha_y = 0
    for key in range(len(M_list_part)):
        z_m_z_x = (X - zeta_list_part[key][0])*pix2rad
        z_m_z_y = (Y - zeta_list_part[key][1])*pix2rad
        alpha_x += M_list_part[key] * z_m_z_x / (z_m_z_x**2 + z_m_z_y**2)
        alpha_y += M_list_part[key] * z_m_z_y / (z_m_z_x**2 + z_m_z_y**2)
    return (alpha_x, alpha_y)

if __name__ == '__main__':
    # number of processes, scale accordingly
    num_processes = 1  # Number of CPUs to be used
    pool = multiprocessing.Pool(processes=num_processes)
    num = 100  # The number of points/microlenses
    r = np.linspace(-n, n, n)
    PDF = np.abs(1/r)
    PDF = PDF/np.sum(PDF)  # PDF should be normalized
    R = randdist(PDF, r, num)
    Theta = 2*np.pi*np.random.rand(num,)
    x1 = [R[k]*np.cos(Theta[k])*1 for k in range(num)]
    y1 = [R[k]*np.sin(Theta[k])*1 for k in range(num)]
    # Uniform distribution
    #R = np.random.uniform(-n,n,num)
    #x1= np.random.uniform(-n,n,num)
    #y1 = np.random.uniform(-n,n,num)
    zeta_list = np.column_stack((np.array(x1), np.array(y1)))  # List of coordinates for the microlenses
    x = np.linspace(-n, n, n)
    y = np.linspace(-n, n, n)
    X, Y = np.meshgrid(x, y)
    M_list = np.array([0.1 for i in range(num)])
    # split zeta_list, M_list, X, and Y
    zeta_list_split = np.array_split(zeta_list, num_processes, axis=0)
    M_list_split = np.array_split(M_list, num_processes)
    X_list = [X for e in range(num_processes)]
    Y_list = [Y for e in range(num_processes)]
    alpha_list = pool.map(
        get_alpha, zip(zeta_list_split, M_list_split, X_list, Y_list))
    alpha_x = 0
    alpha_y = 0
    for e in alpha_list:
        alpha_x += e[0]
        alpha_y += e[1]
    alpha_x_y = 0
    alpha_x_x = 0
    alpha_y_y = 0
    alpha_y_x = 0
    alpha_x_y, alpha_x_x = np.gradient(alpha_x*rad2pix*Renorm, edge_order=2)
    alpha_y_y, alpha_y_x = np.gradient(alpha_y*rad2pix*Renorm, edge_order=2)
    det_A = 1 - alpha_y_y - alpha_x_x + (alpha_x_x)*(alpha_y_y) - (alpha_x_y)*(alpha_y_x)
    abs = np.absolute(det_A)
    I = abs**(-1.)
    O = np.log10(I+1)
    plt.contourf(X, Y, O, 100)
The array of interest is O, and I have attached a plot of what it should look like; it can differ based on the random distribution of points.
What I'm trying to do is plot the mean value of O as a function of radius from the center of the grid. In the end, I want a 2D line graph of the average O versus distance from the center. So I suppose the first step is to define circles of radius R, based on X and Y.
def circle(x, y):
    r = np.sqrt(x**2 + y**2)
    return r
Now I just have to figure out a way to find all the values of O that have the same indices as the equivalent values of R. I'm a bit confused about this part and would appreciate any help.
You can find the geometric coordinates of a circle with center (0,0) and radius R as such:
phi = np.linspace(0, 1, 50)
x = R*np.cos(2*np.pi*phi)
y = R*np.sin(2*np.pi*phi)
these values, however, will not fall on the regular pixel grid but in between.
In order to use them as sampling points, you can either round the values and use them as indices or interpolate the values from the nearby pixels.
Attention: the pixel indices and the x, y coordinates are not the same. In your example, (0,0) is at the picture location (50,50).
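If the goal is the mean of O in radial bins rather than along sampled circles, an alternative sketch (assuming O is the n x n array produced by the script above and that the center of the grid is the pixel (n//2, n//2)) is to bin every pixel by its integer distance from the center and average with np.bincount:

import numpy as np

def radial_mean(O):
    ny, nx = O.shape
    cy, cx = ny // 2, nx // 2                    # assumed center pixel
    yy, xx = np.indices(O.shape)
    r = np.sqrt((xx - cx)**2 + (yy - cy)**2)     # distance of every pixel from the center
    r_bin = r.astype(int)                        # group pixels into 1-pixel-wide annuli
    sums = np.bincount(r_bin.ravel(), weights=O.ravel())
    counts = np.bincount(r_bin.ravel())
    return sums / counts                         # mean of O in each annulus

# profile = radial_mean(O)
# plt.plot(profile)   # average O versus radius in pixels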
I am wondering why y is being returned as an array. If it helps, x is a NumPy array.
I have not gotten to the part where I store the y values in an array yet, but I am trying to find the average of N_obs_photon(t) over a finely scaled grid, to give a smoothed graph of N_obs_photon(t) over a coarsely scaled grid. I think the sum(array)/len(array) part should give me a float, but instead it returns an array (more precisely, 1001 arrays). Even if I set x to an array with only one value, y is still returned as an array. How should I write this so that y is the average as a single number?
x = np.linspace(0, 1, 1001)
ftc = 11
L = 10**(-3)
i = x/L
j = np.arange(0, 11001, 1)  #end number is max(i)*ftc + 1

'''reference:
fine_grid_number = j
coarse_grid_number = i
coase_grid_width = L
fine_grid_width = L/ftc  #denominator is ftc
coarse_grid_position = i*L
fine_grid_position = j*L/ftc  #denominator is ftc'''

#Plan is to create an array of y values, then plot x vs y
for n in i:
    q = np.arange(-0.5*(ftc-1), 0.5*ftc, 1)
    #temp_array = np.empty([])
    temp_array = []
    for w in q:
        t = ftc*n + w  #t is fine grid number
        t = t*L/ftc  #convert to fine grid position
        if t < 0:  #drop t when it would be less than zero
            t = 0
            temp_array.append(t)  #add to array
        else:
            t = N_obs_photon(t)  #take through function
            temp_array.append(t)  #add to array
    y = sum(temp_array)/len(temp_array)  #average the array
    print(y)  #test if y is a number
    #store result in y array
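Assuming N_obs_photon returns a scalar for a scalar t (if it returns an array, y will be an array too, which is the most likely cause of what you are seeing), collecting one average per coarse grid point could look like this sketch:

y_values = []                        # one average per coarse grid point n
for n in i:
    q = np.arange(-0.5*(ftc-1), 0.5*ftc, 1)
    temp_array = []
    for w in q:
        t = (ftc*n + w)*L/ftc        # fine grid position
        if t < 0:                    # drop negative positions, as in the original loop
            temp_array.append(0.0)
        else:
            temp_array.append(N_obs_photon(t))
    y_values.append(sum(temp_array)/len(temp_array))

y_values = np.array(y_values)        # same length as x, ready for plt.plot(x, y_values)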
I am trying to compare every element in just_test_data against all the elements in just_train_data and return the lowest number, then run the next just_test_data element through all of just_train_data, and so on until all of just_test_data has been processed.
The error I keep getting, as soon as the loop starts, is on the line
step_1 = (abs(just_test_data[i] - just_train_data[n]) ** 2)
IndexError: arrays used as indices must be of integer (or boolean) type
import numpy as np

testing_data = np.genfromtxt("C:\Users\zkrumlinde\Desktop\Statistical Programming\Week 3\iris-testing-data.csv", delimiter=',')
training_data = np.genfromtxt("C:\Users\zkrumlinde\Desktop\Statistical Programming\Week 3\iris-training-data.csv", delimiter=',')

#create 4 arrays, the first two with the measurements of training and testing data
#the last two have the labels of each line
just_test_data = np.array(testing_data[:, 0:4])
just_train_data = np.array(training_data[:, 0:4])
testing_labels = np.array(testing_data[:, 4])
training_labels = np.array(training_data[:, 4])

n = 0
while n < len(just_train_data):
    for i in just_test_data:
        old_distance = 'inf'
        step_1 = (abs(just_test_data[i] - just_train_data[n]) ** 2)
        step_2 = sum(step_1)
        new_distance = np.sqrt(step_2)
        if new_distance < old_distance:
            old_distance = new_distance
            index = n
        n = n + 1
print(training_labels[index])
By using for i in just_test_data you're iterating over the elements of the just_test_data array themselves, not over indices between 0 and the array length.
Also, it seems that your n = n + 1 line is not indented correctly.
Here's my guess at an updated version of your code:
import numpy as np

testing_data = np.genfromtxt("C:\Users\zkrumlinde\Desktop\Statistical Programming\Week 3\iris-testing-data.csv", delimiter=',')
training_data = np.genfromtxt("C:\Users\zkrumlinde\Desktop\Statistical Programming\Week 3\iris-training-data.csv", delimiter=',')

#create 4 arrays, the first two with the measurements of training and testing data
#the last two have the labels of each line
just_test_data = np.array(testing_data[:, 0:4])
just_train_data = np.array(training_data[:, 0:4])
testing_labels = np.array(testing_data[:, 4])
training_labels = np.array(training_data[:, 4])

n = 0
while n < len(just_train_data):
    for i in range(len(just_test_data)):
        old_distance = 'inf'
        step_1 = (abs(just_test_data[i] - just_train_data[n]) ** 2)
        step_2 = sum(step_1)
        new_distance = np.sqrt(step_2)
        if new_distance < old_distance:
            old_distance = new_distance
            index = n
    n = n + 1
print(training_labels[index])
When you say for i in just_test_data:, i will be the element itself, not the index.
You probably want something like for i in range(len(just_test_data)); this makes i a number from 0 to the length of just_test_data - 1.
Edit: a few weird things in your code:
step_1 = (abs(just_test_data[i] - just_train_data[n]) ** 2)
step_2 = sum(step_1)
new_distance = np.sqrt(step_2)
this just computes a single distance between just_test_data[i] and just_train_data[n]. Are you meaning to add up many step_1 values and then eventually take the sqrt? You need to check your indentation.
old_distance = 'inf' is a string (pretty sure); you are probably looking for either np.inf or float('inf'). Also, because you set it inside the for loop, it gets reset for every i; you probably want it above for i in just_test_data:.
A quick pass at your code:
min_distance = np.inf
for n in range(len(just_train_data)):
    step_2 = 0
    for i in range(len(just_test_data)):
        step_1 = (just_test_data[i] - just_train_data[n]) ** 2
        step_2 += np.sum(step_1)   # sum over the four features as well, so the distance is a scalar
    distance = np.sqrt(step_2)
    if distance < min_distance:
        min_distance = distance
        index = n
print(training_labels[index])
This compares each point in just_train_data to all the points in just_test_data to compute one distance per training point, and prints the training label at the index with the smallest of these distances.
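If the underlying goal is ordinary nearest-neighbour labelling (for each test point, take the label of the closest training point), a fully vectorised sketch using NumPy broadcasting, assuming the same just_test_data, just_train_data and training_labels arrays as above, would be:

import numpy as np

# pairwise squared distances, shape (n_test, n_train)
diff = just_test_data[:, None, :] - just_train_data[None, :, :]
dist_sq = np.sum(diff ** 2, axis=2)

# index of the nearest training point for every test point
nearest = np.argmin(dist_sq, axis=1)

# predicted label for every test point
predicted_labels = training_labels[nearest]
print(predicted_labels)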
The code below is supposed to run a Bayes classifier for a full-covariance Gaussian (http://courses.ee.sun.ac.za/Pattern_Recognition_813/lectures/lecture03/node2.html), but I get two warnings when I run the code. They are:
RuntimeWarning: Mean of empty slice.
warnings.warn("Mean of empty slice.", RuntimeWarning)
and
RuntimeWarning: Degrees of freedom <= 0 for slice
warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
This is my code:
def modelFull(train, test):
    err_train = 0
    err_test = 0
    x_train = []
    x_test = []
    labels = []
    train_labels = []
    test_labels = []
    for i in train:
        x_train.append(i[:-1]/255)
        labels.append(i[-1])
        train_labels.append(i[-1])
    for i in test:
        x_test.append(i[:-1]/255)
        labels.append(i[-1])
        test_labels.append(i[-1])
    x_train = np.array(x_train)
    x_0 = []
    x_1 = []
    for i in train:
        if i[-1] == 0:
            x_0.append(i[:-1]/255)
        if i[-1] == 1:
            x_1.append(i[:-1]/255)
    x_0 = np.array(x_0)
    x_1 = np.array(x_1)
    p_0 = float(x_0.shape[0])/float((x_0.shape[0]+x_1.shape[0]))
    p_1 = float(x_1.shape[0])/float((x_0.shape[0]+x_1.shape[0]))
    train_x0_mean = x_0.mean(axis=0)
    train_x1_mean = x_1.mean(axis=0)
    cov_x0 = np.cov(np.transpose(x_0))
    cov_x1 = np.cov(np.transpose(x_1))
    cov_x0 = cov_x0 + np.eye(256) * .01
    cov_x1 = cov_x1 + np.eye(256) * .01
    det_x1_cov = -float(np.linalg.slogdet(cov_x1)[1])
    det_x0_cov = -float(np.linalg.slogdet(cov_x0)[1])
    train_results = []
    test_results = []
    for x in x_train:
        x0_minus_mu_T = np.transpose((x-train_x0_mean))
        x0_inverse = np.linalg.inv(cov_x0)
        x0_minus_mu = x-train_x0_mean
        x1_minus_mu_T = np.transpose((x-train_x1_mean))
        x1_inverse = np.linalg.inv(cov_x1)
        x1_minus_mu = x-train_x1_mean
        x_0_probability = det_x0_cov - (x0_minus_mu_T.dot(x0_inverse)).dot(x0_minus_mu)
        x_1_probability = det_x1_cov - (x1_minus_mu_T.dot(x1_inverse)).dot(x1_minus_mu)
        if (x_0_probability+np.log(p_0))/(x_1_probability+np.log(p_1)) < 1:
            train_results.append(1)
        else:
            train_results.append(0)
    for x in x_test:
        x0_minus_mu_T = np.transpose((x-train_x0_mean))
        x0_inverse = np.linalg.inv(cov_x0)
        x0_minus_mu = x-train_x0_mean
        x1_minus_mu_T = np.transpose((x-train_x1_mean))
        x1_inverse = np.linalg.inv(cov_x1)
        x1_minus_mu = x-train_x1_mean
        x_0_probability = det_x0_cov - (x0_minus_mu_T.dot(x0_inverse)).dot(x0_minus_mu)
        x_1_probability = det_x1_cov - (x1_minus_mu_T.dot(x1_inverse)).dot(x1_minus_mu)
        if (x_0_probability+np.log(p_0))/(x_1_probability+np.log(p_1)) < 1:
            test_results.append(1)
        else:
            test_results.append(0)
    train_correct = 0
    test_correct = 0
    for i in range(len(train_results)):
        if int(train_results[i]) == int(train_labels[i]):
            train_correct += 1
    for i in range(len(test_results)):
        if int(test_results[i]) == int(test_labels[i]):
            test_correct += 1
    err_test = 1 - (float(test_correct)/len(test_results))
    err_train = 1 - (float(train_correct)/len(train_results))
    return err_train, err_test
RuntimeWarning: Degrees of freedom <= 0 for slice
occurs when you use the wrong shape, e.g.:
import numpy as np
x = np.random.random([1000,1])
y = np.random.random([1000,1])
print(x.shape, y.shape)
# (1000, 1) (1000, 1)
t = np.cov(x, y) #RuntimeWarning
t = np.cov(x.T, y.T) #This works
An edge case is when the array you calculate the covariance of contains only one element:
np.cov([0.5])
In addition to the wrong shape mentioned above, calling np.nanstd on an all-NaN array will also produce "RuntimeWarning: Degrees of freedom <= 0 for slice", and calling np.nanmean on an all-NaN array will produce "RuntimeWarning: Mean of empty slice.".
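A minimal sketch reproducing both warnings with plain NumPy; the empty-array case is the likely situation in modelFull above if one of the two classes never appears in train, leaving x_0 or x_1 empty:

import numpy as np

all_nan = np.array([np.nan, np.nan, np.nan])
np.nanmean(all_nan)    # RuntimeWarning: Mean of empty slice.
np.nanstd(all_nan)     # RuntimeWarning: Degrees of freedom <= 0 for slice

empty = np.empty((0, 4))       # zero rows, as when a class has no samples
empty.mean(axis=0)             # RuntimeWarning: Mean of empty slice.
np.cov(np.transpose(empty))    # RuntimeWarning: Degrees of freedom <= 0 for slice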