See the code below. I am trying to simulate an array called 'xx'. In the code, I simulate each component of the matrix one by one, which is extremely time-consuming when M and N are very large. The reason I do this is that in the function 'func', the 'samplesize' is different for each column of data. Is there a faster way to generate this NumPy array? Thank you.
import numpy as np

def func(x, n):
    samplesize = np.random.poisson(x)
    return np.random.uniform(0, 2, samplesize)

def singlerow(x, n, N):
    rowdata = np.zeros(N)
    rowdata[0] = 0
    for i in range(1, N):
        rowdata[i] = rowdata[i-1] + np.sum(func(x, n))
    return rowdata

def simulatematrix(x, n, M, N):
    result = np.zeros((M, N))
    for m in range(M):
        result[m] = singlerow(x, n, N)
    return result

M = 100
N = 10
xx = simulatematrix(40, 10, M, N)
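One way to avoid the per-element loop (a sketch, not from the original post; simulatematrix_fast is a hypothetical name, and the unused argument n is dropped): draw all the Poisson sample sizes up front, pool the uniforms into one array, and recover each per-increment sum as a difference of cumulative sums.

import numpy as np

def simulatematrix_fast(x, M, N):
    counts = np.random.poisson(x, size=(M, N - 1))  # one sample size per increment
    draws = np.random.uniform(0, 2, counts.sum())   # a single pool of uniforms
    flat = counts.ravel()
    ends = np.cumsum(flat)                          # increment i sums draws[starts[i]:ends[i]]
    starts = ends - flat
    csum = np.concatenate(([0.0], np.cumsum(draws)))
    increments = (csum[ends] - csum[starts]).reshape(M, N - 1)  # empty segments give 0
    result = np.zeros((M, N))                       # first column stays 0, as in singlerow
    result[:, 1:] = np.cumsum(increments, axis=1)   # running totals along each row
    return result

xx = simulatematrix_fast(40, 100, 10)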
I have Python code that tries to calculate the sample variance as I accumulate entries in each loop iteration.
y_hat = y_df.loc[n-1]
var = []
var_sum = 0
for i in range(n):
    var_i = (g_i[i] - y_hat)**2
    var_sum += var_i
    if i == 0:
        var_avg = var_sum
        var.append(var_avg)
    else:
        var_avg = var_sum/i
        var.append(var_avg)
The output has a very strange first row (when i is 1), while the rest of the rows are fine. Can someone help, please?
This is the current output: [output table not reproduced here]
Below is my entire script; essentially, I am testing a Monte Carlo simulation to evaluate pi.
import numpy as np
import math
import matplotlib.pyplot as plt
import random
import pandas as pd
import statistics as stats

n = 1000
k = 100

# generate u r.v. with size k*n -> (100,1000)
u = []
for i in range(k):
    u_i = np.random.uniform(size=n)
    u.append(u_i)

# put into dataframe (k*n)
u_df = pd.DataFrame(u)

# calculate g_i, g_i is a df with k*n
g_i = 4*np.sqrt(1-u_df**2)

g_sum = 0
y = []
for i in range(n):
    g_sum += g_i[i]
    y_i = g_sum/(i+1)
    y.append(y_i)

# put y into df -> n*k
y_df = pd.DataFrame(y)
y_df = y_df.reset_index(drop=True)

y_hat = y_df.loc[n-1]
var = []
var_sum = 0
for i in range(n):
    var_i = (g_i[i] - y_hat)**2
    var_sum += var_i
    if i == 0:
        var_avg = var_sum
        var.append(var_avg)
    else:
        var_avg = var_sum/i
        var.append(var_avg)

var_df = pd.DataFrame(var)
var_df = var_df.reset_index(drop=True)
var_df.head()
When var.append(var_avg) is run inside the i == 0 branch, you are appending var_sum to var by reference. Thus every time you change var_sum, you also change var[0]. You can fix the problem by explicitly copying var_sum when i == 0. The corrected if-else statement would be:
if i == 0:
    var_avg = var_sum
    var.append(var_avg.copy())
else:
    var_avg = var_sum/(i+1)
    var.append(var_avg)
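To see the aliasing in isolation, here is a minimal demonstration (using a NumPy array for brevity; the same reference semantics apply to the pandas Series held in var_sum):

import numpy as np

a = np.zeros(3)
snapshots = [a]             # stores a reference to `a`, not its current values
a += 1.0                    # in-place update mutates that same object...
print(snapshots[0])         # ...so the stored "snapshot" now reads [1. 1. 1.]
snapshots.append(a.copy())  # an explicit copy freezes the values
a += 1.0
print(snapshots[0])         # [2. 2. 2.] -- still aliased to `a`
print(snapshots[1])         # [1. 1. 1.] -- the copy is unaffected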
This has been surprisingly difficult to find information on. I have two functions that I want to chart together, enumeration() and betterEnumeration():
import matplotlib.pyplot as plt
import time
import numpy as np
import sympy
from sympy import S, symbols
import random
from math import floor

def enumeration(array):
    max = None
    to_return = (max, 0, 0)
    for i in range(0, len(array) + 1):
        for j in range(0, i):
            currentSum = 0
            for k in range(j, i):
                currentSum += array[k]
            if (max is None) or (currentSum > max):
                max = currentSum
                to_return = (max, j, k)
    return to_return

def betterEnumeration(array):
    max = None
    to_return = (max, 0, 0)
    for i in range(1, len(array) + 1):
        currentSum = 0
        for j in range(i, len(array) + 1):
            currentSum += array[j - 1]
            if (max is None) or (currentSum > max):
                max = currentSum
                to_return = (max, i-1, j-1)
    return to_return
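(For reference, not part of the original post: both functions return a (maxSum, startIndex, endIndex) tuple for the maximum-sum subarray, so they can be sanity-checked like this.)

print(enumeration([2, -1, 3]))        # (4, 0, 2): the whole array sums to 4
print(betterEnumeration([2, -1, 3]))  # (4, 0, 2): same answer, one less loop level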
I also have two helper functions randomArray() and regressionCurve().
def randomArray(totalNumbers, min, max):
    array = []
    while totalNumbers > 0:
        array.append(random.randrange(min, max))
        totalNumbers -= 1
    return array

def regressionCurve(x, y):
    # calculate polynomial
    p = np.polyfit(x, y, 3)
    f = np.poly1d(p)
    # calculate new x's and y's
    x_new = np.linspace(x[0], x[-1], 50)
    y_new = f(x_new)
    x = symbols("x")
    poly = sum(S("{:6.5f}".format(v))*x**i for i, v in enumerate(p[::-1]))
    eq_latex = sympy.printing.latex(poly)
    plt.plot(x_new, y_new, label="${}$".format(eq_latex))
    plt.legend(fontsize="small")
    plt.show()
I want to plot both of these functions on the same chart, both the raw data points as well as the regression curves. The following code will chart the data points for enumeration() and then make a regression curve for them, but I'm not sure how to plot both enumeration() and betterEnumeration() on the same chart.
def chart():
    nValues = [10, 25, 50, 100, 250, 500, 1000]
    avgExecTimes = []
    for n in nValues:  # For each n value
        totals = []
        sum = 0
        avgExecTime = 0
        for i in range(0, 10):  # Create and test 10 random arrays
            executionTimes = []
            array = randomArray(n, 0, 10)
            t1 = time.clock()
            enumeration(array)
            t2 = time.clock()
            total = t2 - t1
            totals.append(total)
            executionTimes.append(total)
            print("Time elapsed(n=" + str(n) + "): " + str(total))
        for t in totals:  # Find avg running time for each n's 10 executions
            sum += t
        avgExecTime = sum/10
        avgExecTimes.append(avgExecTime)
        print("Avg execution time: " + str(avgExecTime))
    # Chart execution times
    plt.plot(nValues, avgExecTimes)
    plt.ylabel('Seconds')
    plt.xlabel('n')
    plt.show()
    # Chart curve that fits
    x = np.array(nValues)
    y = np.array(avgExecTimes)
    regressionCurve(x, y)
To add a line to a plot:
plt.plot(x, y)
So, if you want to plot x1, y1 and then add x2, y2:
plt.plot(x1, y1)
plt.plot(x2, y2)
However, that is going to plot the second line in the default color. You will want to add a color argument:
plt.plot(x1, y1, c='b')
plt.plot(x2, y2, c='g')
and if the units are different, you'll want to look into twinx, which will allow you to plot with 2 different y axes but the same x axis.
You're going to want to plot both sets of data from within the same function or both outside of the function. Otherwise, you're running into a local vs. global issue as well.
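Applied to this question, a sketch (not from the original answer) of a combined chart; chartBoth is a hypothetical name, and time.perf_counter() stands in for the now-deprecated time.clock():

def chartBoth():
    nValues = [10, 25, 50, 100, 250, 500, 1000]
    times1, times2 = [], []
    for n in nValues:
        array = randomArray(n, 0, 10)
        t1 = time.perf_counter()
        enumeration(array)
        times1.append(time.perf_counter() - t1)
        t1 = time.perf_counter()
        betterEnumeration(array)
        times2.append(time.perf_counter() - t1)
    # two plot calls before a single show() put both lines on one chart
    plt.plot(nValues, times1, c='b', label='enumeration')
    plt.plot(nValues, times2, c='g', label='betterEnumeration')
    plt.xlabel('n')
    plt.ylabel('Seconds')
    plt.legend(fontsize='small')
    plt.show()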
I am trying to optimize a snippet that gets called a lot (millions of times) so any type of speed improvement (hopefully removing the for-loop) would be great.
I am computing a correlation function of some j'th particle with all the others:
C_j(|r - r'|) = sqrt(E[(s_j(r') - s_k(r))^2]), averaged over k.
My idea is to have a variable corrfun which bins the data into bins over r (defined elsewhere). I find which bin of r each s_k belongs to, and this is stored in ind. So ind[0] is the index of r (and thus of corrfun) to which the j = 0 point corresponds. Multiple points can fall into the same bin (in fact I want the bins to be big enough to contain multiple points), so I sum together all of the (s_j(r') - s_k(r))^2 terms and then divide by the number of points in that bin (stored in the variable rw). The code I ended up with is the following (np is numpy):
for k, v in enumerate(ind):
    if j == k:
        continue
    corrfun[v] += (s[k]-s[j])**2
    rw[v] += 1
rw2 = rw.copy()  # copy, so the clamp below does not also overwrite rw
rw2[rw < 1] = 1
corrfun = np.sqrt(np.divide(corrfun, rw2))
Note, the rw2 business is because I want to avoid divide-by-zero problems, but I do return the rw array and I want to be able to differentiate between the rw = 0 and rw = 1 elements (hence the copy, so the clamping does not also modify rw). Perhaps there is a more elegant solution for this as well.
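As a more compact guard (a suggestion, not from the thread), you can clamp only the divisor and leave rw itself untouched:

corrfun = np.sqrt(corrfun / np.maximum(rw, 1))  # bins with rw == 0 also have corrfun == 0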
Is there a way to make the for-loop faster? While I would prefer not to add the self-interaction (j == k), I am even OK with having it if it means a significantly faster calculation (the length of ind is ~1E6, so the self-interaction is probably insignificant anyway).
Thank you!
Ilya
Edit:
Here is the full code. Note, in the full code I am averaging over j as well.
import numpy as np

def twopointcorr(x, y, s, dr):
    width = np.max(x)-np.min(x)
    height = np.max(y)-np.min(y)
    n = len(x)
    maxR = np.sqrt((width/2)**2 + (height/2)**2)
    r = np.arange(0, maxR, dr)
    print(r)
    corrfun = r*0
    rw = r*0
    print(maxR)
    ''' go through all points'''
    for j in range(0, n-1):
        hypot = np.sqrt((x[j]-x)**2+(y[j]-y)**2)
        ind = [np.abs(r-h).argmin() for h in hypot]
        for k, v in enumerate(ind):
            if j == k:
                continue
            corrfun[v] += (s[k]-s[j])**2
            rw[v] += 1
    rw2 = rw.copy()  # copy, so the clamp below does not also overwrite rw
    rw2[rw < 1] = 1
    corrfun = np.sqrt(np.divide(corrfun, rw2))
    return r, corrfun, rw
I test it the following way:
from twopointcorr import twopointcorr
import numpy as np
import matplotlib.pyplot as plt
import time

n = 1000
x = np.random.rand(n)
y = np.random.rand(n)
s = np.random.rand(n)

print('running two point corr function')
start_time = time.time()
r, corrfun, rw = twopointcorr(x, y, s, 0.1)
print("--- Execution time is %s seconds ---" % (time.time() - start_time))

fig1 = plt.figure()
plt.plot(r, corrfun, '-x')
fig2 = plt.figure()
plt.plot(r, rw, '-x')
plt.show()
Again, the main issue is that in the real dataset n~1E6. I can resample to make it smaller, of course, but I would love to actually crank through the dataset.
Here is code that uses broadcasting, hypot, round, and bincount to remove all the loops:
def twopointcorr2(x, y, s, dr):
    width = np.max(x)-np.min(x)
    height = np.max(y)-np.min(y)
    n = len(x)
    maxR = np.sqrt((width/2)**2 + (height/2)**2)
    r = np.arange(0, maxR, dr)
    osub = lambda x: np.subtract.outer(x, x)
    ind = np.clip(np.round(np.hypot(osub(x), osub(y)) / dr), 0, len(r)-1).astype(int)
    rw = np.bincount(ind.ravel())
    rw[0] -= len(x)
    corrfun = np.bincount(ind.ravel(), (osub(s)**2).ravel())
    return r, corrfun, rw
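In case the bincount trick is unfamiliar: passing a weights argument makes np.bincount accumulate per-bin sums instead of plain counts (illustrative values below):

ind = np.array([0, 1, 1, 2])
vals = np.array([10.0, 1.0, 2.0, 5.0])
print(np.bincount(ind))        # [1 2 1]       -- how many values landed in each bin
print(np.bincount(ind, vals))  # [10.  3.  5.] -- the sum of vals in each bin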
To compare, I modified your code as follows:
def twopointcorr(x, y, s, dr):
    width = np.max(x)-np.min(x)
    height = np.max(y)-np.min(y)
    n = len(x)
    maxR = np.sqrt((width/2)**2 + (height/2)**2)
    r = np.arange(0, maxR, dr)
    corrfun = r*0
    rw = r*0
    for j in range(0, n):
        hypot = np.sqrt((x[j]-x)**2+(y[j]-y)**2)
        ind = [np.abs(r-h).argmin() for h in hypot]
        for k, v in enumerate(ind):
            if j == k:
                continue
            corrfun[v] += (s[k]-s[j])**2
            rw[v] += 1
    return r, corrfun, rw
and here is the code to check the results:
import numpy as np
n=1000
x = np.random.rand(n)
y = np.random.rand(n)
s = np.random.rand(n)
r1, corrfun1, rw1 = twopointcorr(x,y,s,0.1)
r2, corrfun2, rw2 = twopointcorr2(x,y,s,0.1)
assert np.allclose(r1, r2)
assert np.allclose(corrfun1, corrfun2)
assert np.allclose(rw1, rw2)
and the %timeit results:
%timeit twopointcorr(x,y,s,0.1)
%timeit twopointcorr2(x,y,s,0.1)
outputs:
1 loop, best of 3: 5.16 s per loop
10 loops, best of 3: 134 ms per loop
Your original code on my system runs in about 5.7 seconds. I fully vectorized the inner loop and got it to run in 0.39 seconds. Simply replace your "go through all points" loop with this:
import scipy.spatial  # needed for cdist

points = np.column_stack((x, y))
hypots = scipy.spatial.distance.cdist(points, points)
inds = np.rint(hypots.clip(max=maxR) / dr).astype(int)  # np.int is deprecated; plain int works
# go through all points
for j in range(n):  # n.b. previously n-1, not sure why
    ind = inds[j]
    np.add.at(corrfun, ind, (s - s[j])**2)
    np.add.at(rw, ind, 1)
    rw[ind[j]] -= 1  # subtract self
The first observation was that your hypot code was computing 2D distances, so I replaced that with cdist from SciPy to do it all in a single call. The second was that the inner for loop was slow, and thanks to an insightful comment from @hpaulj I vectorized that as well using np.add.at().
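As a side note (illustrative, not from the thread), np.add.at() is needed because fancy-indexed += is buffered, so repeated indices only count once:

a = np.zeros(3)
a[np.array([0, 0, 1])] += 1           # buffered: the repeated 0 counts once -> [1. 1. 0.]
b = np.zeros(3)
np.add.at(b, np.array([0, 0, 1]), 1)  # unbuffered: repeats accumulate -> [2. 1. 0.]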
Since you asked how to vectorize the inner loop as well, I did that later. It now takes 0.25 seconds to run, for a total speedup of over 20x. Here's the final code:
points = np.column_stack((x, y))
hypots = scipy.spatial.distance.cdist(points, points)
inds = np.rint(hypots.clip(max=maxR) / dr).astype(int)
sn = np.tile(s, (n, 1))   # n copies of s
diffs = (sn - sn.T)**2    # squares of pairwise differences
np.add.at(corrfun, inds, diffs)
rw = np.bincount(inds.flatten(), minlength=len(r))
np.subtract.at(rw, inds.diagonal(), 1)  # subtract self
This uses more memory but does produce a substantial speedup vs. the single-loop version above.
OK, so as it turns out, outer products are incredibly memory-expensive. However, using the answers from @HYRY and @JohnZwinck, I was able to write code that is still roughly linear in n in memory and computes fast (0.5 seconds for the test case):
import numpy as np

def twopointcorr(x, y, s, dr, maxR=-1):
    width = np.max(x)-np.min(x)
    height = np.max(y)-np.min(y)
    n = len(x)
    if maxR < dr:
        maxR = np.sqrt((width/2)**2 + (height/2)**2)
    r = np.arange(0, maxR+dr, dr)
    corrfun = r*0
    rw = r*0
    for j in range(0, n):
        ind = np.clip(np.round(np.hypot(x[j]-x, y[j]-y) / dr), 0, len(r)-1).astype(int)
        np.add.at(corrfun, ind, (s - s[j])**2)
        np.add.at(rw, ind, 1)
    rw[0] -= n
    corrfun = np.sqrt(np.divide(corrfun, np.maximum(rw, 1)))
    r = np.delete(r, -1)
    rw = np.delete(rw, -1)
    corrfun = np.delete(corrfun, -1)
    return r, corrfun, rw
I am trying to implement a particle filter algorithm in Python. I am getting this error:
x_P_update[i] = 0.5*x_P[i] + 25*x_P[i]/(1 + x_P[i]**2) + 8*math.cos(1.2*(t-1)) + math.sqrt(x_N)*np.random.randn()
TypeError: 'float' object has no attribute '__getitem__'
My code:
import math
import numpy as np
import matplotlib.pyplot as plt

x = 0.1   # initial value
x_N = 1   # process noise covariance in state update
x_R = 1   # noise covariance in measurement
T = 75    # number of iterations
N = 10    # number of particles
V = 2

x_P = [None]*N
for i in xrange(0, N):
    x_P[i] = x + math.sqrt(V)*np.random.randn()

z_out = np.array([x**2 / 20 + math.sqrt(x_R) * np.random.randn()])  # the actual output vector for measurement values.
x_out = np.array([x])          # the actual output vector for measurement values.
x_est = np.array([x])          # time by time output of the particle filter's estimate
x_est_out = np.array([x_est])  # the vector of particle filter estimates.

x_P_update = [None]*N
z_update = [None]*N
P_w = [None]*N

for t in xrange(1, T+1):
    x = 0.5*x + 25*x/(1 + x**2) + 8*math.cos(1.2*(t-1)) + math.sqrt(x_N)*np.random.randn()
    z = x**2/20 + math.sqrt(x_R)*np.random.randn()
    for i in xrange(0, N):
        # each particle is updated with the process equation
        x_P_update[i] = 0.5*x_P[i] + 25*x_P[i]/(1 + x_P[i]**2) + 8*math.cos(1.2*(t-1)) + math.sqrt(x_N)*np.random.randn()
        # observations are updated for each particle
        z_update[i] = x_P_update[i]**2/20
        # generate weights
        P_w[i] = (1/math.sqrt(2*math.pi*x_R)) * math.exp(-(z - z_update[i])**2/(2*x_R))
    P_w[:] = [k / sum(P_w) for k in P_w]
    # print(np.where(np.cumsum(P_w, axis=0) >= np.random.rand()))
    # print(index_tuple[0][1])
    # P_w_array = np.array(list(P_w))
    # indices = [i for i in range(len(P_w)) if np.cumsum(P_w_array) >= np.random.rand()]
    for i in xrange(0, N):
        index_tuple = np.where(np.random.rand() <= np.cumsum(P_w, axis=0))
        m = index_tuple[0][1]
        x_P = x_P_update[m]
    x_est = np.array([np.mean(x_P)])
    x_out = np.array([x_out, x])
    z_out = np.array([z_out, z])
    x_est_out = np.array([x_est_out, x_est])
I am using the MATLAB code from here to learn how to implement this algorithm in Python using SciPy: http://studentdavestutorials.weebly.com/particle-filter-with-matlab-code.html
I just started learning Python and can't get past this problem; kindly help.
I'm not going to go through the video tutorial and fix your algorithm, but I can show you why you're getting this error.
In this line:
x_P = x_P_update[m]
You are replacing the whole list x_P with a single float, which you then attempt to index as a list in the outer loop. Updating one element instead will get rid of the error:
x_P[m] = x_P_update[m]
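A minimal illustration of the difference (hypothetical values):

x_P = [1.0, 2.0, 3.0]
x_P = 2.5      # rebinds the name to a float; x_P[0] would now raise an error
x_P = [1.0, 2.0, 3.0]
x_P[1] = 2.5   # updates one element; x_P is still a list and can be indexed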