Related
I would like to evaluate a 4d Gaussian / normal distribution on a 4d grid. Let's call the variables (x1,y1,x2,y2). Then if I have means = (x1=1,y1=0,x2=2,y2=0), I expect that when I do a 2d contour plot in the x1, x2 direction, at y1=y2=0, to see a Gaussian centered in (x1=1, x2=2). However, I see the mean/center at (x1=2,x2=0) instead.
What am I missing here? Is it how I define the grid to begin with?
For a 2d normal distribution it works as expected.
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import multivariate_normal
xy_min = -5
xy_max = 5
npoints = 50
x = np.linspace(xy_min, xy_max, npoints)
dim = 4
xx1,yy1,xx2,yy2 = np.meshgrid(x, x,x,x)
points = np.concatenate([xx1[:, :,:, :,None], yy1[:, :, :,:,None],xx2[:, :, :,:,None],yy2[:, :, :,:,None]], axis=-1)
cov = np.diag(np.ones(4))
mean=np.array([1,0,2,0])
rv = multivariate_normal.pdf(points , mean=mean, cov=cov)
plt.figure()
plt.contourf(x, x, rv[:,0,:,0])
I tried to manually reshape the evaluation points first, but it gives the same results. So I think I am missing something conceptually here?
points_resh = np.reshape(points,[npoints**4,dim],order='C')
rv_resh = multivariate_normal.pdf(points_resh , mean=mean, cov=cov)
rv2 = np.reshape(rv_resh,[npoints,npoints,npoints,npoints],order='C')
plt.figure()
plt.contourf(x, x, rv2[:,0,:,0])
** EDIT: SOLVED **
using ij indexing for meshgrid everything works as expected. Only need to keep in mind that the matrix needs to be transposed for contour plotting. See example below:
#%% Instead use ij indexing
x = np.linspace(-5, 5, 50)
y = np.linspace(-3, 3, 30)
z= np.linspace(-2, 2, 20)
w= np.linspace(-1, 1, 10)
x4d,y4d,z4d,w4d= np.meshgrid(x, y,z,w,indexing='ij')
points4d= np.concatenate([x4d[:, :,:,:,None], y4d[:, :,:,:,None], z4d[:, :,:,:,None],w4d[:, :,:,:,None]], axis=-1)
rv4d = multivariate_normal.pdf(points4d , mean=[1,0.0,2,0.0], cov=[0.1,0.1,0.1,0.1])
fig,ax=plt.subplots()
ax.contourf(x,z,rv4d[:,0,:,0].T)
ax.set(xlabel='x',ylabel='y')
print(x_mean)
using ij indexing for meshgrid everything works as expected. Only need to keep in mind that the matrix needs to be transposed for contour plotting. See example below:
#%% Instead use ij indexing
x = np.linspace(-5, 5, 50)
y = np.linspace(-3, 3, 30)
z= np.linspace(-2, 2, 20)
w= np.linspace(-1, 1, 10)
x4d,y4d,z4d,w4d= np.meshgrid(x, y,z,w,indexing='ij')
points4d= np.concatenate([x4d[:, :,:,:,None], y4d[:, :,:,:,None], z4d[:, :,:,:,None],w4d[:, :,:,:,None]], axis=-1)
rv4d = multivariate_normal.pdf(points4d , mean=[1,0.0,2,0.0], cov=[0.1,0.1,0.1,0.1])
fig,ax=plt.subplots()
ax.contourf(x,z,rv4d[:,0,:,0].T)
ax.set(xlabel='x',ylabel='y')
print(x_mean)
I am getting this error:
ValueError: could not broadcast input array from shape (4,1) into shape (4,)
from my script below, which is trying to integrate some differential equations using interpolated data from a dataset. It might seem slightly nonsensical, since the integrated data will be the same as simply interpolating, but in a bigger project this seemed to me to be the best way to access the rate of change of these variables inside the integration function.
from scipy.integrate import solve_ivp
import numpy as np
import scipy.interpolate
def interpolateVariables(VariableData,t):
FSH_interp = scipy.interpolate.interp1d(VariableData[:,4], VariableData[:,[0]], axis = 0)
LH_interp = scipy.interpolate.interp1d(VariableData[:,4], VariableData[:,[1]], axis = 0)
E2_interp = scipy.interpolate.interp1d(VariableData[:,4], VariableData[:,[2]], axis = 0)
P4_interp = scipy.interpolate.interp1d(VariableData[:,4], VariableData[:,[3]], axis = 0)
return FSH_interp(t), LH_interp(t), E2_interp(t), P4_interp(t)
def simulate_model(VariableData):
tspan = [0,24]
InitialValues = [0.1,0.1,0.1,0.1]
result = solve_ivp(fun = lambda t, y: func(t,y, VariableData), t_span = tspan, y0 = InitialValues, method = "RK45")
return result
def func(t, y, VariableData):
FSH_int, LH_int, E2_int, P4_int = interpolateVariables(VariableData, t)
dFSH = FSH_int - y[0]
dLH = LH_int - y[1]
dE2 = E2_int - y[2]
dP4 = P4_int - y[3]
dy = [dFSH, dLH, dE2, dP4]
print(dy)
return dy
Variable_data = np.array([[6.5000000e+00, 4.8000000e+00, 1.9721760e+01, 1.8870000e-01, 0.0000000e+00],
[6.2000000e+00, 4.1000000e+00, 2.9065080e+01, 1.8870000e-01, 2.0000000e+00],
[7.4000000e+00, 4.3000000e+00, 3.8353920e+01, 1.8870000e-01, 4.0000000e+00],
[6.1000000e+00, 4.9000000e+00, 4.8596160e+01, 1.8870000e-01, 6.0000000e+00],
[4.8000000e+00, 5.2000000e+00, 1.0830624e+02, 3.4595000e-01, 8.0000000e+00],
[3.6000000e+00, 6.0000000e+00, 1.8822840e+02, 2.2015000e-01, 1.0000000e+01],
[1.2900000e+01, 4.8300000e+01, 2.6142228e+02, 7.5480000e-01, 1.2000000e+01],
[6.3000000e+00, 7.2000000e+00, 6.4994640e+01, 3.7111000e+00, 1.4000000e+01],
[4.0000000e+00, 5.9000000e+00, 1.8024708e+02, 1.7769250e+01, 1.8000000e+01],
[3.2000000e+00, 5.3000000e+00, 2.0506272e+02, 1.8272450e+01, 2.0000000e+01],
[2.9000000e+00, 3.0000000e+00, 1.4941140e+02, 1.3680750e+01, 2.2000000e+01],
[3.4000000e+00, 4.8000000e+00, 8.6241840e+01, 3.1450000e+00, 2.4000000e+01]])
test = simulate_model(Variable_data)
I'm trying to make a fit using matplotlib.psd function. My datafile has 8 columns with displacement and speed for a particle (positionX, positionY, positionZ, AveragePositionXYZ, speedX, speedY, speedZ, AverageSpeedXYZ). Using the positionX for example, I try to get the Power Spectrum with matplotlib.psd:
power, freqs = plt.psd(data, len(data), Fs = 256, scale_by_freq=True, return_line=0)
Then, I try to make a curve fitting using linear regression with scipy stas.linregress:
slope, inter, r2, p, stderr = stats.linregress(x, y)
However, my results are very bad. I try to plot with:
line = (inter + slope * (10 * np.log10(freqs)))
plt.semilogx(freqs, line)
plt.show()
And get the following image:
I know that I have a lot of mistakes, and I try to get some solutions in the web. However, I have not had much success. So, I'm asking if there's someone here that could help me.
The datafile has the following format (first 10 lines):
1.50000000,0.00000000,0.00000000,0.50000000,0.00000000,0.00000000,0.00000000,0.00000000
1.49788889,0.00000000,0.00000000,0.49929630,-0.06333333,0.00000000,0.00000000,-0.02111111
1.49367078,0.00000005,0.00000000,0.49789028,-0.12654314,0.00000165,0.00000000,-0.04218050
1.48735391,0.00000027,0.00000000,0.49578473,-0.18950635,0.00000659,0.00000000,-0.06316659
1.47895054,0.00000082,0.00000000,0.49298379,-0.25210085,0.00001647,0.00000000,-0.08402813
1.46847701,0.00000192,0.00000000,0.48949298,-0.31420588,0.00003296,0.00000000,-0.10472431
1.45595360,0.00000385,0.00000000,0.48531915,-0.37570257,0.00005769,0.00000000,-0.12521496
1.44140445,0.00000692,0.00000000,0.48047046,-0.43647431,0.00009232,0.00000000,-0.14546066
1.42485754,0.00001154,0.00000000,0.47495636,-0.49640723,0.00013851,0.00000000,-0.16542291
1.40634452,0.00001814,0.00000000,0.46878755,-0.55539066,0.00019789,0.00000000,-0.18506426
My complete Python code is as follows:
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
filename = 'datafile.txt'
# Load data
file = np.genfromtxt(filename,
skip_header = 0,
skip_footer = 0,
delimiter = ',',
dtype = 'float32',
filling_values = 0,
usecols = (0, 1, 2, 3, 4, 5, 6, 7),
names = ['posX', 'posY', 'posZ', 'posMedias', 'velX', 'velY', 'velZ', 'velMedias'])
# Map values
posX = file['posX']
posY = file['posY']
posZ = file['posZ']
posMedia = file['posMedias']
velX = file['velX']
velY = file['velY']
velZ = file['velZ']
velMedia = file['velMedias']
# Column data that will be used
data = posMedia
# PSD calculation
power, freqs = plt.psd(data, len(data), Fs = 256, scale_by_freq=True, return_line=0)
# Linear fit
x = np.log10(freqs[1:])
y = np.log10(power[1:])
slope, inter, r2, p, stderr = stats.linregress(x, y)
print(slope, inter)
# Plot
line = (inter + slope * (10 * np.log10(freqs)))
plt.semilogx(freqs, line)
plt.show()
Thank you so much!
The griding the data (d) in irregular grid (x and y) using Scipy's griddata is timecomsuing when the datasets are many. But, the longitudes and latitudes (x and y) are always same, only the data (d) are changing. In this case, once using the giddata, how to repeat the procedure with different d arrys to achieve faster result?
import numpy as np, matplotlib.pyplot as plt
from scipy.interpolate import griddata
x = np.array([110, 112, 114, 115, 119, 120, 122, 124]).astype(float)
y = np.array([60, 61, 63, 67, 68, 70, 75, 81]).astype(float)
d = np.array([4, 6, 5, 3, 2, 1, 7, 9]).astype(float)
ulx, lrx = np.min(x), np.max(x)
uly, lry = np.max(y), np.min(y)
xi = np.linspace(ulx, lrx, 15)
yi = np.linspace(uly, lry, 15)
grided_data = griddata((x, y), d, (xi.reshape(1,-1), yi.reshape(-1,1)), method='nearest',fill_value=0)
plt.imshow(grided_data)
plt.show()
The above code works for one array of d.
But I have hundreds of other arrays.
griddata with nearest ends up using NearestNDInterpolator. That's a class that creates an iterator, which is called with the xi:
elif method == 'nearest':
ip = NearestNDInterpolator(points, values, rescale=rescale)
return ip(xi)
So you could create your own NearestNDInterpolator and call it with multiple times with different xi.
But I think in your case you want to change the values. Looking at the code for that class I see
self.tree = cKDTree(self.points)
self.values = y
the __call__ does:
dist, i = self.tree.query(xi)
return self.values[i]
I don't know the relative cost of creating the tree versus query.
So it should be easy to change values between uses of __call__. And it looks like values could have multiple columns, since it's just indexing on the 1st dimension.
This interpolator is simple enough that you could write your own using the same tree idea.
Here's a Nearest Interpolator that lets you repeat the interpolation for the same points, but different z values. I haven't done timings yet to see how much time it saves
class MyNearest(interpolate.NearestNDInterpolator):
# normal interpolation, but returns the near neighbor indices as well
def __call__(self, *args):
xi = interpolate.interpnd._ndim_coords_from_arrays(args, ndim=self.points.shape[1])
xi = self._check_call_shape(xi)
xi = self._scale_x(xi)
dist, i = self.tree.query(xi)
return i, self.values[i]
def my_griddata(points, values, method='linear', fill_value=np.nan,
rescale=False):
points = interpolate.interpnd._ndim_coords_from_arrays(points)
if points.ndim < 2:
ndim = points.ndim
else:
ndim = points.shape[-1]
assert(ndim==2)
# simplified call for 2d 'nearest'
ip = MyNearest(points, values, rescale=rescale)
return ip # ip(xi) # return iterator, not values
ip = my_griddata((xreg, yreg), z, method='nearest',fill_value=0)
print(ip)
xi = (xi.reshape(1,-1), yi.reshape(-1,1))
I, data = ip(xi)
print(data.shape)
print(I.shape)
print(np.allclose(z[I],data))
z1 = xreg+yreg # new z data
data = z1[I] # should show diagonal color bars
So as long as z has the same shape as before (and as xreg), z[I] will return the nearest value for each xi.
And it can interpolated 2d data as well (e.g. (225,n) shaped)
z1 = np.array([xreg+yreg, xreg-yreg]).T
print(z1.shape) # (225,2)
data = z1[I]
print(data.shape) # (20,20,2)
I am trying to plot an array of x and y values and keep getting this error.
ValueError: x and y must have same first dimension
This is my code:
import numpy as np
import pylab as plt
from matplotlib import rc
def analyze(targt_data, targt_data_name, trang_data, trang_data_name, matches):
"""Analyze a set of samples on target data"""
_timefrm = [40, 80, 120]
_scorefilter = 0.8
index = 0
matches = matches[np.where(matches[:, 3] > _scorefilter)]
# PLOTS
rc('text', usetex=True)
fig = plt.figure()
plt1 = fig.add_subplot(321)
plt1.hold(True)
plt2 = fig.add_subplot(322)
plt3 = fig.add_subplot(323)
plt4 = fig.add_subplot(324)
plt5 = fig.add_subplot(325)
plt6 = fig.add_subplot(326)
matches = matches[np.where(matches[:, 2] == index)]
avg_score = np.mean(matches[:, 3])
# PLOT SAMPLE
plt1.plot(trang_data[index])
rwresults = [targt_data[y-1:y+np.max(_timefrm)] for y in matches[:,1]]
pctresults = [np.log(np.divide(y[1:], y[0])) for y in rwresults]
for res in pctresults:
plt1.plot(np.arange(len(trang_data[index]),
len(trang_data[index])+np.max(_timefrm)),
np.dot(trang_data[index][-1], np.add(res, 1)))
plt.show()
results_name = raw_input('Load matching scores: ')
# #### LOAD MATCHING SCORES FROM DB
results, training_data_name, target_data_name = Results(DB).load_matching_scores(results_name)
# #### LOAD TARGET DATA AND TRAINING DATA
target_data = TargetData(DB).load(target_data_name)
training_data = TrainingData(DB).load(training_data_name)
# #### RUN ANALYSIS
analyze(target_data, target_data_name, training_data, training_data_name, results)
Also, here are the values printed out:
(Pdb) len(np.dot(trang_data[ns.index][-1], np.add(pctresults[0], 1)))
120
(Pdb) len(np.arange(len(trang_data[ns.index]), len(trang_data[ns.index])+np.max(_timefrm)))
120
(Pdb) np.dot(trang_data[ns.index][-1], np.add(pctresults[0], 1)).shape
(120,)
(Pdb) np.arange(len(trang_data[ns.index]), len(trang_data[ns.index])+np.max(_timefrm)).shape
(120,)
It turns out one of the subarrays was too short:
(Pdb) len(pctresults[71])
100
The value error "x and y must have same first dimension" is raised by the plot(x, y) method when x and y are not of the same length.