Geoviews FilledContours: keeping filled colours but removing contour lines - python

I would like to plot something that resembles a kdeplot using geoviews, without actually plotting the contour lines. The geoplot library supports this kind of plot.
How can I make such a plot in geoviews?
Here is a very basic example of the kind of kdeplot I manage to generate via geoviews, which by default draws black lines separating the different intensity levels:
import geoviews.tile_sources as gts
import geoviews as gv
import numpy as np
from sklearn.neighbors import KernelDensity
gv.extension('bokeh')
np.random.seed(2021)
# Define extent of GPS coordinates
xmean = -12.015358
ymean = -76.990665
xmin, xmax = xmean*0.9, xmean*1.1
ymin, ymax = ymean*0.9, ymean*1.1
xrange = np.linspace(xmin, xmax, num=1000)
yrange = np.linspace(ymin, ymax, num=1000)
# Sample GPS coordinates
latlon = np.vstack([np.random.choice(xrange, 100), np.random.choice(yrange, 100)]).T
# Fit a gaussian kernel
kde = KernelDensity(bandwidth=0.03)
kde.fit(latlon)
# Apply gaussian kernel on grid
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
Z = kde.score_samples(positions.T).reshape(X.shape)
# Define Map
kde_plot = gv.FilledContours((Y, X, Z)).opts(cmap='PuBu', fill_alpha=0.5)
background_plot = gts.CartoLight
geomap = (kde_plot * background_plot).opts(width=800, height=550, xaxis=None, yaxis=None)
geomap
I could not find any parameter settings in gv.FilledContours that remove these lines.

The option you need is line_color; in your case, set it to None.
Applying the change to this line of code
kde_plot = gv.FilledContours((Y, X, Z)).opts(cmap='PuBu', fill_alpha=0.5, line_color=None)
you will get the same filled contours without the separating lines.

Related

What could be causing incorrect 2-D interpolation in SciPy?

I have a rectilinear (not regular) grid of data (x, y, V), where V is the value at position (x, y). I would like to use this data source to interpolate my results so that I can fill in the gaps and plot the interpolated values (inside the range) later. (I also need griddata-like functionality to query arbitrary values inside the range.)
I looked at the documentation at SciPy and here.
Here is what I tried, and the result; it clearly doesn't match the data.
# INTERPOLATION ATTEMPT?
from scipy.interpolate import Rbf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
edges = np.linspace(-0.05, 0.05, 100)
centers = edges[:-1] + np.diff(edges[:2])[0] / 2.
XI, YI = np.meshgrid(centers, centers)
# use RBF
rbf = Rbf(x, y, z, epsilon=2)
ZI = rbf(XI, YI)
# plot the result
plt.subplots(1,figsize=(12,8))
X_edges, Y_edges = np.meshgrid(edges, edges)
lims = dict(cmap='viridis')
plt.pcolormesh(X_edges, Y_edges, ZI, shading='flat', **lims)
plt.scatter(x, y, 200, z, edgecolor='w', lw=0.1, **lims)
#decoration
plt.title('RBF interpolation?')
plt.xlim(-0.05, 0.05)
plt.ylim(-0.05, 0.05)
plt.colorbar()
plt.show()
For reference, here is my data (extracted); it has a circular pattern that I need the interpolation to recognize.
#DATA
experiment1raw = np.array([
[0,40,1,11.08,8.53,78.10,2.29],
[24,-32,2,16.52,11.09,69.03,3.37],
[8,-32,4,14.27,10.68,71.86,3.19],
[-8,-32,6,10.86,9.74,76.69,2.72],
[-24,-32,8,6.72,12.74,77.08,3.45],
[32,-24,9,18.49,13.67,64.32,3.52],
[-32,-24,17,6.72,12.74,77.08,3.45],
[16,-16,20,13.41,21.33,59.92,5.34],
[0,-16,22,12.16,14.67,69.04,4.12],
[-16,-16,24,9.07,13.37,74.20,3.36],
[32,-8,27,19.35,17.88,57.86,4.91],
[-32,-8,35,6.72,12.74,77.08,3.45],
[40,0,36,19.25,20.36,54.97,5.42],
[16,0,39,13.41,21.33,59.952,5.34],
[0,0,41,10.81,19.55,64.37,5.27],
[-16,0,43,8.21,17.83,69.34,4.62],
[-40,0,46,5.76,13.43,77.23,3.59],
[32,8,47,15.95,23.61,54.34,6.10],
[-32,8,55,5.97,19.09,70.19,4.75],
[16,16,58,11.27,26.03,56.36,6.34],
[0,16,60,9.19,24.94,60.06,5.79],
[-16,16,62,7.10,22.75,64.57,5.58],
[32,24,65,12.39,29.19,51.17,7.26],
[-32,24,73,5.40,24.55,64.33,5.72],
[24,32,74,10.03,31.28,50.96,7.73],
[8,32,76,8.68,30.06,54.34,6.92],
[-8,32,78,6.88,28.78,57.84,6.49],
[-24,32,80,5.83,26.70,61.00,6.46],
[0,-40,81,7.03,31.55,54.40,7.01],
])
#Atomic Percentages are set here
Cr1 = experiment1raw[:,3]
Mn1 = experiment1raw[:,4]
Fe1 = experiment1raw[:,5]
Co1 = experiment1raw[:,6]
#COORDINATE VALUES IN PRE-T
x_pret = experiment1raw[:,0]/1000
y_pret = experiment1raw[:,1]/1000
#important translation
x = -y_pret
y = -x_pret
z = Cr1
You used too large an epsilon in Rbf. Your best bet is to leave it at the default and let scipy calculate an appropriate value. See the implementation here.
So, setting the default epsilon:
rbf = Rbf(x, y, z)
I got a pretty good interpolation for your data (subjective opinion).
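As a hedged sketch (the side-by-side comparison is mine, reusing the x, y, z arrays defined above), this contrasts the default epsilon with the hard-coded epsilon=2:
from scipy.interpolate import Rbf
import numpy as np
import matplotlib.pyplot as plt

edges = np.linspace(-0.05, 0.05, 100)
centers = edges[:-1] + np.diff(edges[:2])[0] / 2.
XI, YI = np.meshgrid(centers, centers)
X_edges, Y_edges = np.meshgrid(edges, edges)

fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
for ax, eps in zip(axes, [None, 2]):
    # epsilon=None lets scipy derive a default from the average node distance
    rbf = Rbf(x, y, z, epsilon=eps)
    ZI = rbf(XI, YI)
    ax.pcolormesh(X_edges, Y_edges, ZI, shading='flat', cmap='viridis')
    ax.scatter(x, y, 200, z, cmap='viridis', edgecolor='w', lw=0.1)
    ax.set_title('epsilon=%s' % ('default' if eps is None else eps))
plt.show()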

Mayavi: How can I flip the Z-axis of a scene?

I am trying to flip the Z-axis in the mayavi volumetric 3D plot. I figured out how to rotate the camera, etc., but that is not what I want. I just want to flip the direction of the Z-axis, without manipulating the data itself.
#Minimum working example
import numpy as np
from mayavi import mlab
x, y, z = np.ogrid[-5:5:64j, -5:5:64j, -5:5:64j] #Generate XYZ
data = np.arange(x.shape[0])
x = x.ravel()
y = y.ravel()
z = z.ravel()
mlab.points3d(x, y, z, data) #Produce volumetric plot
mlab.axes(xlabel='X', ylabel='Y', zlabel='Z') #Display axis
mlab.orientation_axes()
mlab.show()
Could you please explain what you mean, using non-symmetric data, with this example?
Do you want negative z to be at the top side?
And why does rotating the camera not produce the result you want to see?
You can add the code from the macro editor (explained below).
import numpy as np
from mayavi import mlab
x, y, z = np.ogrid[-5:5:64j, -5:5:64j, -5:5:64j] #Generate XYZ
data = np.arange(x.shape[0])
x = x.ravel()
y = y.ravel()
z = z.ravel()
# Recorded script from Mayavi2
from numpy import array
try:
    engine = mayavi.engine
except (AttributeError, NameError):
    from mayavi.api import Engine
    engine = Engine()
    engine.start()
if len(engine.scenes) == 0:
    engine.new_scene()
# -------------------------------------------
scene = engine.scenes[0]
scene.scene.camera.position = [20.68813263960946, 20.334388554161922, 20.518300376103046]
scene.scene.camera.focal_point = [0.24373197555541992, 0.24373197555541992, 0.25]
scene.scene.camera.view_angle = 30.0
scene.scene.camera.view_up = [-0.41179533881878827, -0.4046701524210215, 0.81649658092772626]
scene.scene.camera.clipping_range = [15.729834995160559, 58.864284541884331]
scene.scene.camera.compute_view_plane_normal()
scene.scene.render()
mlab.points3d(x, y, z, data) #Produce volumetric plot
mlab.axes(xlabel='X', ylabel='Y', zlabel='Z') #Display axis
mlab.orientation_axes()
mlab.show()
If you can set the view you want manually, I would just do that.
To get the correct coordinates to pass to mlab.view(), read them from the interactive plot while rotating the scene, as sketched below.
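A minimal sketch, assuming you only need the built-in camera queries (mlab.view() with no arguments returns the current azimuth, elevation, distance and focal point, and mlab.roll() the current roll):
from mayavi import mlab

# After rotating the scene by hand, query the current camera settings
azimuth, elevation, distance, focalpoint = mlab.view()
roll = mlab.roll()
print(azimuth, elevation, distance, focalpoint, roll)

# Later, restore exactly that view programmatically
mlab.view(azimuth=azimuth, elevation=elevation,
          distance=distance, focalpoint=focalpoint)
mlab.roll(roll)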

Colormap with colored quiver

I am plotting a map with arrows on top of it. These arrows represent wind directions, average wind speed (per direction), and the occurrence (per direction).
The direction is indicated by the direction of the arrow. The length of the arrow indicates the average wind speed in that direction. The color of the arrow indicates the occurrence of winds in such a direction.
This all works fine with the script below:
windData = pd.read_csv(src+'.txt', sep='\t', names=['lat', 'lon', 'wind_dir_start', 'wind_dir_end', 'total_num_data_points', 'num_data_points', 'avg_windspeed']).dropna()
# plot map
m = Basemap(llcrnrlon=minLon, llcrnrlat=minLat, urcrnrlon=maxLon, urcrnrlat=maxLat, resolution='i')
Left, Bottom = m(minLon, minLat)
Right, Top = m(maxLon, maxLat)
# get x y
x, y = m(windData['lon'], windData['lat'])
# angles
angleStart = -windData['wind_dir_start']+90
angleStart[angleStart<0] = np.radians(angleStart[angleStart<0]+360.)
angleEnd = -windData['wind_dir_end']+90
angleEnd[angleEnd<0] = np.radians(angleEnd[angleEnd<0]+360.)
angle = angleStart + math.radians(binSize/2.)
xux = np.cos(angle) * windData['avg_windspeed']
yuy = np.sin(angle) * windData['avg_windspeed']
# occurence
occurence = (windData['num_data_points']/windData['total_num_data_points'])
xi = np.linspace(minLon, maxLon, 300)
yi = np.linspace(minLat, maxLat, 300)
# plotting
## xux and yuy are used negatively because they are measured as "coming from" and displayed as "going to"
# To make things more readable I left a threshold for the occurence out
# I usually plot x, y, xux, yuy and the colors as var[occurence>threshold]
Q = m.quiver(x, y, -xux, -yuy, scale=75, zorder=6, color=cm.jet, width=0.0003*Width, cmap=cm.jet)
qk = plt.quiverkey(Q, 0.5, 0.92, 3, r'$3 \frac{m}{s}$', labelpos='S', fontproperties={'weight': 'bold'})
m.scatter(x, y, c='k', s=20*np.ones(len(x)), zorder=10, vmin=4.5, vmax=39.)
This plot shows the arrows well, but now I want to add a colorbar that indicates the percentage of occurrence next to the plot. How would I do this?
OK. Usual imports, plus matplotlib itself:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
Fake the data to be plotted (thanks for the MCVE):
NP = 10
np.random.seed(1)
x = np.random.random(NP)
y = np.random.random(NP)
angle = 1.07+np.random.random(NP) # NE to NW
velocity = 1.50+np.random.random(NP)
o = np.random.random(NP)
occurrence = o/np.sum(o)
dx = np.cos(angle)*velocity
dy = np.sin(angle)*velocity
Create a mappable so that Matplotlib has no reason to complain "RuntimeError: No mappable was found to use for colorbar creation."
norm = matplotlib.colors.Normalize()
norm.autoscale(occurrence)
cm = matplotlib.cm.copper
sm = matplotlib.cm.ScalarMappable(cmap=cm, norm=norm)
sm.set_array([])
and plot the data
plt.quiver(x, y, dx, dy, color=cm(norm(o)))
plt.colorbar(sm)
plt.show()
References:
A logarithmic colorbar in matplotlib scatter plot
Drawing a colorbar aside a line plot, using Matplotlib
Different colours for arrows in quiver plot
P.S. In recent Matplotlib releases (certainly in 3.x) the sm.set_array incantation is no longer necessary.
Do you want the colorbar to show the different wind speeds? If so, it might be sufficient to place plt.colorbar() between the lines Q = m.quiver(...) and qk = ....
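A hedged alternative sketch (not from the original answer): quiver accepts a scalar array as its fifth positional argument C, and the returned Quiver instance is itself a mappable that plt.colorbar accepts directly. Reusing the fake data above:
import matplotlib.pyplot as plt

# C argument: quiver colors the arrows by these values through the cmap
Q = plt.quiver(x, y, dx, dy, occurrence, cmap='copper')
plt.colorbar(Q, label='occurrence')
plt.show()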

Quiver basemap plot - how to make the quiver density latitude-dependent?

I want to do a quiver plot on a polar basemap plot. I have a regular lat/lon grid, and because there are more grid boxes at the higher latitudes, my code plots as many quivers at the pole as on the equator, so they overlap etc. How can I make the density of quivers latitude-dependent?
This is the code I use
import numpy as np
from mpl_toolkits.basemap import Basemap, addcyclic
import matplotlib.pyplot as plt
m_mu = Basemap(projection='npstere',boundinglat=10,lon_0=0,resolution='l',round=True)
lats=np.arange(0.,91.,15.)
lons=np.arange(-180.,181.,30.)
valin_u = np.random.randn(len(lats), len(lons))
valin_v = np.random.randn(len(lats), len(lons))
valin = np.sqrt( valin_u**2 + valin_v**2 )
mu_cyclic, lons_cyclic = addcyclic(valin, lons)
mu_cyclic_u, lons_cyclic = addcyclic(valin_u, lons)
mu_cyclic_v, lons_cyclic = addcyclic(valin_v, lons)
grid = np.meshgrid( lons_cyclic, lats )
x, y = m_mu( *grid)
plt.figure()
cs = m_mu.pcolormesh(x, y, mu_cyclic)
csquiv = m_mu.quiver(x[::1,::1], y[::1,::1], mu_cyclic_u[::1,::1], mu_cyclic_v[::1,::1])
plt.show()
I guess I could write a function to set a latitude-dependent number of the values to .nan, but hopefully there is a better solution?
Many thanks for your help :-)
Sabine
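A minimal sketch of the thinning idea mentioned in the question (the cos-latitude stride is an assumption, not a tested recipe): replace the single m_mu.quiver call above with one call per latitude row, using a longitude stride that grows toward the pole:
import numpy as np

for i, lat in enumerate(lats):
    # stride ~ 1/cos(lat): rows near the pole keep fewer arrows
    stride = max(1, int(round(1.0 / max(np.cos(np.radians(lat)), 1e-3))))
    m_mu.quiver(x[i, ::stride], y[i, ::stride],
                mu_cyclic_u[i, ::stride], mu_cyclic_v[i, ::stride])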

Random Number from Histogram

Suppose I create a histogram using scipy/numpy, so I have two arrays: one for the bin counts, and one for the bin edges. If I use the histogram to represent a probability distribution function, how can I efficiently generate random numbers from that distribution?
It's probably what np.random.choice does in @Ophion's answer, but you can construct a normalized cumulative distribution function, then choose based on a uniform random number:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
data = np.random.normal(size=1000)
hist, bins = np.histogram(data, bins=50)
bin_midpoints = bins[:-1] + np.diff(bins)/2
cdf = np.cumsum(hist)
cdf = cdf / cdf[-1]
values = np.random.rand(10000)
value_bins = np.searchsorted(cdf, values)
random_from_cdf = bin_midpoints[value_bins]
plt.subplot(121)
plt.hist(data, 50)
plt.subplot(122)
plt.hist(random_from_cdf, 50)
plt.show()
A 2D case can be done as follows:
data = np.column_stack((np.random.normal(scale=10, size=1000),
                        np.random.normal(scale=20, size=1000)))
x, y = data.T
hist, x_bins, y_bins = np.histogram2d(x, y, bins=(50, 50))
x_bin_midpoints = x_bins[:-1] + np.diff(x_bins)/2
y_bin_midpoints = y_bins[:-1] + np.diff(y_bins)/2
cdf = np.cumsum(hist.ravel())
cdf = cdf / cdf[-1]
values = np.random.rand(10000)
value_bins = np.searchsorted(cdf, values)
x_idx, y_idx = np.unravel_index(value_bins,
                                (len(x_bin_midpoints),
                                 len(y_bin_midpoints)))
random_from_cdf = np.column_stack((x_bin_midpoints[x_idx],
                                   y_bin_midpoints[y_idx]))
new_x, new_y = random_from_cdf.T
plt.subplot(121, aspect='equal')
plt.hist2d(x, y, bins=(50, 50))
plt.subplot(122, aspect='equal')
plt.hist2d(new_x, new_y, bins=(50, 50))
plt.show()
@Jaime's solution is great, but you should consider using a KDE (kernel density estimate) of the histogram. A great explanation of why it's problematic to do statistics over a histogram, and why you should use a KDE instead, can be found here.
I edited @Jaime's code to show how to use the KDE from scipy. It looks almost the same, but better captures the distribution of the histogram generator.
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
def run():
    data = np.random.normal(size=1000)
    hist, bins = np.histogram(data, bins=50)

    x_grid = np.linspace(min(data), max(data), 1000)
    kdepdf = kde(data, x_grid, bandwidth=0.1)
    random_from_kde = generate_rand_from_pdf(kdepdf, x_grid)

    bin_midpoints = bins[:-1] + np.diff(bins) / 2
    random_from_cdf = generate_rand_from_pdf(hist, bin_midpoints)

    plt.subplot(121)
    plt.hist(data, 50, density=True, alpha=0.5, label='hist')
    plt.plot(x_grid, kdepdf, color='r', alpha=0.5, lw=3, label='kde')
    plt.legend()
    plt.subplot(122)
    plt.hist(random_from_cdf, 50, alpha=0.5, label='from hist')
    plt.hist(random_from_kde, 50, alpha=0.5, label='from kde')
    plt.legend()
    plt.show()

def kde(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scipy"""
    kde = gaussian_kde(x, bw_method=bandwidth / x.std(ddof=1), **kwargs)
    return kde.evaluate(x_grid)

def generate_rand_from_pdf(pdf, x_grid):
    cdf = np.cumsum(pdf)
    cdf = cdf / cdf[-1]
    values = np.random.rand(1000)
    value_bins = np.searchsorted(cdf, values)
    random_from_cdf = x_grid[value_bins]
    return random_from_cdf

# run the example
run()
Perhaps something like this: it uses the counts of the histogram as weights and chooses bin values based on those weights.
import numpy as np

initial = np.random.rand(1000)
values, indices = np.histogram(initial, bins=20)
values = values.astype(np.float32)
weights = values/np.sum(values)

# Below, 5 is the size of the returned array.
new_random = np.random.choice(indices[1:], 5, p=weights)
print(new_random)
# [0.55141614 0.30226256 0.25243184 0.90023117 0.55141614]
I had the same problem as the OP and I would like to share my approach.
Following Jaime's answer and Noam Peled's answer, I've built a solution for a 2D problem using a Kernel Density Estimation (KDE).
First, let's generate some random data and then calculate its Probability Density Function (PDF) from the KDE. I will use the example available in SciPy for that.
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
def measure(n):
    "Measurement model, return two coupled measurements."
    m1 = np.random.normal(size=n)
    m2 = np.random.normal(scale=0.5, size=n)
    return m1+m2, m1-m2
m1, m2 = measure(2000)
xmin = m1.min()
xmax = m1.max()
ymin = m2.min()
ymax = m2.max()
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([m1, m2])
kernel = stats.gaussian_kde(values)
Z = np.reshape(kernel(positions).T, X.shape)
fig, ax = plt.subplots()
ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
          extent=[xmin, xmax, ymin, ymax])
ax.plot(m1, m2, 'k.', markersize=2)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
The resulting plot shows the estimated density with the sample points overlaid.
Now, we obtain random data from the PDF obtained from the KDE, which is the variable Z.
# Generate the bins for each axis
x_bins = np.linspace(xmin, xmax, Z.shape[0]+1)
y_bins = np.linspace(ymin, ymax, Z.shape[1]+1)
# Find the middle point for each bin
x_bin_midpoints = x_bins[:-1] + np.diff(x_bins)/2
y_bin_midpoints = y_bins[:-1] + np.diff(y_bins)/2
# Calculate the Cumulative Distribution Function (CDF) from the PDF
cdf = np.cumsum(Z.ravel())
cdf = cdf / cdf[-1]  # Normalization
# Create random data
values = np.random.rand(10000)
# Find the data position
value_bins = np.searchsorted(cdf, values)
x_idx, y_idx = np.unravel_index(value_bins,
                                (len(x_bin_midpoints),
                                 len(y_bin_midpoints)))
# Create the new data
new_data = np.column_stack((x_bin_midpoints[x_idx],
                            y_bin_midpoints[y_idx]))
new_x, new_y = new_data.T
And we can calculate the KDE from this new data and plot it.
kernel = stats.gaussian_kde(new_data.T)
new_Z = np.reshape(kernel(positions).T, X.shape)
fig, ax = plt.subplots()
ax.imshow(np.rot90(new_Z), cmap=plt.cm.gist_earth_r,
          extent=[xmin, xmax, ymin, ymax])
ax.plot(new_x, new_y, 'k.', markersize=2)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
Here is a solution that returns datapoints uniformly distributed within each bin instead of at the bin center:
import numpy as np

def draw_from_hist(hist, bins, nsamples=100000):
    cumsum = [0] + list(np.cumsum(hist))
    rand = np.random.rand(nsamples)*max(cumsum)
    return [np.interp(x, cumsum, bins) for x in rand]
A few things do not work well in the solutions suggested by @daniel, @arco-bast, et al.
Taking the last example:
def draw_from_hist(hist, bins, nsamples=100000):
    cumsum = [0] + list(np.cumsum(hist))
    rand = np.random.rand(nsamples)*max(cumsum)
    return [np.interp(x, cumsum, bins) for x in rand]
This assumes that at least the first bin has zero content, which may or may not be true. Secondly, this assumes that the value of the PDF is at the upper bound of the bins, which it isn't - it's mostly in the centre of the bin.
Here's another solution done in two parts
def init_cdf(hist, bins):
    """Initialize CDF from histogram

    Parameters
    ----------
    hist : array-like, float of size N
        Histogram height
    bins : array-like, float of size N+1
        Histogram bin boundaries

    Returns
    -------
    cdf : array-like, float of size N+1
    """
    from numpy import concatenate, diff, cumsum

    # Calculate half bin sizes
    steps = diff(bins) / 2  # Half bin size
    # Calculate slope between bin centres
    slopes = diff(hist) / (steps[:-1]+steps[1:])
    # Find height of end points by linear interpolation
    # - First part is linear interpolation from second over first
    #   point to lowest bin edge
    # - Second part is linear interpolation left neighbor to
    #   right neighbor up to but not including last point
    # - Third part is linear interpolation from second to last point
    #   over last point to highest bin edge
    # Can probably be done more elegantly
    ends = concatenate(([hist[0] - steps[0] * slopes[0]],
                        hist[:-1] + steps[:-1] * slopes,
                        [hist[-1] + steps[-1] * slopes[-1]]))
    # Calculate cumulative sum
    sum = cumsum(ends)
    # Subtract off lower bound and scale by upper bound
    sum -= sum[0]
    sum /= sum[-1]
    # Return the CDF
    return sum
def sample_cdf(cdf, bins, size):
    """Sample a CDF defined at specific points.

    Linear interpolation between defined points.

    Parameters
    ----------
    cdf : array-like, float, size N
        CDF evaluated at all points of bins. First and
        last point of bins are assumed to define the domain
        over which the CDF is normalized.
    bins : array-like, float, size N
        Points where the CDF is evaluated. First and last points
        are assumed to define the end-points of the CDF's domain.
    size : integer, non-zero
        Number of samples to draw

    Returns
    -------
    sample : array-like, float, of size ``size``
        Random sample
    """
    from numpy import interp
    from numpy.random import random

    return interp(random(size), cdf, bins)
# Begin example code
import numpy as np
import matplotlib.pyplot as plt

# initial histogram, coarse binning
hist, bins = np.histogram(np.random.normal(size=1000), np.linspace(-2, 2, 21))

# Calculate CDF, make sample, and new histogram w/finer binning
cdf = init_cdf(hist, bins)
sample = sample_cdf(cdf, bins, 1000)
hist2, bins2 = np.histogram(sample, np.linspace(-3, 3, 61))

# Calculate bin centres and widths
mx = (bins[1:]+bins[:-1])/2
dx = np.diff(bins)
mx2 = (bins2[1:]+bins2[:-1])/2
dx2 = np.diff(bins2)

# Plot, taking care to show uncertainties and so on
plt.errorbar(mx, hist/dx, np.sqrt(hist)/dx, dx/2, '.', label='original')
plt.errorbar(mx2, hist2/dx2, np.sqrt(hist2)/dx2, dx2/2, '.', label='new')
plt.legend()
plt.show()
Sorry, I don't know how to get this to show up in StackOverflow, so copy'n'paste and run to see the point.
I stumbled upon this question when I was looking for a way to generate a random array based on the distribution of another array. If this were in numpy, I would call it a random_like() function.
Then I realized I had written a package, Redistributor, which might do this for me, even though the package was created with a somewhat different motivation (an Sklearn transformer capable of transforming data from an arbitrary distribution to an arbitrary known distribution for machine-learning purposes). Of course I understand unnecessary dependencies are not desired, but at least knowing about this package might be useful to you someday. The thing the OP asked about is basically done under the hood here.
WARNING: under the hood, everything is done in 1D. The package also implements a multidimensional wrapper, but I have not written this example using it, as I find it too niche.
Installation:
pip install git+https://gitlab.com/paloha/redistributor
Implementation:
import numpy as np
import matplotlib.pyplot as plt
def random_like(source, bins=0, seed=None):
    from redistributor import Redistributor
    np.random.seed(seed)
    noise = np.random.uniform(source.min(), source.max(), size=source.shape)
    s = Redistributor(bins=bins, bbox=[source.min(), source.max()]).fit(source.ravel())
    s.cdf, s.ppf = s.source_cdf, s.source_ppf
    r = Redistributor(target=s, bbox=[noise.min(), noise.max()]).fit(noise.ravel())
    return r.transform(noise.ravel()).reshape(noise.shape)
source = np.random.normal(loc=0, scale=1, size=(100,100))
t = random_like(source, bins=80) # More bins more precision (0 = automatic)
# Plotting
plt.figure(figsize=(12,4))
plt.subplot(121); plt.title(f'Distribution of source data, shape: {source.shape}')
plt.hist(source.ravel(), bins=100)
plt.subplot(122); plt.title(f'Distribution of generated data, shape: {t.shape}')
plt.hist(t.ravel(), bins=100); plt.show()
Explanation:
import numpy as np
import matplotlib.pyplot as plt
from redistributor import Redistributor
from sklearn.metrics import mean_squared_error
# We have some source array with "some unknown" distribution (e.g. an image)
# For the sake of example we just generate a random gaussian matrix
source = np.random.normal(loc=0, scale=1, size=(100,100))
plt.figure(figsize=(12,4))
plt.subplot(121); plt.title('Source data'); plt.imshow(source, origin='lower')
plt.subplot(122); plt.title('Source data hist'); plt.hist(source.ravel(), bins=100); plt.show()
# We want to generate a random matrix from the distribution of the source
# So we create a random uniformly distributed array called noise
noise = np.random.uniform(source.min(), source.max(), size=(100,100))
plt.figure(figsize=(12,4))
plt.subplot(121); plt.title('Uniform noise'); plt.imshow(noise, origin='lower')
plt.subplot(122); plt.title('Uniform noise hist'); plt.hist(noise.ravel(), bins=100); plt.show()
# Then we fit (approximate) the source distribution using Redistributor
# This step internally approximates the cdf and ppf functions.
s = Redistributor(bins=200, bbox=[source.min(), source.max()]).fit(source.ravel())
# A little naming workaround to make obj s work as a target distribution
s.cdf = s.source_cdf
s.ppf = s.source_ppf
# Here we create another Redistributor but now we use the fitted Redistributor s as a target
r = Redistributor(target=s, bbox=[noise.min(), noise.max()])
# Here we fit the Redistributor r to the noise array's distribution
r.fit(noise.ravel())
# And finally, we transform the noise into the source's distribution
t = r.transform(noise.ravel()).reshape(noise.shape)
plt.figure(figsize=(12,4))
plt.subplot(121); plt.title('Transformed noise'); plt.imshow(t, origin='lower')
plt.subplot(122); plt.title('Transformed noise hist'); plt.hist(t.ravel(), bins=100); plt.show()
# Computing the difference between the two arrays
print('Mean Squared Error between source and transformed: ', mean_squared_error(source, t))
Mean Squared Error between source and transformed: 2.0574123162302143
