I'd like to make a scatter plot where each point is colored by the spatial density of nearby points.
I've come across a very similar question, which shows an example of this using R:
R Scatter Plot: symbol color represents number of overlapping points
What's the best way to accomplish something similar in python using matplotlib?
In addition to hist2d or hexbin as #askewchan suggested, you can use the same method that the accepted answer in the question you linked to uses.
If you want to do that:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
# Generate fake data
x = np.random.normal(size=1000)
y = x * 3 + np.random.normal(size=1000)
# Calculate the point density
xy = np.vstack([x,y])
z = gaussian_kde(xy)(xy)
fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=100)
plt.show()
If you'd like the points to be plotted in order of density so that the densest points are always on top (similar to the linked example), just sort them by the z-values. I'm also going to use a smaller marker size here as it looks a bit better:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
# Generate fake data
x = np.random.normal(size=1000)
y = x * 3 + np.random.normal(size=1000)
# Calculate the point density
xy = np.vstack([x,y])
z = gaussian_kde(xy)(xy)
# Sort the points by density, so that the densest points are plotted last
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]
fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=50)
plt.show()
Plotting >100k data points?
The accepted answer, using gaussian_kde() will take a lot of time. On my machine, 100k rows took about 11 minutes. Here I will add two alternative methods (mpl-scatter-density and datashader) and compare the given answers with same dataset.
In the following, I used a test data set of 100k rows:
import matplotlib.pyplot as plt
import numpy as np
# Fake data for testing
x = np.random.normal(size=100000)
y = x * 3 + np.random.normal(size=100000)
Output & computation time comparison
Below is a comparison of different methods.
1: mpl-scatter-density
Installation
pip install mpl-scatter-density
Example code
import mpl_scatter_density # adds projection='scatter_density'
from matplotlib.colors import LinearSegmentedColormap
# "Viridis-like" colormap with white background
white_viridis = LinearSegmentedColormap.from_list('white_viridis', [
(0, '#ffffff'),
(1e-20, '#440053'),
(0.2, '#404388'),
(0.4, '#2a788e'),
(0.6, '#21a784'),
(0.8, '#78d151'),
(1, '#fde624'),
], N=256)
def using_mpl_scatter_density(fig, x, y):
ax = fig.add_subplot(1, 1, 1, projection='scatter_density')
density = ax.scatter_density(x, y, cmap=white_viridis)
fig.colorbar(density, label='Number of points per pixel')
fig = plt.figure()
using_mpl_scatter_density(fig, x, y)
plt.show()
Drawing this took 0.05 seconds:
And the zoom-in looks quite nice:
2: datashader
Datashader is an interesting project. It has added support for matplotlib in datashader 0.12.
Installation
pip install datashader
Code (source & parameterer listing for dsshow):
import datashader as ds
from datashader.mpl_ext import dsshow
import pandas as pd
def using_datashader(ax, x, y):
df = pd.DataFrame(dict(x=x, y=y))
dsartist = dsshow(
df,
ds.Point("x", "y"),
ds.count(),
vmin=0,
vmax=35,
norm="linear",
aspect="auto",
ax=ax,
)
plt.colorbar(dsartist)
fig, ax = plt.subplots()
using_datashader(ax, x, y)
plt.show()
It took 0.83 s to draw this:
There is also possibility to colorize by third variable. The third parameter for dsshow controls the coloring. See more examples here and the source for dsshow here.
3: scatter_with_gaussian_kde
def scatter_with_gaussian_kde(ax, x, y):
# https://stackoverflow.com/a/20107592/3015186
# Answer by Joel Kington
xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)
ax.scatter(x, y, c=z, s=100, edgecolor='')
It took 11 minutes to draw this:
4: using_hist2d
import matplotlib.pyplot as plt
def using_hist2d(ax, x, y, bins=(50, 50)):
# https://stackoverflow.com/a/20105673/3015186
# Answer by askewchan
ax.hist2d(x, y, bins, cmap=plt.cm.jet)
It took 0.021 s to draw this bins=(50,50):
It took 0.173 s to draw this bins=(1000,1000):
Cons: The zoomed-in data does not look as good as in with mpl-scatter-density or datashader. Also you have to determine the number of bins yourself.
5: density_scatter
The code is as in the answer by Guillaume.
It took 0.073 s to draw this with bins=(50,50):
It took 0.368 s to draw this with bins=(1000,1000):
Also, if the number of point makes KDE calculation too slow, color can be interpolated in np.histogram2d [Update in response to comments: If you wish to show the colorbar, use plt.scatter() instead of ax.scatter() followed by plt.colorbar()]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize
from scipy.interpolate import interpn
def density_scatter( x , y, ax = None, sort = True, bins = 20, **kwargs ) :
"""
Scatter plot colored by 2d histogram
"""
if ax is None :
fig , ax = plt.subplots()
data , x_e, y_e = np.histogram2d( x, y, bins = bins, density = True )
z = interpn( ( 0.5*(x_e[1:] + x_e[:-1]) , 0.5*(y_e[1:]+y_e[:-1]) ) , data , np.vstack([x,y]).T , method = "splinef2d", bounds_error = False)
#To be sure to plot all data
z[np.where(np.isnan(z))] = 0.0
# Sort the points by density, so that the densest points are plotted last
if sort :
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]
ax.scatter( x, y, c=z, **kwargs )
norm = Normalize(vmin = np.min(z), vmax = np.max(z))
cbar = fig.colorbar(cm.ScalarMappable(norm = norm), ax=ax)
cbar.ax.set_ylabel('Density')
return ax
if "__main__" == __name__ :
x = np.random.normal(size=100000)
y = x * 3 + np.random.normal(size=100000)
density_scatter( x, y, bins = [30,30] )
You could make a histogram:
import numpy as np
import matplotlib.pyplot as plt
# fake data:
a = np.random.normal(size=1000)
b = a*3 + np.random.normal(size=1000)
plt.hist2d(a, b, (50, 50), cmap=plt.cm.jet)
plt.colorbar()
Related
I need to plot a HEATMAP in python using x, y, z data from the excel file.
All the values of z are 1 except at (x=5,y=5). The plot should be red at point (5,5) and blue elsewhere. But I am getting false alarms which need to be removed. The COLORMAP I have used is 'jet'
X=[0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9]
Y=[0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9]
Z=[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
Code I have used is:
import matplotlib.pyplot as plt
import numpy as np
from numpy import ravel
from scipy.interpolate import interp2d
import pandas as pd
import matplotlib as mpl
excel_data_df = pd.read_excel('test.xlsx')
X= excel_data_df['x'].tolist()
Y= excel_data_df['y'].tolist()
Z= excel_data_df['z'].tolist()
x_list = np.array(X)
y_list = np.array(Y)
z_list = np.array(Z)
# f will be a function with two arguments (x and y coordinates),
# but those can be array_like structures too, in which case the
# result will be a matrix representing the values in the grid
# specified by those arguments
f = interp2d(x_list,y_list,z_list,kind="linear")
x_coords = np.arange(min(x_list),max(x_list))
y_coords = np.arange(min(y_list),max(y_list))
z= f(x_coords,y_coords)
fig = plt.imshow(z,
extent=[min(x_list),max(x_list),min(y_list),max(y_list)],
origin="lower", interpolation='bicubic', cmap= 'jet', aspect='auto')
# Show the positions of the sample points, just to have some reference
fig.axes.set_autoscale_on(False)
#plt.scatter(x_list,y_list,400, facecolors='none')
plt.xlabel('X Values', fontsize = 15, va="center")
plt.ylabel('Y Values', fontsize = 15,va="center")
plt.title('Heatmap', fontsize = 20)
plt.tight_layout()
plt.show()
For your ease you can also use the X, Y, Z arrays instead of reading excel file.
The result that I am getting is:
Here you can see dark blue regions at (5,0) and (0,5). These are the FALSE ALARMS I am getting and I need to REMOVE these.
I am probably doing some beginner's mistake. Grateful to anyone who points it out. Regards
There are at least three problems in your example:
x_coords and y_coords are not properly resampled;
the interpolation z does to fill in the whole grid leading to incorrect output;
the output is then forced to be plotted on the original grid (extent) that add to the confusion.
Leading to the following interpolated results:
On what you have applied an extra smoothing with imshow.
Let's create your artificial input:
import matplotlib.pyplot as plt
import numpy as np
x = np.arange(0, 11)
y = np.arange(0, 11)
X, Y = np.meshgrid(x, y)
Z = np.ones(X.shape)
Z[5,5] = 9
Depending on how you want to proceed, you can simply let imshow smooth your signal by interpolation:
fig, axe = plt.subplots()
axe.imshow(Z, origin="lower", cmap="jet", interpolation='bicubic')
And you are done, simple and efficient!
If you aim to do it by yourself, then choose the interpolant that suits you best and resample on a grid with a higher resolution:
interpolant = interpolate.interp2d(x, y, Z.ravel(), kind="linear")
xlin = np.linspace(0, 10, 101)
ylin = np.linspace(0, 10, 101)
zhat = interpolant(xlin, ylin)
fig, axe = plt.subplots()
axe.imshow(zhat, origin="lower", cmap="jet")
Have a deeper look on scipy.interpolate module to pick up the best interpolant regarding your needs. Notice that all methods does not expose the same interface for imputing parameters. You may need to reshape your data to use another objects.
MCVE
Here is a complete example using the trial data generated above. Just bind it to your excel columns:
# Flatten trial data to meet your requirement:
x = X.ravel()
y = Y.ravel()
z = Z.ravel()
# Resampling on as square grid with given resolution:
resolution = 11
xlin = np.linspace(x.min(), x.max(), resolution)
ylin = np.linspace(y.min(), y.max(), resolution)
Xlin, Ylin = np.meshgrid(xlin, ylin)
# Linear multi-dimensional interpolation:
interpolant = interpolate.NearestNDInterpolator([r for r in zip(x, y)], z)
Zhat = interpolant(Xlin.ravel(), Ylin.ravel()).reshape(Xlin.shape)
# Render and interpolate again if necessary:
fig, axe = plt.subplots()
axe.imshow(Zhat, origin="lower", cmap="jet", interpolation='bicubic')
Which renders as expected:
Essentially, I'm trying to make a 4-D scatter plot with 4 columns of data (see sample below).
X (mm) Y (mm) Z (mm) Diameter (mm)
11.096 11.0972 13.2401 124.279
14.6836 11.0389 8.37134 138.949
19.9543 11.1025 31.1912 138.949
15.4079 10.9505 31.1639 152.21
20.6372 14.5175 6.94501 152.211
20.47 11.225 31.3612 152.211
19.0432 11.3234 8.93819 152.213
29.4091 10.1331 26.6354 186.417
12.9391 10.6616 28.9523 186.418
29.9102 10.4828 25.1129 186.418
30.5483 12.163 15.9116 186.418
19.0631 10.5784 30.9791 186.418
9.65332 10.8563 12.975 186.419
8.4003 11.0417 17.0181 186.419
26.0134 10.6857 9.41572 186.419
13.7451 11.1495 28.7108 186.419
The first three columns of data (X, Y, Z) are the coordinate positions of the 4th column of data (Diameter) so I was able to generate a 3-D scatter plot of these positions. However, I'm trying to plot these Diameters with different color markers based on certain threshold values (ie. Diameters that are less than 100 mm are red, 101-200 mm are blue, 201-300 mm are green, etc.) Once the color of the markers are determined, it would plot these markers based on its X, Y, Z coordinates. I tried writing a simple for loop to do this, but for some reason it becomes very slow/laggy and will only plot one color too. Can anyone see if there's something wrong with my approach? Thanks!
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import pandas
import os
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
os.chdir(r'C:\Users\Me\Documents')
data = pandas.read_excel("Diameter Report", "Data")
X = data['X (mm)'].values.tolist()
Y = data['Y (mm)'].values.tolist()
Z = data['Z (mm)'].values.tolist()
dims = data['Diameter (mm)'].values.tolist()
for i in dims:
if i < int(100):
ax.plot(X, Y, Z, c='r', marker='o')
elif i >= int(101) and i <200:
ax.plot(X, Y, Z, c='b', marker='o')
elif i >= int(201) and i <300:
ax.plot(X, Y, Z, c='g', marker='o')
ax.set_xlabel('Center X (mm)')
ax.set_ylabel('Center Y (mm)')
ax.set_zlabel('Center Z (mm)')
plt.show()
It seems the thresholds for the values are equally spaced, so you can just divide by 100 and truncate further decimal places. This allows to plot a single scatter instead of hundreds of plots.
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import pandas
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
data = pandas.read_excel("Diameter Report", "Data")
X = data['X (mm)'].values
Y = data['Y (mm)'].values
Z = data['Z (mm)'].values
dims = data['Diameter (mm)'].values
ax.scatter(X,Y,Z, c=(dims/100).astype(int), marker="o", cmap="brg")
ax.set_xlabel('Center X (mm)')
ax.set_ylabel('Center Y (mm)')
ax.set_zlabel('Center Z (mm)')
plt.show()
The more general case of arbitrary boundaries would probably best be solved using a BoundaryNorm and a colormap with as many different colors as classifications.
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import pandas as pd
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
d = np.random.rand(10,4)
d[:,3] = np.random.randint(1,300, 10)
data = pd.DataFrame(d, columns=["X (mm)","Y (mm)","Z (mm)","Diameter (mm)"])
X = data['X (mm)'].values
Y = data['Y (mm)'].values
Z = data['Z (mm)'].values
dims = data['Diameter (mm)'].values
bounds = [0,100,200,300]
colors = ["b", "r", "g"]
cmap = mcolors.ListedColormap(colors)
norm = mcolors.BoundaryNorm(bounds, len(colors))
sc = ax.scatter(X,Y,Z, c=dims, marker="o", cmap=cmap, norm=norm)
ax.set_xlabel('Center X (mm)')
ax.set_ylabel('Center Y (mm)')
ax.set_zlabel('Center Z (mm)')
fig.colorbar(sc)
plt.show()
Here is a slightly more general solution where you can explicitly specify the ranges you want regardless of the spacing. I did not have the complete data so I modified your limits from 100, 200, 300 to 140, 180, 200 based on the provided data.
A couple of things:
You probably want to use scatter3d as you mentioned it in your question instead of plot.
I am using NumPy to read in the data because this way you will have the data as NumPy arrays which make the masking and slicing easy.
Here I am creating 3 conditional masks depending on the magnitude of dims.
Next, you store these masks in a list and then iterate over it to use one mask at a time.
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import pandas
import numpy as np
import os
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
X, Y, Z, dims = np.loadtxt('sample.txt', unpack=True, skiprows=1)
mask1 = (dims<140)
mask2 = ((dims>=140) & (dims<180))
mask3 = ((dims>=180) & (dims<200))
masks = [mask1, mask2, mask3]
colors = ['r', 'b', 'g'] # color order as you specified in the question
for mask, color in zip(masks, colors):
ax.scatter3D(X[mask], Y[mask], Z[mask], c=color)
ax.set_xlabel('Center X (mm)')
ax.set_ylabel('Center Y (mm)')
ax.set_zlabel('Center Z (mm)')
plt.show()
I'm struggling with creating a quite complex 3d figure in python, specifically using iPython notebook. I can partition the content of the graph into two sections:
The (x,y) plane: Here a two-dimensional random walk is bobbing around, let's call it G(). I would like to plot part of this trajectory on the (x,y) plane. Say, 10% of all the data points of G(). As G() bobs around, it visits some (x,y) pairs more frequently than others. I would like to estimate this density of G() using a kernel estimation approach and draw it as contour lines on the (x,y) plane.
The (z) plane: Here, I would like to draw a mesh or (transparent) surface plot of the information theoretical surprise of a bivariate normal. Surprise is simply -log(p(i)) or the negative (base 2) logarithm of outcome i. Given the bivariate normal, each (x,y) pair has some probability p(x,y) and the surprise of this is simply -log(p(x,y)).
Essentially these two graphs are independent. Assume the interval of the random walk G() is [xmin,xmax],[ymin,ymax] and of size N. The bivariate normal in the z-plane should be drawn from the same interval, such that for each (x,y) pair in the random walk, I can draw a (dashed) line from some subset of the random walk n < N to the bivariate normal. Assume that G(10) = (5,5) then I would like to draw a dashed line from (5,5) up the Z-axes, until it hits the bivariate normal.
So far, I've managed to plot G() in a 3-d space, and estimate the density f(X,Y) using scipy.stats.gaussian_kde. In another (2d) graph, I have the sort of contour lines I want. What I don't have, is the contour lines in the 3d-plot using the estimated KDE density. I also don't have the bivariate normal plot, or the projection of a few random points from the random walk, to the surface of the bivariate normal. I've added a hand drawn figure, which might ease intuition (ignore the label on the z-axis and the fact that there is no mesh.. difficult to draw!)
Any input, even just partial, such as how to draw the contour lines in the (x,y) plane of the 3d graph, or a mesh of a bivariate normal would be much appreciated.
Thanks!
import matplotlib as mpl
import matplotlib.pyplot as plt
import random
import numpy as np
import seaborn as sns
import scipy
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
def randomwalk():
mpl.rcParams['legend.fontsize'] = 10
xyz = []
cur = [0, 0]
for _ in range(400):
axis = random.randrange(0, 2)
cur[axis] += random.choice([-1, 1])
xyz.append(cur[:])
x, y = zip(*xyz)
data = np.vstack([x,y])
kde = scipy.stats.gaussian_kde(data)
density = kde(data)
fig1 = plt.figure()
ax = fig1.gca(projection='3d')
ax.plot(x, y, label='Random walk')
sns.kdeplot(data[0,:], data[1,:], 0)
ax.scatter(x[-1], y[-1], c='b', marker='o') # End point
ax.legend()
fig2 = plt.figure()
sns.kdeplot(data[0,:], data[1,:])
Calling randomwalk() initialises and plots this:
Edit #1:
Made some progress, actually the only thing I need is to restrict the height of the dashed vertical lines to the bivariate. Any ideas?
import matplotlib as mpl
import matplotlib.pyplot as plt
import random
import numpy as np
import seaborn as sns
import scipy
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.mlab import bivariate_normal
%matplotlib inline
# Data for random walk
def randomwalk():
mpl.rcParams['legend.fontsize'] = 10
xyz = []
cur = [0, 0]
for _ in range(40):
axis = random.randrange(0, 2)
cur[axis] += random.choice([-1, 1])
xyz.append(cur[:])
# Get density
x, y = zip(*xyz)
data = np.vstack([x,y])
kde = scipy.stats.gaussian_kde(data)
density = kde(data)
# Data for bivariate gaussian
a = np.linspace(-7.5, 7.5, 20)
b = a
X,Y = np.meshgrid(a, b)
Z = bivariate_normal(X, Y)
surprise_Z = -np.log(Z)
# Get random points from walker and plot up z-axis to the gaussian
M = data[:,np.random.choice(20,5)].T
# Plot figure
fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
ax.plot(x, y, 'grey', label='Random walk') # Walker
ax.scatter(x[-1], y[-1], c='k', marker='o') # End point
ax.legend()
surf = ax.plot_surface(X, Y, surprise_Z, rstride=1, cstride=1,
cmap = plt.cm.gist_heat_r, alpha=0.1, linewidth=0.1)
#fig.colorbar(surf, shrink=0.5, aspect=7, cmap=plt.cm.gray_r)
for i in range(5):
ax.plot([M[i,0], M[i,0]],[M[i,1], M[i,1]], [0,10],'k--',alpha=0.8, linewidth=0.5)
ax.set_zlim(0, 50)
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
Final code,
import matplotlib as mpl
import matplotlib.pyplot as plt
import random
import numpy as np
import seaborn as sns
import scipy
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.mlab import bivariate_normal
%matplotlib inline
# Data for random walk
def randomwalk():
mpl.rcParams['legend.fontsize'] = 10
xyz = []
cur = [0, 0]
for _ in range(50):
axis = random.randrange(0, 2)
cur[axis] += random.choice([-1, 1])
xyz.append(cur[:])
# Get density
x, y = zip(*xyz)
data = np.vstack([x,y])
kde = scipy.stats.gaussian_kde(data)
density = kde(data)
# Data for bivariate gaussian
a = np.linspace(-7.5, 7.5, 100)
b = a
X,Y = np.meshgrid(a, b)
Z = bivariate_normal(X, Y)
surprise_Z = -np.log(Z)
# Get random points from walker and plot up z-axis to the gaussian
M = data[:,np.random.choice(50,10)].T
# Plot figure
fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
ax.plot(x, y, 'grey', label='Random walk') # Walker
ax.legend()
surf = ax.plot_surface(X, Y, surprise_Z, rstride=1, cstride=1,
cmap = plt.cm.gist_heat_r, alpha=0.1, linewidth=0.1)
#fig.colorbar(surf, shrink=0.5, aspect=7, cmap=plt.cm.gray_r)
for i in range(10):
x = [M[i,0], M[i,0]]
y = [M[i,1], M[i,1]]
z = [0,-np.log(bivariate_normal(M[i,0],M[i,1]))]
ax.plot(x,y,z,'k--',alpha=0.8, linewidth=0.5)
ax.scatter(x, y, z, c='k', marker='o')
I have a random walker in the (x,y) plane and a -log(bivariate gaussian) in the (x,y,z) plane. These two datasets are essentially independent.
I want to sample, say 5 (x,y) pairs of the random walker and draw vertical lines up the z-axis and terminate the vertical line when it "meets" the bivariate gaussian.
This is my code so far:
import matplotlib as mpl
import matplotlib.pyplot as plt
import random
import numpy as np
import seaborn as sns
import scipy
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.mlab import bivariate_normal
%matplotlib inline
# Data for random walk
def randomwalk():
mpl.rcParams['legend.fontsize'] = 10
xyz = []
cur = [0, 0]
for _ in range(40):
axis = random.randrange(0, 2)
cur[axis] += random.choice([-1, 1])
xyz.append(cur[:])
# Get density
x, y = zip(*xyz)
data = np.vstack([x,y])
kde = scipy.stats.gaussian_kde(data)
density = kde(data)
# Data for bivariate gaussian
a = np.linspace(-7.5, 7.5, 40)
b = a
X,Y = np.meshgrid(a, b)
Z = bivariate_normal(X, Y)
surprise_Z = -np.log(Z)
# Get random points from walker and plot up z-axis to the gaussian
M = data[:,np.random.choice(20,5)].T
# Plot figure
fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
ax.plot(x, y, 'grey', label='Random walk') # Walker
ax.scatter(x[-1], y[-1], c='k', marker='o') # End point
ax.legend()
surf = ax.plot_surface(X, Y, surprise_Z, rstride=1, cstride=1,
cmap = plt.cm.gist_heat_r, alpha=0.1, linewidth=0.1)
#fig.colorbar(surf, shrink=0.5, aspect=7, cmap=plt.cm.gray_r)
for i in range(5):
ax.plot([M[i,0], M[i,0]],[M[i,1], M[i,1]], [0,10],'k--',alpha=0.8, linewidth=0.5)
ax.set_zlim(0, 50)
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
Which produces
As you can see the only thing I'm struggling with is how to terminate the vertical lines when they meet the appropriate Z-value. Any ideas are welcome!
You're currently only letting those lines get to a height of 10 by using [0,10] as the z coordinates. You can change your loop to the following:
for i in range(5):
x = [M[i,0], M[i,0]]
y = [M[i,1], M[i,1]]
z = [0,-np.log(bivariate_normal(M[i,0],M[i,1]))]
ax.plot(x,y,z,'k--',alpha=0.8, linewidth=0.5)
This takes the x and y coordinates for each point you loop over and calculates the height of overlying Gaussian for that point and plots to there. Here is a plot with the linestyle changed to emphasize the lines relevant to the question:
I'd like to make a scatter plot where each point is colored by the spatial density of nearby points.
I've come across a very similar question, which shows an example of this using R:
R Scatter Plot: symbol color represents number of overlapping points
What's the best way to accomplish something similar in python using matplotlib?
In addition to hist2d or hexbin as #askewchan suggested, you can use the same method that the accepted answer in the question you linked to uses.
If you want to do that:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
# Generate fake data
x = np.random.normal(size=1000)
y = x * 3 + np.random.normal(size=1000)
# Calculate the point density
xy = np.vstack([x,y])
z = gaussian_kde(xy)(xy)
fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=100)
plt.show()
If you'd like the points to be plotted in order of density so that the densest points are always on top (similar to the linked example), just sort them by the z-values. I'm also going to use a smaller marker size here as it looks a bit better:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
# Generate fake data
x = np.random.normal(size=1000)
y = x * 3 + np.random.normal(size=1000)
# Calculate the point density
xy = np.vstack([x,y])
z = gaussian_kde(xy)(xy)
# Sort the points by density, so that the densest points are plotted last
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]
fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=50)
plt.show()
Plotting >100k data points?
The accepted answer, using gaussian_kde() will take a lot of time. On my machine, 100k rows took about 11 minutes. Here I will add two alternative methods (mpl-scatter-density and datashader) and compare the given answers with same dataset.
In the following, I used a test data set of 100k rows:
import matplotlib.pyplot as plt
import numpy as np
# Fake data for testing
x = np.random.normal(size=100000)
y = x * 3 + np.random.normal(size=100000)
Output & computation time comparison
Below is a comparison of different methods.
1: mpl-scatter-density
Installation
pip install mpl-scatter-density
Example code
import mpl_scatter_density # adds projection='scatter_density'
from matplotlib.colors import LinearSegmentedColormap
# "Viridis-like" colormap with white background
white_viridis = LinearSegmentedColormap.from_list('white_viridis', [
(0, '#ffffff'),
(1e-20, '#440053'),
(0.2, '#404388'),
(0.4, '#2a788e'),
(0.6, '#21a784'),
(0.8, '#78d151'),
(1, '#fde624'),
], N=256)
def using_mpl_scatter_density(fig, x, y):
ax = fig.add_subplot(1, 1, 1, projection='scatter_density')
density = ax.scatter_density(x, y, cmap=white_viridis)
fig.colorbar(density, label='Number of points per pixel')
fig = plt.figure()
using_mpl_scatter_density(fig, x, y)
plt.show()
Drawing this took 0.05 seconds:
And the zoom-in looks quite nice:
2: datashader
Datashader is an interesting project. It has added support for matplotlib in datashader 0.12.
Installation
pip install datashader
Code (source & parameterer listing for dsshow):
import datashader as ds
from datashader.mpl_ext import dsshow
import pandas as pd
def using_datashader(ax, x, y):
df = pd.DataFrame(dict(x=x, y=y))
dsartist = dsshow(
df,
ds.Point("x", "y"),
ds.count(),
vmin=0,
vmax=35,
norm="linear",
aspect="auto",
ax=ax,
)
plt.colorbar(dsartist)
fig, ax = plt.subplots()
using_datashader(ax, x, y)
plt.show()
It took 0.83 s to draw this:
There is also possibility to colorize by third variable. The third parameter for dsshow controls the coloring. See more examples here and the source for dsshow here.
3: scatter_with_gaussian_kde
def scatter_with_gaussian_kde(ax, x, y):
# https://stackoverflow.com/a/20107592/3015186
# Answer by Joel Kington
xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)
ax.scatter(x, y, c=z, s=100, edgecolor='')
It took 11 minutes to draw this:
4: using_hist2d
import matplotlib.pyplot as plt
def using_hist2d(ax, x, y, bins=(50, 50)):
# https://stackoverflow.com/a/20105673/3015186
# Answer by askewchan
ax.hist2d(x, y, bins, cmap=plt.cm.jet)
It took 0.021 s to draw this bins=(50,50):
It took 0.173 s to draw this bins=(1000,1000):
Cons: The zoomed-in data does not look as good as in with mpl-scatter-density or datashader. Also you have to determine the number of bins yourself.
5: density_scatter
The code is as in the answer by Guillaume.
It took 0.073 s to draw this with bins=(50,50):
It took 0.368 s to draw this with bins=(1000,1000):
Also, if the number of point makes KDE calculation too slow, color can be interpolated in np.histogram2d [Update in response to comments: If you wish to show the colorbar, use plt.scatter() instead of ax.scatter() followed by plt.colorbar()]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize
from scipy.interpolate import interpn
def density_scatter( x , y, ax = None, sort = True, bins = 20, **kwargs ) :
"""
Scatter plot colored by 2d histogram
"""
if ax is None :
fig , ax = plt.subplots()
data , x_e, y_e = np.histogram2d( x, y, bins = bins, density = True )
z = interpn( ( 0.5*(x_e[1:] + x_e[:-1]) , 0.5*(y_e[1:]+y_e[:-1]) ) , data , np.vstack([x,y]).T , method = "splinef2d", bounds_error = False)
#To be sure to plot all data
z[np.where(np.isnan(z))] = 0.0
# Sort the points by density, so that the densest points are plotted last
if sort :
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]
ax.scatter( x, y, c=z, **kwargs )
norm = Normalize(vmin = np.min(z), vmax = np.max(z))
cbar = fig.colorbar(cm.ScalarMappable(norm = norm), ax=ax)
cbar.ax.set_ylabel('Density')
return ax
if "__main__" == __name__ :
x = np.random.normal(size=100000)
y = x * 3 + np.random.normal(size=100000)
density_scatter( x, y, bins = [30,30] )
You could make a histogram:
import numpy as np
import matplotlib.pyplot as plt
# fake data:
a = np.random.normal(size=1000)
b = a*3 + np.random.normal(size=1000)
plt.hist2d(a, b, (50, 50), cmap=plt.cm.jet)
plt.colorbar()