I want to create a plot for two different datasets similar to the one presented in this answer:
In the above image, the author managed to fix the overlapping problem of the error bars by adding some small random scatter in x to the new dataset.
In my problem, I must plot a similar graphic, but having some categorical data in the x axis:
Any ideas on how to slightly shift the error bars of the second dataset when the x axis holds categorical variables? I want to avoid overlap between the bars to make the visualization easier to read.
You can translate each errorbar by chaining a translation in data space in front of the default data transform. This works because categories are, in general, one data unit away from each other.
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
from matplotlib.transforms import Affine2D
x = list("ABCDEF")
y1, y2 = np.random.randn(2, len(x))
yerr1, yerr2 = np.random.rand(2, len(x))*4+0.3
fig, ax = plt.subplots()
trans1 = Affine2D().translate(-0.1, 0.0) + ax.transData
trans2 = Affine2D().translate(+0.1, 0.0) + ax.transData
er1 = ax.errorbar(x, y1, yerr=yerr1, marker="o", linestyle="none", transform=trans1)
er2 = ax.errorbar(x, y2, yerr=yerr2, marker="o", linestyle="none", transform=trans2)
plt.show()
Alternatively, you could translate the errorbars after applying the data transform and hence move them in units of points.
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
from matplotlib.transforms import ScaledTranslation
x = list("ABCDEF")
y1, y2 = np.random.randn(2, len(x))
yerr1, yerr2 = np.random.rand(2, len(x))*4+0.3
fig, ax = plt.subplots()
trans1 = ax.transData + ScaledTranslation(-5/72, 0, fig.dpi_scale_trans)
trans2 = ax.transData + ScaledTranslation(+5/72, 0, fig.dpi_scale_trans)
er1 = ax.errorbar(x, y1, yerr=yerr1, marker="o", linestyle="none", transform=trans1)
er2 = ax.errorbar(x, y2, yerr=yerr2, marker="o", linestyle="none", transform=trans2)
plt.show()
While the results look similar in both cases, they are fundamentally different: the first offset is 0.1 data units and therefore scales with the axis limits, while the second is 5 points and stays fixed on screen. You will observe this difference when interactively zooming the axes or changing the figure size.
Consider the following approach to highlight the plots: a combination of errorbar and fill_between with non-zero transparency:
import random
import matplotlib.pyplot as plt
# create sample data
N = 8
data_1 = {
    'x': list(range(N)),
    'y': [10. + random.random() for dummy in range(N)],
    'yerr': [.25 + random.random() for dummy in range(N)]}
data_2 = {
    'x': list(range(N)),
    'y': [10.25 + .5 * random.random() for dummy in range(N)],
    'yerr': [.5 * random.random() for dummy in range(N)]}
# plot
plt.figure()
# only errorbar
plt.subplot(211)
for data in [data_1, data_2]:
    plt.errorbar(**data, fmt='o')
# errorbar + fill_between
plt.subplot(212)
for data in [data_1, data_2]:
    plt.errorbar(**data, alpha=.75, fmt=':', capsize=3, capthick=1)
    data = {
        'x': data['x'],
        'y1': [y - e for y, e in zip(data['y'], data['yerr'])],
        'y2': [y + e for y, e in zip(data['y'], data['yerr'])]}
    plt.fill_between(**data, alpha=.25)
Result:
There is an example of this on the matplotlib site: https://matplotlib.org/stable/gallery/lines_bars_and_markers/errorbar_subsample.html
You need the parameter errorevery=(m, n):
n sets the stride (error bars are drawn only on every n-th point), and m shifts the starting index, in the range 0 to n.
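For example, a minimal sketch (with made-up data) that staggers the error bars of two series by giving each a different starting offset, so their bars never sit on the same x positions:
import numpy as np
import matplotlib.pyplot as plt
x = np.arange(12)
y1, y2 = np.sin(x), np.cos(x)
err = np.full(len(x), 0.4)
fig, ax = plt.subplots()
# error bars on every 2nd point, starting at index 0 for the first series
ax.errorbar(x, y1, yerr=err, errorevery=(0, 2), marker="o")
# ... and starting at index 1 for the second, so the bars alternate
ax.errorbar(x, y2, yerr=err, errorevery=(1, 2), marker="o")
plt.show()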
I am attempting to produce a plot like this which combines a cartesian scatter plot and a polar histogram. (Radial lines optional)
A similar solution (by Nicolas Legrand) exists for looking at differences in x and y (code here), but we need to look at ratios (i.e. x/y).
More specifically, this is useful when we want to look at the relative risk measure which is the ratio of two probabilities.
The scatter plot on its own is obviously not a problem, but the polar histogram is more advanced.
The most promising lead I have found is this central example from the matplotlib gallery here
I have attempted to do this, but have run up against the limits of my matplotlib skills. Any efforts moving towards this goal would be great.
I'm sure that others will have better suggestions, but one method that gets something like you want (without the need for extra axes artists) is to use a polar projection with a scatter and bar chart together. Something like
import matplotlib.pyplot as plt
import numpy as np
x = np.random.uniform(size=100)
y = np.random.uniform(size=100)
r = np.sqrt(x**2 + y**2)
phi = np.arctan2(y, x)
h, b = np.histogram(phi, bins=np.linspace(0, np.pi/2, 21), density=True)
colors = plt.cm.Spectral(h / h.max())
ax = plt.subplot(111, projection='polar')
ax.scatter(phi, r, marker='.')
ax.bar(b[:-1], h, width=b[1:] - b[:-1],
       align='edge', bottom=np.max(r) + 0.2, color=colors)
# Cut off at 90 degrees
ax.set_thetamax(90)
# Set the r grid to cover the scatter plot
ax.set_rgrids([0, 0.5, 1])
# Let's put a line at 1 assuming we want a ratio of some sort
ax.set_thetagrids([45], [1])
which will give
It is missing axes labels and some beautification, but it might be a place to start. I hope it is helpful.
You can use two axes on top of each other:
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(6,6))
ax1 = fig.add_axes([0.1,0.1,.8,.8], label="cartesian")
ax2 = fig.add_axes([0.1,0.1,.8,.8], projection="polar", label="polar")
ax2.set_rorigin(-1)
ax2.set_thetamax(90)
plt.show()
Ok. Thanks to the answer from Nicolas, and the answer from tomjn I have a working solution :)
import numpy as np
import matplotlib.pyplot as plt
# Scatter data
n = 50
x = 0.3 + np.random.randn(n)*0.1
y = 0.4 + np.random.randn(n)*0.02
def radial_corner_plot(x, y, n_hist_bins=51):
    """Scatter plot with radial histogram of x/y ratios"""
    # Axis setup
    fig = plt.figure(figsize=(6,6))
    ax1 = fig.add_axes([0.1,0.1,.6,.6], label="cartesian")
    ax2 = fig.add_axes([0.1,0.1,.8,.8], projection="polar", label="polar")
    ax2.set_rorigin(-20)
    ax2.set_thetamax(90)
    # define useful constant
    offset_in_radians = np.pi/4
    def rotate_hist_axis(ax):
        """rotate so that 0 degrees is pointing up and right"""
        ax.set_theta_offset(offset_in_radians)
        ax.set_thetamin(-45)
        ax.set_thetamax(45)
        return ax
    # Convert scatter data to histogram data
    r = np.sqrt(x**2 + y**2)
    phi = np.arctan2(y, x)
    h, b = np.histogram(phi,
                        bins=np.linspace(0, np.pi/2, n_hist_bins),
                        density=True)
    # SCATTER PLOT -------------------------------------------------------
    ax1.scatter(x, y)
    ax1.set(xlim=[0, 1], ylim=[0, 1], xlabel="x", ylabel="y")
    ax1.spines['right'].set_visible(False)
    ax1.spines['top'].set_visible(False)
    # HISTOGRAM ----------------------------------------------------------
    ax2 = rotate_hist_axis(ax2)
    # rotation of axis requires rotation in bin positions
    b = b - offset_in_radians
    # plot the histogram
    bars = ax2.bar(b[:-1], h, width=b[1:] - b[:-1], align='edge')
    def update_hist_ticks(ax, desired_ratios):
        """Update tick positions and corresponding tick labels"""
        x = np.ones(len(desired_ratios))
        y = 1/desired_ratios
        phi = np.arctan2(y, x) - offset_in_radians
        # define ticklabels
        xticklabels = [str(round(float(label), 2)) for label in desired_ratios]
        # apply updates
        ax.set(xticks=phi, xticklabels=xticklabels)
        return ax
    ax2 = update_hist_ticks(ax2, np.array([1/8, 1/4, 1/2, 1, 2, 4, 8]))
    # just have radial grid lines
    ax2.grid(which="major", axis="y")
    # remove bin count labels
    ax2.set_yticks([])
    return (fig, [ax1, ax2])
fig, ax = radial_corner_plot(x, y)
Thanks for the pointers!
I have 2 line plots on the same figure, plotted from pandas dataframes.
I want to fill between them with a gradient/colour map of sorts.
I understand I can do this with a cmap, only it will not work for me (see code below).
The general examples I found fill between the x axis and a line, which is not what I want. I am also interested in the simplest possible solution, since I am a beginner at this and complicated, though maybe better, code will just make it more confusing honestly.
Code for which the fill is plain blue:
import matplotlib.pyplot as plt
import pandas as pd
ax = plt.gca()
df0.plot(kind='line', x='something', y='other', color='orange', ax=ax, legend=False, figsize=(20,10))
df1.plot(kind='line', x='something', y='other2', color='c', ax=ax, legend=False, figsize=(20,10))
ax.fill_between(x=df0['daysInAYear'], y1=df0['other'], y2 = df1['other2'], alpha=0.2, cmap=plt.cm.get_cmap("winter"))
plt.show()
EDIT/UPDATE: DATA EXAMPLE
other is ALWAYS >= other2
other    other2    something (same for both)
15.6     -16.0     1
13.9     -26.7     2
13.3     -26.7     3
10.6     -26.1     4
12.8     -15.0     5
Final graph example:
I would like the fill to go from orange on top to blue at the bottom
Edit
In response to the edited question, here is an alternative approach which does the gradient vertically but doesn't use imshow.
import matplotlib.pyplot as plt
from matplotlib import colors, patches
import numpy as np
n = 100
nc = 100
x = np.linspace(0, np.pi*5, n)
y1 = [-50.0]
y2 = [50.0]
for ii in range(1, n):
    y1.append(y1[ii-1] + (np.random.random()-0.3)*3)
    y2.append(y2[ii-1] + (np.random.random()-0.5)*3)
y1 = np.array(y1)
y2 = np.array(y2)
z = np.linspace(0, 10, nc)
normalize = colors.Normalize(vmin=z.min(), vmax=z.max())
cmap = plt.cm.get_cmap('winter')
fig, ax = plt.subplots(1)
for ii in range(n - 1):
    y = np.linspace(y1[ii], y2[ii], nc)
    yn = np.linspace(y1[ii+1], y2[ii+1], nc)
    for kk in range(nc - 1):
        p = patches.Polygon([[x[ii], y[kk]],
                             [x[ii+1], yn[kk]],
                             [x[ii+1], yn[kk+1]],
                             [x[ii], y[kk+1]]], color=cmap(normalize(z[kk])))
        ax.add_patch(p)
plt.plot(x, y1, 'k-', lw=1)
plt.plot(x, y2, 'k-', lw=1)
plt.show()
The idea here being similar to that in my original answer, except the trapezoids are divided into nc pieces and each piece is colored separately. This has the advantage of scaling correctly for varying y1[ii], y2[ii] distances, as shown in this comparison,
It does, however, have the disadvantages of being much, much slower than imshow or the horizontal gradient method and of being unable to handle 'crossing' correctly.
The code to generate the second image in the above comparison:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import patches
from matplotlib.path import Path
n = 100  # same number of points as in the first snippet
x = np.linspace(0, 10, n)
y1 = [-50.0]
y2 = [50.0]
for ii in range(1, n):
    y1.append(y1[ii-1] + (np.random.random()-0.2)*3)
    y2.append(y2[ii-1] + (np.random.random()-0.5)*3)
y1 = np.array(y1)
y2 = np.array(y2)
verts = np.vstack([np.stack([x, y1], 1), np.stack([np.flip(x), np.flip(y2)], 1)])
path = Path(verts)
patch = patches.PathPatch(path, facecolor='k', lw=2, alpha=0.0)
plt.gca().add_patch(patch)
plt.imshow(np.arange(10).reshape(10,-1), cmap=plt.cm.winter, interpolation="bicubic",
           origin='upper', extent=[0,10,-60,60], aspect='auto', clip_path=patch,
           clip_on=True)
plt.show()
Original
This is a bit of a hack, partly based on the answers in this question. It does seem to work fairly well but works best with higher density along the x axis. The idea is to call fill_between separately for each trapezoid corresponding to x pairs, [x[ii], x[ii+1]]. Here is a complete example using some generated data
import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np
import pandas as pd
n = 1000
X = np.linspace(0, np.pi*5, n)
Y1 = np.sin(X)
Y2 = np.cos(X)
Z = np.linspace(0, 10, n)
normalize = colors.Normalize(vmin=Z.min(), vmax=Z.max())
cmap = plt.cm.get_cmap('winter')
df = pd.DataFrame({'x': X, 'y1': Y1, 'y2': Y2, 'z': Z})
x = df['x'].values
y1 = df['y1'].values
y2 = df['y2'].values
z = df['z'].values
for ii in range(len(df['x'].values)-1):
    plt.fill_between([x[ii], x[ii+1]], [y1[ii], y1[ii+1]],
                     [y2[ii], y2[ii+1]], color=cmap(normalize(z[ii])))
plt.plot(x, y1, 'k-', x, y2, 'k-')
plt.show()
This can be generalized to a two-dimensional color grid, but it would require non-trivial modification.
I am scatter plotting data points with a very small marker (see screengrab below). When I use the very small marker ',' the legend is very hard to read (example code taken from here).
(Python 3, Jupyter lab)
How can I increase the size of the marker in the legend? The two versions shown on the above-mentioned site do not work:
legend = ax.legend(frameon=True)
for legend_handle in legend.legendHandles:
    legend_handle._legmarker.set_markersize(9)
and
ax.legend(markerscale=6)
The two solutions do, however, work when the marker is set to '.'.
How can I show bigger markers in the legend?
Sample Code from intoli.com:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(12)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for i in range(5):
    mean = [np.random.random()*10, np.random.random()*10]
    covariance = [ [1 + np.random.random(), np.random.random() - 1], [0, 1 + np.random.random()], ]
    covariance[1][0] = covariance[0][1]  # must be symmetric
    x, y = np.random.multivariate_normal(mean, covariance, 3000).T
    plt.plot(x, y, ',', label=f'Cluster {i + 1}')
ax.legend(markerscale=12)
fig.tight_layout()
plt.show()
You can get 1 pixel sized markers for a plot by setting the markersize to 1 pixel. This would look like
plt.plot(x, y, marker='s', markersize=72./fig.dpi, mec="None", ls="None")
What the above does is set the marker to a square, set the markersize to 72 (points per inch) divided by the figure's dpi (dots per inch), which is exactly one dot, i.e. one pixel, expressed in points, and remove the lines and marker edges.
Then the solution you tried using markerscale in the legend works nicely.
Complete example:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(12)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for i in range(5):
    mean = [np.random.random()*10, np.random.random()*10]
    covariance = [ [1 + np.random.random(), np.random.random() - 1], [0, 1 + np.random.random()], ]
    covariance[1][0] = covariance[0][1]  # must be symmetric
    x, y = np.random.multivariate_normal(mean, covariance, 3000).T
    plt.plot(x, y, marker='s', markersize=72./fig.dpi, mec="None", ls="None",
             label=f'Cluster {i + 1}')
ax.legend(markerscale=12)
fig.tight_layout()
plt.show()
According to this discussion, the markersize has no effect when using pixels (,) as marker. How about generating a custom legend instead? For example, by adapting the first example in this tutorial, one can get a pretty decent legend:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
np.random.seed(12)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for i in range(5):
    mean = [np.random.random()*10, np.random.random()*10]
    covariance = [ [1 + np.random.random(), np.random.random() - 1], [0, 1 + np.random.random()], ]
    covariance[1][0] = covariance[0][1]  # must be symmetric
    x, y = np.random.multivariate_normal(mean, covariance, 3000).T
    plt.plot(x, y, ',', label=f'Cluster {i + 1}')
## generating custom legend
handles, labels = ax.get_legend_handles_labels()
patches = []
for handle, label in zip(handles, labels):
    patches.append(mpatches.Patch(color=handle.get_color(), label=label))
legend = ax.legend(handles=patches)
fig.tight_layout()
plt.show()
The output would look like this:
I'd like to make a scatter plot where each point is colored by the spatial density of nearby points.
I've come across a very similar question, which shows an example of this using R:
R Scatter Plot: symbol color represents number of overlapping points
What's the best way to accomplish something similar in python using matplotlib?
In addition to hist2d or hexbin as #askewchan suggested, you can use the same method that the accepted answer in the question you linked to uses.
If you want to do that:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
# Generate fake data
x = np.random.normal(size=1000)
y = x * 3 + np.random.normal(size=1000)
# Calculate the point density
xy = np.vstack([x,y])
z = gaussian_kde(xy)(xy)
fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=100)
plt.show()
If you'd like the points to be plotted in order of density so that the densest points are always on top (similar to the linked example), just sort them by the z-values. I'm also going to use a smaller marker size here as it looks a bit better:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
# Generate fake data
x = np.random.normal(size=1000)
y = x * 3 + np.random.normal(size=1000)
# Calculate the point density
xy = np.vstack([x,y])
z = gaussian_kde(xy)(xy)
# Sort the points by density, so that the densest points are plotted last
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]
fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=50)
plt.show()
Plotting >100k data points?
The accepted answer, using gaussian_kde(), will take a lot of time. On my machine, 100k rows took about 11 minutes. Here I will add two alternative methods (mpl-scatter-density and datashader) and compare the given answers with the same dataset.
In the following, I used a test data set of 100k rows:
import matplotlib.pyplot as plt
import numpy as np
# Fake data for testing
x = np.random.normal(size=100000)
y = x * 3 + np.random.normal(size=100000)
Output & computation time comparison
Below is a comparison of different methods.
1: mpl-scatter-density
Installation
pip install mpl-scatter-density
Example code
import mpl_scatter_density # adds projection='scatter_density'
from matplotlib.colors import LinearSegmentedColormap
# "Viridis-like" colormap with white background
white_viridis = LinearSegmentedColormap.from_list('white_viridis', [
(0, '#ffffff'),
(1e-20, '#440053'),
(0.2, '#404388'),
(0.4, '#2a788e'),
(0.6, '#21a784'),
(0.8, '#78d151'),
(1, '#fde624'),
], N=256)
def using_mpl_scatter_density(fig, x, y):
    ax = fig.add_subplot(1, 1, 1, projection='scatter_density')
    density = ax.scatter_density(x, y, cmap=white_viridis)
    fig.colorbar(density, label='Number of points per pixel')
fig = plt.figure()
using_mpl_scatter_density(fig, x, y)
plt.show()
Drawing this took 0.05 seconds:
And the zoom-in looks quite nice:
2: datashader
Datashader is an interesting project. It has added support for matplotlib in datashader 0.12.
Installation
pip install datashader
Code (source & parameter listing for dsshow):
import datashader as ds
from datashader.mpl_ext import dsshow
import pandas as pd
def using_datashader(ax, x, y):
    df = pd.DataFrame(dict(x=x, y=y))
    dsartist = dsshow(
        df,
        ds.Point("x", "y"),
        ds.count(),
        vmin=0,
        vmax=35,
        norm="linear",
        aspect="auto",
        ax=ax,
    )
    plt.colorbar(dsartist)
fig, ax = plt.subplots()
using_datashader(ax, x, y)
plt.show()
It took 0.83 s to draw this:
There is also the possibility to colorize by a third variable. The third parameter of dsshow controls the coloring. See more examples here and the source for dsshow here.
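For instance, a minimal sketch of that idea, assuming a made-up numeric column z attached to each point, which colors by the per-pixel mean of z instead of the point count:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datashader as ds
from datashader.mpl_ext import dsshow
x = np.random.normal(size=100000)
y = x * 3 + np.random.normal(size=100000)
z = np.hypot(x, y)  # hypothetical third variable
df = pd.DataFrame(dict(x=x, y=y, z=z))
fig, ax = plt.subplots()
# aggregate the mean of z per pixel instead of counting points
artist = dsshow(df, ds.Point("x", "y"), ds.mean("z"), norm="linear", aspect="auto", ax=ax)
fig.colorbar(artist, label="mean of z per pixel")
plt.show()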
3: scatter_with_gaussian_kde
def scatter_with_gaussian_kde(ax, x, y):
    # https://stackoverflow.com/a/20107592/3015186
    # Answer by Joe Kington
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)
    ax.scatter(x, y, c=z, s=100, edgecolor='')
It took 11 minutes to draw this:
4: using_hist2d
import matplotlib.pyplot as plt
def using_hist2d(ax, x, y, bins=(50, 50)):
    # https://stackoverflow.com/a/20105673/3015186
    # Answer by askewchan
    ax.hist2d(x, y, bins, cmap=plt.cm.jet)
It took 0.021 s to draw this with bins=(50,50):
It took 0.173 s to draw this with bins=(1000,1000):
Cons: The zoomed-in data does not look as good as with mpl-scatter-density or datashader. Also, you have to determine the number of bins yourself.
5: density_scatter
The code is as in the answer by Guillaume.
It took 0.073 s to draw this with bins=(50,50):
It took 0.368 s to draw this with bins=(1000,1000):
Also, if the number of points makes the KDE calculation too slow, the color can be interpolated from np.histogram2d [Update in response to comments: if you wish to show the colorbar, use plt.scatter() instead of ax.scatter(), followed by plt.colorbar()]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize
from scipy.interpolate import interpn
def density_scatter(x, y, ax=None, sort=True, bins=20, **kwargs):
    """
    Scatter plot colored by 2d histogram
    """
    if ax is None:
        fig, ax = plt.subplots()
    data, x_e, y_e = np.histogram2d(x, y, bins=bins, density=True)
    z = interpn((0.5*(x_e[1:] + x_e[:-1]), 0.5*(y_e[1:] + y_e[:-1])), data,
                np.vstack([x, y]).T, method="splinef2d", bounds_error=False)
    # To be sure to plot all data
    z[np.where(np.isnan(z))] = 0.0
    # Sort the points by density, so that the densest points are plotted last
    if sort:
        idx = z.argsort()
        x, y, z = x[idx], y[idx], z[idx]
    ax.scatter(x, y, c=z, **kwargs)
    norm = Normalize(vmin=np.min(z), vmax=np.max(z))
    # use ax.figure so this also works when an existing ax is passed in
    cbar = ax.figure.colorbar(cm.ScalarMappable(norm=norm), ax=ax)
    cbar.ax.set_ylabel('Density')
    return ax
if "__main__" == __name__:
    x = np.random.normal(size=100000)
    y = x * 3 + np.random.normal(size=100000)
    density_scatter(x, y, bins=[30, 30])
You could make a histogram:
import numpy as np
import matplotlib.pyplot as plt
# fake data:
a = np.random.normal(size=1000)
b = a*3 + np.random.normal(size=1000)
plt.hist2d(a, b, (50, 50), cmap=plt.cm.jet)
plt.colorbar()
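A hexbin variant of the same idea, mentioned earlier alongside hist2d, uses hexagonal bins instead of rectangular ones; a minimal sketch:
import numpy as np
import matplotlib.pyplot as plt
a = np.random.normal(size=1000)
b = a*3 + np.random.normal(size=1000)
# gridsize controls how many hexagons span the x direction
plt.hexbin(a, b, gridsize=50, cmap=plt.cm.jet)
plt.colorbar()
plt.show()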