I'm trying to build a scatterplot of a large amount of data from multiple classes in python/matplotlib. Unfortunately, it appears that I have to choose between having my data randomised and having legend labels. Is there a way I can have both (preferably without manually coding the labels?)
Minimum reproducible example:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
X = np.random.normal(0, 1, [5000, 2])
Y = np.random.normal(0.5, 1, [5000, 2])
data = np.concatenate([X,Y])
classes = np.concatenate([np.repeat('X', X.shape[0]),
np.repeat('Y', Y.shape[0])])
Plotting with randomized points:
plot_idx = np.random.permutation(data.shape[0])
colors = pd.factorize(classes)
fig, ax = plt.subplots()
ax.scatter(data[plot_idx, 0],
data[plot_idx, 1],
c=colors[plot_idx],
label=classes[plot_idx],
alpha=0.4)
plt.legend()
plt.show()
This gives me the wrong legend.
Plotting with the correct legend:
from matplotlib import cm
unique_classes = np.unique(classes)
colors = cm.Set1(np.linspace(0, 1, len(unique_classes)))
for i, class in enumerate(unique_classes):
ax.scatter(data[classes == class, 0],
data[classes == class, 1],
c=colors[i],
label=class,
alpha=0.4)
plt.legend()
plt.show()
But now the points are not randomized and the resulting plot is not representative of the data.
I'm looking for something that would give me a result like I get as follows in R:
library(ggplot2)
X <- matrix(rnorm(10000, 0, 1), ncol=2)
Y <- matrix(rnorm(10000, 0.5, 1), ncol=2)
data <- as.data.frame(rbind(X, Y))
data$classes <- rep(c('X', 'Y'), times=nrow(X))
plot_idx <- sample(nrow(data))
ggplot(data[plot_idx,], aes(x=V1, y=V2, color=classes)) +
geom_point(alpha=0.4, size=3)
You need to create the legend manually. This is not a big problem though. You can loop over the labels and create a legend entry for each. Here one may use a Line2D with a marker similar to the scatter as handle.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
X = np.random.normal(0, 1, [5000, 2])
Y = np.random.normal(0.5, 1, [5000, 2])
data = np.concatenate([X,Y])
classes = np.concatenate([np.repeat('X', X.shape[0]),
np.repeat('Y', Y.shape[0])])
plot_idx = np.random.permutation(data.shape[0])
colors,labels = pd.factorize(classes)
fig, ax = plt.subplots()
sc = ax.scatter(data[plot_idx, 0],
data[plot_idx, 1],
c=colors[plot_idx],
alpha=0.4)
h = lambda c: plt.Line2D([],[],color=c, ls="",marker="o")
plt.legend(handles=[h(sc.cmap(sc.norm(i))) for i in range(len(labels))],
labels=list(labels))
plt.show()
Alternatively you can use a special scatter handler, as shown in the quesiton Why doesn't the color of the points in a scatter plot match the color of the points in the corresponding legend? but that seems a bit overkill here.
It's a bit of a hack, but you can save the axis limits, set the labels by drawing points well outside the limits of the plot, and then resetting the axis limits as follows:
plot_idx = np.random.permutation(data.shape[0])
color_idx, unique_classes = pd.factorize(classes)
colors = cm.Set1(np.linspace(0, 1, len(unique_classes)))
fig, ax = plt.subplots()
ax.scatter(data[plot_idx, 0],
data[plot_idx, 1],
c=colors[color_idx[plot_idx]],
alpha=0.4)
xlim = ax.get_xlim()
ylim = ax.get_ylim()
for i in range(len(unique_classes)):
ax.scatter(xlim[1]*10,
ylim[1]*10,
c=colors[i],
label=unique_classes[i])
ax.set_xlim(xlim)
ax.set_ylim(ylim)
plt.legend()
plt.show()
Related
I am plotting separate figures for each attribute and label for each data sample. Here is the illustration:
As illustrated in the the last subplot (Label), my data contains seven classes (numerically) (0 to 6). I'd like to visualize these classes using a different fancy colors and a legend. Please note that I just want colors for last subplot. How should I do that?
Here is the code of above plot:
x, y = test_data["x"], test_data["y"]
# determine the total number of plots
n, off = x.shape[1] + 1, 0
plt.rcParams["figure.figsize"] = (40, 15)
# plot all the attributes
for i in range(6):
plt.subplot(n, 1, off + 1)
plt.plot(x[:, off])
plt.title('Attribute:' + str(i), y=0, loc='left')
off += 1
# plot Labels
plt.subplot(n, 1, n)
plt.plot(y)
plt.title('Label', y=0, loc='left')
plt.savefig(save_file_name, bbox_inches="tight")
plt.close()
First, just to set up a similar dataset:
import matplotlib.pyplot as plt
import numpy as np
x = np.random.random((100,6))
y = np.random.randint(0, 6, (100))
fig, axs = plt.subplots(6, figsize=(40,15))
We could use plt.scatter() to give individual points different marker styles:
for i in range(x.shape[-1]):
axs[i].scatter(range(x.shape[0]), x[:,i], c=y)
Or we could mask the arrays we're plotting:
for i in range(x.shape[-1]):
for j in np.unique(y):
axs[i].plot(np.ma.masked_where(y!=j, x[:,i]), 'o')
Either way we get the same results:
Edit: Ah you've edited your question! You can do exactly the same thing for your last plot only, just modify my code above to take it out of the loop of subplots :)
As suggested, we imitate the matplotlib step function by creating a LineCollection to color the different line segments:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.collections import LineCollection
from matplotlib.patches import Patch
#random data generation
np.random.seed(12345)
number_of_categories=4
y = np.concatenate([np.repeat(np.random.randint(0, number_of_categories), np.random.randint(1, 30)) for _ in range(20)])
#check the results with less points
#y = y[:10]
x = y[None] * np.linspace(1, 5, 3)[:, None]
x += 2 * np.random.random(x.shape) - 1
#your initial plot
num_plots = x.shape[0] + 1
fig, axes = plt.subplots(num_plots, 1, sharex=True, figsize=(10, 8))
for i, ax in enumerate(axes.flat[:-1]):
ax.plot(x[i,:])
#first we create the matplotlib step function with x-values as their midpoint
axes.flat[-1].step(np.arange(y.size), y, where="mid", color="lightgrey", zorder=-1)
#then we plot colored segments with shifted index simulating the step function
shifted_x = np.arange(y.size+1)-0.5
#and identify the step indexes
idx_steps, = np.nonzero(np.diff(y, prepend=np.inf, append=np.inf))
#create collection of plateau segments
colored_segments = np.zeros((idx_steps.size-1, 2, 2))
colored_segments[:, :, 0] = np.vstack((shifted_x[idx_steps[:-1]], shifted_x[idx_steps[1:]])).T
colored_segments[:, :, 1] = np.repeat(y[idx_steps[:-1]], 2).reshape(-1, 2)
#generate discrete color list
n_levels, idx_levels = np.unique(y[idx_steps[:-1]], return_inverse=True)
colorarr = np.asarray(plt.cm.tab10.colors[:n_levels.size])
#and plot the colored segments
lc_cs = LineCollection(colored_segments, colors=colorarr[idx_levels, :], lw=10)
lines_cs = axes.flat[-1].add_collection(lc_cs)
#scaling and legend generation
axes.flat[-1].set_ylim(n_levels.min()-0.5, n_levels.max()+0.5)
axes.flat[-1].legend([Patch(color=colorarr[i, :]) for i, _ in enumerate(n_levels)],
[f"cat {i}" for i in n_levels],
loc="upper center", bbox_to_anchor=(0.5, -0.15),
ncol=n_levels.size)
plt.show()
Sample output:
Alternatively, you can use broken barh plots or color this axis or even all axes using axvspan.
I am scatter ploting data points with a very small marker (see screengrab below). When I use the very small marker ',' the legend is very hard to read (example code taken from here).
(Python 3, Jupyter lab)
How can I increase the size of the marker in the legend. The two versions shown on the above mentioned site do not work:
legend = ax.legend(frameon=True)
for legend_handle in legend.legendHandles:
legend_handle._legmarker.set_markersize(9)
and
ax.legend(markerscale=6)
The two solutions do however work when the marker is set to '.'.
How can I show bigger makers in the legend?
Sample Code from intoli.com:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(12)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for i in range(5):
mean = [np.random.random()*10, np.random.random()*10]
covariance = [ [1 + np.random.random(), np.random.random() - 1], [0, 1 + np.random.random()], ]
covariance[1][0] = covariance[0][1] # must be symmetric
x, y = np.random.multivariate_normal(mean, covariance, 3000).T
plt.plot(x, y, ',', label=f'Cluster {i + 1}')
ax.legend(markerscale=12)
fig.tight_layout()
plt.show()
You can get 1 pixel sized markers for a plot by setting the markersize to 1 pixel. This would look like
plt.plot(x, y, marker='s', markersize=72./fig.dpi, mec="None", ls="None")
What the above does is set the marker to a square, set the markersize to the ppi (points per inch) divided by dpi (dots per inch) == dots == pixels, and removes lines and edges.
Then the solution you tried using markerscale in the legend works nicely.
Complete example:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(12)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for i in range(5):
mean = [np.random.random()*10, np.random.random()*10]
covariance = [ [1 + np.random.random(), np.random.random() - 1], [0, 1 + np.random.random()], ]
covariance[1][0] = covariance[0][1] # must be symmetric
x, y = np.random.multivariate_normal(mean, covariance, 3000).T
plt.plot(x, y, marker='s', markersize=72./fig.dpi, mec="None", ls="None",
label=f'Cluster {i + 1}')
ax.legend(markerscale=12)
fig.tight_layout()
plt.show()
According to this discussion, the markersize has no effect when using pixels (,) as marker. How about generating a custom legend instead? For example, by adapting the first example in this tutorial, one can get a pretty decent legend:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
np.random.seed(12)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for i in range(5):
mean = [np.random.random()*10, np.random.random()*10]
covariance = [ [1 + np.random.random(), np.random.random() - 1], [0, 1 + np.random.random()], ]
covariance[1][0] = covariance[0][1] # must be symmetric
x, y = np.random.multivariate_normal(mean, covariance, 3000).T
plt.plot(x, y, ',', label=f'Cluster {i + 1}')
##generating custom legend
handles, labels = ax.get_legend_handles_labels()
patches = []
for handle, label in zip(handles, labels):
patches.append(mpatches.Patch(color=handle.get_color(), label=label))
legend = ax.legend(handles=patches)
fig.tight_layout()
plt.show()
The output would look like this:
I would like to generate a series of histogram shown below:
The above visualization was done in tensorflow but I'd like to reproduce the same visualization on matplotlib.
EDIT:
Using plt.fill_between suggested by #SpghttCd, I have the following code:
colors=cm.OrRd_r(np.linspace(.2, .6, 10))
plt.figure()
x = np.arange(100)
for i in range(10):
y = np.random.rand(100)
plt.fill_between(x, y + 10-i, 10-i,
facecolor=colors[i]
edgecolor='w')
plt.show()
This works great, but is it possible to use histogram instead of a continuous curve?
EDIT:
joypy based approach, like mentioned in the comment of october:
import pandas as pd
import joypy
import numpy as np
df = pd.DataFrame()
for i in range(0, 400, 20):
df[i] = np.random.normal(i/410*5, size=30)
joypy.joyplot(df, overlap=2, colormap=cm.OrRd_r, linecolor='w', linewidth=.5)
for finer control of colors, you can define a color gradient function which accepts a fractional index and start and stop color tuples:
def color_gradient(x=0.0, start=(0, 0, 0), stop=(1, 1, 1)):
r = np.interp(x, [0, 1], [start[0], stop[0]])
g = np.interp(x, [0, 1], [start[1], stop[1]])
b = np.interp(x, [0, 1], [start[2], stop[2]])
return (r, g, b)
Usage:
joypy.joyplot(df, overlap=2, colormap=lambda x: color_gradient(x, start=(.78, .25, .09), stop=(1.0, .64, .44)), linecolor='w', linewidth=.5)
Examples with different start and stop tuples:
original answer:
You could iterate over your dataarrays you'd like to plot with plt.fill_between, setting colors to some gradient and the line color to white:
creating some sample data:
import numpy as np
t = np.linspace(-1.6, 1.6, 11)
y = np.cos(t)**2
y2 = lambda : y + np.random.random(len(y))/5-.1
plot the series:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
colors = cm.OrRd_r(np.linspace(.2, .6, 10))
plt.figure()
for i in range(10):
plt.fill_between(t+i, y2()+10-i/10, 10-i/10, facecolor = colors[i], edgecolor='w')
If you want it to have more optimized towards your example you should perhaps consider providing some sample data.
EDIT:
As I commented below, I'm not quite sure if I understand what you want - or if you want the best for your task. Therefore here a code which plots besides your approach in your edit two smples of how to present a bunch of histograms in a way that they are better comparable:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cm as cm
N = 10
np.random.seed(42)
colors=cm.OrRd_r(np.linspace(.2, .6, N))
fig1 = plt.figure()
x = np.arange(100)
for i in range(10):
y = np.random.rand(100)
plt.fill_between(x, y + 10-i, 10-i,
facecolor=colors[i],
edgecolor='w')
data = np.random.binomial(20, .3, (N, 100))
fig2, axs = plt.subplots(N, figsize=(10, 6))
for i, d in enumerate(data):
axs[i].hist(d, range(20), color=colors[i], label=str(i))
fig2.legend(loc='upper center', ncol=5)
fig3, ax = plt.subplots(figsize=(10, 6))
ax.hist(data.T, range(20), color=colors, label=[str(i) for i in range(N)])
fig3.legend(loc='upper center', ncol=5)
This leads to the following plots:
your plot from your edit:
N histograms in N subplots:
N histograms side by side in one plot:
I have sparse scatter plot to visualize the comparison of predicted vs actual values. The range of the values are 1-4 and there are no decimal points.
I have tried plotly so far with hte following code (but I can also use a matplotlib solution):
my_scatter = go.Scatter(
x = y_actual, y = y_pred, mode = 'markers',
marker = dict(color = 'rgb(240, 189, 89)', opacity=0.5)
)
This prints the graph nicely (see below). I use opacity to see the density at each point. I.e. if two points lie on top of each other, the point will be shown in darker color. However, this is not explanatory enough. Is it possible to add the counts at each point as a label? There are some overlaps at certain intersections. I want to display how many points intersects. Can this be done automatically using matplotlib or plotly?
This answer uses matplotlib.
To answer the initial question first: You need to find out how often the data produces a point at a given coordinate to be able to annotate the points. If all values are integers this can easily be done using a 2d histogram. Out of the hstogram one would then select only those bins where the count value is nonzero and annotate the respective values in a loop:
x = [3, 0, 1, 2, 2, 0, 1, 3, 3, 3, 4, 1, 4, 3, 0]
y = [1, 0, 4, 3, 2, 1, 4, 0, 3, 0, 4, 2, 3, 3, 1]
import matplotlib.pyplot as plt
import numpy as np
x = np.array(x)
y = np.array(y)
hist, xbins,ybins = np.histogram2d(y,x, bins=range(6))
X,Y = np.meshgrid(xbins[:-1], ybins[:-1])
X = X[hist != 0]; Y = Y[hist != 0]
Z = hist[hist != 0]
fig, ax = plt.subplots()
ax.scatter(x,y, s=49, alpha=0.4)
for i in range(len(Z)):
ax.annotate(str(int(Z[i])), xy=(X[i],Y[i]), xytext=(4,0),
textcoords="offset points" )
plt.show()
You may then decide not to plot all points but the result from the histogramming which offers the chance to change the color and size of the scatter points,
ax.scatter(X,Y, s=(Z*20)**1.4, c = Z/Z.max(), cmap="winter_r", alpha=0.4)
Since all values are integers, you may also opt for an image plot,
fig, ax = plt.subplots()
ax.imshow(hist, cmap="PuRd")
for i in range(len(Z)):
ax.annotate(str(int(Z[i])), xy=(X[i],Y[i]), xytext=(0,0), color="w",
ha="center", va="center", textcoords="offset points" )
Without the necesity to calculate the number of occurances, another option is to use a hexbin plot. This gives slightly inaccurate positions of the dots, du to the hexagonal binning, but I still wanted to mention this option.
import matplotlib.pyplot as plt
import matplotlib.colors
import numpy as np
x = np.array(x)
y = np.array(y)
fig, ax = plt.subplots()
cmap = plt.cm.PuRd
cmaplist = [cmap(i) for i in range(cmap.N)]
cmaplist[0] = (1.0,1.0,1.0,1.0)
cmap = matplotlib.colors.LinearSegmentedColormap.from_list('mcm',cmaplist, cmap.N)
ax.hexbin(x,y, gridsize=20, cmap=cmap, linewidth=0 )
plt.show()
I'm using matplotlib to plot data (using plot and errorbar functions) from Python. I have to plot a set of totally separate and independent plots, and then adjust their ylim values so they can be easily visually compared.
How can I retrieve the ylim values from each plot, so that I can take the min and max of the lower and upper ylim values, respectively, and adjust the plots so they can be visually compared?
Of course, I could just analyze the data and come up with my own custom ylim values... but I'd like to use matplotlib to do that for me. Any suggestions on how to easily (and efficiently) do this?
Here's my Python function that plots using matplotlib:
import matplotlib.pyplot as plt
def myplotfunction(title, values, errors, plot_file_name):
# plot errorbars
indices = range(0, len(values))
fig = plt.figure()
plt.errorbar(tuple(indices), tuple(values), tuple(errors), marker='.')
# axes
axes = plt.gca()
axes.set_xlim([-0.5, len(values) - 0.5])
axes.set_xlabel('My x-axis title')
axes.set_ylabel('My y-axis title')
# title
plt.title(title)
# save as file
plt.savefig(plot_file_name)
# close figure
plt.close(fig)
Just use axes.get_ylim(), it is very similar to set_ylim. From the docs:
get_ylim()
Get the y-axis range [bottom, top]
ymin, ymax = axes.get_ylim()
If you are using the plt api directly, you can avoid calls to axes altogether:
def myplotfunction(title, values, errors, plot_file_name):
# plot errorbars
indices = range(0, len(values))
fig = plt.figure()
plt.errorbar(tuple(indices), tuple(values), tuple(errors), marker='.')
plt.ylim([-0.5, len(values) - 0.5])
plt.xlabel('My x-axis title')
plt.ylabel('My y-axis title')
# title
plt.title(title)
# save as file
plt.savefig(plot_file_name)
# close figure
plt.close(fig)
Leveraging from the good answers above and assuming you were only using plt as in
import matplotlib.pyplot as plt
then you can get all four plot limits using plt.axis() as in the following example.
import matplotlib.pyplot as plt
x = [1, 2, 3, 4, 5, 6, 7, 8] # fake data
y = [1, 2, 3, 4, 3, 2, 5, 6]
plt.plot(x, y, 'k')
xmin, xmax, ymin, ymax = plt.axis()
s = 'xmin = ' + str(round(xmin, 2)) + ', ' + \
'xmax = ' + str(xmax) + '\n' + \
'ymin = ' + str(ymin) + ', ' + \
'ymax = ' + str(ymax) + ' '
plt.annotate(s, (1, 5))
plt.show()
The above code should produce the following output plot.
Just use plt.ylim(), it can be used to set or get the min and max limit
ymin, ymax = plt.ylim()
I put above-mentioned methods together using ax instead of plt
import numpy as np
import matplotlib.pyplot as plt
x = range(100)
y = x
fig, ax = plt.subplots(1, 1, figsize=(7.2, 7.2))
ax.plot(x, y);
# method 1
print(ax.get_xlim())
print(ax.get_xlim())
# method 2
print(ax.axis())
It's an old question, but I don't see mentioned that, depending on the details, the sharey option may be able to do all of this for you, instead of digging up axis limits, margins, etc. There's a demo in the docs that shows how to use sharex, but the same can be done with y-axes.