how to create a column of dots in python [duplicate] - python

I'd like to create what my statistics book calls a "dot plot" where the number of dots in the plot equals the number of observations. Here's an example from mathisfun.com:
In the example, there are six dots above the 0 value on the X-axis representing the six observations of the value zero.
It seems that a "dot plot" can have several variations. In looking up how to create this with Matplotlib, I only came across what I know of as a scatter plot with a data point representing the relationship between the X and Y value.
Is the type of plot I'm trying to create possible with Matplotlib?

Suppose you have some data that would produce a histogram like the following:
# Histogram of 72 random integers drawn from [0, 12), one bar per integer.
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(13)  # fixed seed so the figure is reproducible
data = np.random.randint(0, 12, size=72)
bins = np.arange(13) - 0.5  # edges at -0.5, 0.5, ..., 11.5
plt.hist(data, bins=bins, ec="k")
plt.show()
You may create your dot plot by calculating the histogram and plotting a scatter plot of all possible points, the color of the points being white if they exceed the number given by the histogram.
# Dot plot drawn as a full grid of candidate points; each point is coloured
# by whether its height is within the histogram count of its column.
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(13)
data = np.random.randint(0, 12, size=72)
bins = np.arange(13) - 0.5
hist, edges = np.histogram(data, bins=bins)

x = np.arange(12)
y = np.arange(1, hist.max() + 1)
X, Y = np.meshgrid(x, y)

within_count = Y <= hist  # broadcasts each row of heights against the counts
plt.scatter(X, Y, c=within_count, cmap="Greys")
plt.show()
Alternatively you may set the unwanted points to nan,
# np.float was only an alias of the builtin float and has been removed from
# NumPy (1.24); the builtin works on every NumPy version.
Y = Y.astype(float)
# Heights above a column's count become NaN so that no dot is left there.
Y[Y > hist] = np.nan
plt.scatter(X, Y)

This answer is built on the code posted by eyllanesc in his comment to the question as I find it elegant enough to merit an illustrative example. I provide two versions: a simple one where formatting parameters have been set manually and a second version where some of the formatting parameters are set automatically based on the data.
Simple version with manual formatting
import numpy as np  # v 1.19.2
import matplotlib.pyplot as plt  # v 3.3.2

# Create random data
rng = np.random.default_rng(123)  # random number generator
data = rng.integers(0, 13, size=40)
values, counts = np.unique(data, return_counts=True)

# Draw dot plot with appropriate figure size, marker size and y-axis limits
fig, ax = plt.subplots(figsize=(6, 2.25))
for value, count in zip(values, counts):
    # One column per distinct value: `count` cyan dots at heights 0..count-1.
    heights = list(range(count))
    ax.plot([value] * count, heights, 'co', ms=10, linestyle='')

# Keep only the bottom spine and the x tick labels.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_visible(False)
ax.set_ylim(-1, max(counts))
ax.set_xticks(range(min(values), max(values) + 1))
ax.tick_params(axis='x', length=0, pad=8, labelsize=12)

plt.show()
Advanced version with automated formatting
If you plan on using this plot quite often, it can be useful to add some automated formatting parameters to get appropriate figure dimensions and marker size. In the following example, the parameters are defined in a way that works best with the kind of data for which this type of plot is typically useful (integer data with a range of up to a few dozen units and no more than a few hundred data points).
# Create random data
rng = np.random.default_rng(1)  # random number generator
data = rng.integers(0, 21, size=100)
values, counts = np.unique(data, return_counts=True)

# Set formatting parameters based on data
data_range = max(values) - min(values)
if data_range < 30:
    width = data_range / 2
else:
    width = 15
if data_range < 50:
    height = max(counts) / 3
    marker_size = 10
else:
    height = max(counts) / 4
    marker_size = np.ceil(30 / (data_range // 10))

# Create dot plot with appropriate format
fig, ax = plt.subplots(figsize=(width, height))
for value, count in zip(values, counts):
    # One column per distinct value: `count` dots at heights 0..count-1.
    heights = list(range(count))
    ax.plot([value] * count, heights, marker='o', color='tab:blue',
            ms=marker_size, linestyle='')
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_visible(False)
ax.set_ylim(-1, max(counts))
ax.set_xticks(range(min(values), max(values) + 1))
ax.tick_params(axis='x', length=0, pad=10)
plt.show()

Pass your dataset to this function:
def dot_diagram(dataset):
    """Draw a dot plot of ``dataset`` on a new figure.

    Every distinct value gets a column of blue dots, one dot per occurrence.
    Figure size and marker size are derived from the value range and the
    largest count. The x ticks are built with ``range``, so the values are
    expected to be integers.

    Args:
        dataset: array-like of observations; must not be empty.

    Returns:
        The ``(fig, ax)`` pair of the new figure, so the caller can show,
        save or tweak it. (The function used to return ``None``.)

    Raises:
        ValueError: if ``dataset`` is empty.
    """
    values, counts = np.unique(dataset, return_counts=True)
    if values.size == 0:
        raise ValueError("dot_diagram() requires at least one observation")
    lowest, highest = values.min(), values.max()
    tallest = counts.max()
    data_range = highest - lowest
    # A single distinct value gives data_range == 0, which used to request a
    # zero-width figure; never go below one inch.
    width = max(data_range / 2, 1) if data_range < 30 else 15
    height = tallest / 3 if data_range < 50 else tallest / 4
    marker_size = 10 if data_range < 50 else np.ceil(30 / (data_range // 10))
    fig, ax = plt.subplots(figsize=(width, height))
    for value, count in zip(values, counts):
        ax.plot([value] * count, list(range(count)), marker='o',
                color='tab:blue', ms=marker_size, linestyle='')
    for spine in ['top', 'right', 'left']:
        ax.spines[spine].set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_ylim(-1, tallest)
    ax.set_xticks(range(lowest, highest + 1))
    ax.tick_params(axis='x', length=0, pad=10)
    return fig, ax

Let's say this is my data:
data = [5,8,3,7,1,5,3,2,3,3,8,5]
In order to plot a "dot plot", I will need the data (x-axis) and frequency (y-axis)
pos = []   # pos[i]: how many times data[i] has occurred up to and including i
keys = {}  # running count per distinct value
for num in data:
    keys[num] = keys.get(num, 0) + 1
    # The original appended to an undefined name `apos` in the repeat branch
    # (NameError); every occurrence must land in `pos`.
    pos.append(keys[num])
print(pos)
[1, 1, 1, 1, 1, 2, 2, 1, 3, 4, 2, 3]
plt.scatter(data, pos)
plt.show()

Recently, I have also come up with something like this. And I have made the following for my case.
Hope this is helpful.
We first generate the frequency table, then generate points from it and draw them as a scatter plot. That's all — super simple.
For example, in your case, we have for 0 minutes, 6 people. This frequency can be converted into
[(0,1),(0,2),(0,3),(0,4),(0,5),(0,6)]
Then these points simply have to be plotted using pyplot.scatter.
import numpy as np
import matplotlib.pyplot as plt
def generate_points_for_dotplot(arr):
    """Return the dot coordinates for a dot plot of ``arr``.

    Each distinct value that occurs ``count`` times contributes the points
    ``(value, 0), (value, 1), ..., (value, count - 1)``, in ascending order
    of value.

    Args:
        arr: array-like of observations; may be empty.

    Returns:
        A NumPy array of shape ``(2, len(arr))``: row 0 holds the x
        coordinates, row 1 the y coordinates.
    """
    values, counts = np.unique(arr, return_counts=True)
    # Repeat every value once per occurrence instead of building and then
    # unzipping a Python list of tuples.
    x = np.repeat(values, counts)
    y = [level for count in counts for level in range(count)]
    return np.array([x, y])
Of course, this function return an array of two arrays, one for x co-ordinates and the other for y co-ordinates (Just because, thats how pyplot needs the points!). Now, we have the function to generate the points required to us, let us plot it then.
arr = np.random.randint(1, 21, size=100)
x, y = generate_points_for_dotplot(arr)

# Plotting
fig_width = max(x) / 3  # feel free to use Patricks answer to make it more dynamic
fig, ax = plt.subplots(figsize=(fig_width, 3))
ax.scatter(x, y, s=100, facecolors='none', edgecolors='black')
ax.set_xticks(np.unique(x))
ax.yaxis.set_visible(False)

# removing the spines
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
plt.show()
Output:
If the x ticks become overwhelming, you can rotate them. However, with a larger number of values that also becomes clumsy.

Related

Remove the x-axis spikes with empty labels in matplotlib

I have a plot in which I have to divide my data points into several groups, so I made customized ticks for this plot.
For instance, I have to group data points into multiples of 12, this is what I did
# Put a tick only at the first point of every group of 12 and label it with
# the 1-based group number. Creating a tick for every point and blanking most
# of the labels leaves the unlabeled tick marks on the axis.
group_size = 12
tick_positions = list(range(0, len(all_points), group_size))
my_xticks = [position // group_size + 1 for position in tick_positions]
ax.set_xticks(tick_positions)
ax.set_xticklabels(my_xticks)
And the x-axis of the plot looks as
However, I wish to remove those spikes with empty labels, as circled in red
So the final x-axis could look like
Any idea? Thanks!
You didn't provide any data so i solved this by using some data i created. the idea is to use the range function to create the same gap between each tick.
Here is my code:
from matplotlib import pyplot as plt
import numpy as np

# create sample data
x = np.linspace(1, 60, 100)
y = x * x

# one tick every `space` x-units, starting at the truncated minimum of x and
# stopping before the truncated maximum
space = 12
min_val = int(min(x))
max_val = int(max(x))
xticks = list(range(min_val, max_val, space))

# label the ticks 1, 2, 3, ... rather than with their x positions
xticklabels = [number for number, _ in enumerate(xticks, start=1)]

# create plot
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels)
plt.show()
And output:

Forcing axis labels - How to include x,y inputs for a Z matrix to have sensible axes

I have a Z matrix and when I plug it into pcolormesh, it works perfectly and gives me the following plot. The only problem is that the axes now displays the matrix indices. The code that I used make it is given below:
#boo - most of the parameters like title,xyz labels, filename comes from command line
data = np.loadtxt(args.data, dtype=float, comments="#")
cmap = plt.get_cmap('bwr')
fig, ax0 = plt.subplots()
# DivergingNorm was renamed TwoSlopeNorm in Matplotlib 3.2 and the old name
# was later removed; fall back to it only where the new name does not exist.
norm_class = getattr(colors, 'TwoSlopeNorm', None) or colors.DivergingNorm
divnorm = norm_class(vmin=np.amin(data), vcenter=0, vmax=np.amax(data))
# Without explicit coordinates the axes show the matrix indices.
im0 = ax0.pcolormesh(data, norm=divnorm, cmap=cmap)
fig.colorbar(im0, ax=ax0)
ax0.set_title(str(title))
plt.xlabel(str(xlabel))
plt.ylabel(str(ylabel))
filename = str(prefix) + "." + str(fileformat)
plt.savefig(filename)
I wanted to rescale the x-axis by a factor of 0.1 (ended up doing it manually since I did not see a workaround) and set the y-axis to change with respect to another array (Note that: I'm not manipulating Z matrix instead I'm using a physically meaningful experimental value array - here, sortData - corresponding to matrix indices). I changed my code as follows - x axis and yaxis seem alright but my heatmap looks different. Can someone shine some light on this? Many Thanks
#foo
Data = np.loadtxt(args.data, dtype=float, comments="#")
sort = np.loadtxt(args.sortData, dtype=float, comments="#")
fig, ax0 = plt.subplots()
cmap = plt.get_cmap('bwr')
# DivergingNorm was renamed TwoSlopeNorm in Matplotlib 3.2 and the old name
# was later removed; fall back to it only where the new name does not exist.
norm_class = getattr(colors, 'TwoSlopeNorm', None) or colors.DivergingNorm
divnorm = norm_class(vmin=np.amin(Data), vcenter=0, vmax=np.amax(Data))
x = np.arange(0.0, 10.6, 0.1)  # need to set the ticks manually
y = sort[:, 1]  # experimental values used as the y coordinates
X, Y = np.meshgrid(x, y)
# The former `Z = z.reshape(len(y), len(x))` line was dropped: `z` is never
# defined (NameError) and `Z` was never used; `Data` is what gets plotted.
im0 = ax0.pcolormesh(X, Y, Data, norm=divnorm, cmap=cmap)
cbar = fig.colorbar(im0, ax=ax0)
if args.zlabel is not None:
    cbar.ax.set_ylabel(str(args.zlabel))
ax0.set_title(str(args.title))
plt.xlabel(str(args.xlabel))
plt.ylabel(str(args.ylabel))
filename = str(args.prefix) + "." + str(args.fileformat)
plt.savefig(filename)
EDIT 1:
When I plot the boo, y-axis is uniformly spaced since we are dealing with the matrix indices. When I plot foo, they are not since the array values corresponding to these indices ( not that of Data matrix but the external y array that is of same dim as Data but has values stored in it corresponding to expt) are not equally spaced. The problem is The y values corresponding to The first 5 y indexes of Data matrix are 1.32, 3.200, 3.311, 3.38, 3.40 and their x values change throughout the range [xmin to xmax]. But there's a giant blob of red thing between 0 and 5 (Y) that goes horizontally all the way till the end of xlim. Clearly something is wrong but can't figure out what it is.
I'm not 100% clear on what you're trying to do, but if you're trying to plot the data in Boo but with some different set of tick labels then I think a modification of the following self-contained example would probably work for you.
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors

title = 'Foo'
xlabel = 'X'
ylabel = 'Y'

rv = np.random.rand(100)  # uniform random vector
data = rv[:, None] - rv.T  # pairwise diffs (100 x 100 via broadcasting)

cmap = plt.get_cmap('bwr')
fig, ax = plt.subplots()
# DivergingNorm was renamed TwoSlopeNorm in Matplotlib 3.2 and the old name
# was later removed; fall back to it only where the new name does not exist.
norm_class = getattr(colors, 'TwoSlopeNorm', None) or colors.DivergingNorm
divnorm = norm_class(vmin=np.amin(data), vcenter=0, vmax=np.amax(data))
im0 = ax.pcolormesh(data, norm=divnorm, cmap=cmap)
fig.colorbar(im0, ax=ax)

# do tick labeling stuff here
nticks = 5
x_tick_pos = np.linspace(0, 100, nticks)  # positions in matrix-index units
y_tick_pos = np.linspace(0, 100, nticks)
ax.set_xticks(x_tick_pos)
ax.set_yticks(y_tick_pos)
xtick_labels = [str(x) for x in np.linspace(0, 10, nticks)]  # can be any list of strings
ytick_labels = [str(y) for y in np.linspace(0, 10, nticks)]  # len must match nticks
ax.set_xticklabels(xtick_labels)
ax.set_yticklabels(ytick_labels)

ax.set_title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.show()
Note that if you want to do fancier things, like have the tick labels rotated so that they can be easier to read, you might be aided by checking out the matplotlib tutorial on labeling heatmaps.

How to create a "dot plot" in Matplotlib? (not a scatter plot)

I'd like to create what my statistics book calls a "dot plot" where the number of dots in the plot equals the number of observations. Here's an example from mathisfun.com:
In the example, there are six dots above the 0 value on the X-axis representing the six observations of the value zero.
It seems that a "dot plot" can have several variations. In looking up how to create this with Matplotlib, I only came across what I know of as a scatter plot with a data point representing the relationship between the X and Y value.
Is the type of plot I'm trying to create possible with Matplotlib?
Suppose you have some data that would produce a histogram like the following:
# Histogram of 72 random integers drawn from [0, 12), one bar per integer.
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(13)  # fixed seed so the figure is reproducible
data = np.random.randint(0, 12, size=72)
bins = np.arange(13) - 0.5  # edges at -0.5, 0.5, ..., 11.5
plt.hist(data, bins=bins, ec="k")
plt.show()
You may create your dot plot by calculating the histogram and plotting a scatter plot of all possible points, the color of the points being white if they exceed the number given by the histogram.
# Dot plot drawn as a full grid of candidate points; each point is coloured
# by whether its height is within the histogram count of its column.
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(13)
data = np.random.randint(0, 12, size=72)
bins = np.arange(13) - 0.5
hist, edges = np.histogram(data, bins=bins)

x = np.arange(12)
y = np.arange(1, hist.max() + 1)
X, Y = np.meshgrid(x, y)

within_count = Y <= hist  # broadcasts each row of heights against the counts
plt.scatter(X, Y, c=within_count, cmap="Greys")
plt.show()
Alternatively you may set the unwanted points to nan,
# np.float was only an alias of the builtin float and has been removed from
# NumPy (1.24); the builtin works on every NumPy version.
Y = Y.astype(float)
# Heights above a column's count become NaN so that no dot is left there.
Y[Y > hist] = np.nan
plt.scatter(X, Y)
This answer is built on the code posted by eyllanesc in his comment to the question as I find it elegant enough to merit an illustrative example. I provide two versions: a simple one where formatting parameters have been set manually and a second version where some of the formatting parameters are set automatically based on the data.
Simple version with manual formatting
import numpy as np  # v 1.19.2
import matplotlib.pyplot as plt  # v 3.3.2

# Create random data
rng = np.random.default_rng(123)  # random number generator
data = rng.integers(0, 13, size=40)
values, counts = np.unique(data, return_counts=True)

# Draw dot plot with appropriate figure size, marker size and y-axis limits
fig, ax = plt.subplots(figsize=(6, 2.25))
for value, count in zip(values, counts):
    # One column per distinct value: `count` cyan dots at heights 0..count-1.
    heights = list(range(count))
    ax.plot([value] * count, heights, 'co', ms=10, linestyle='')

# Keep only the bottom spine and the x tick labels.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_visible(False)
ax.set_ylim(-1, max(counts))
ax.set_xticks(range(min(values), max(values) + 1))
ax.tick_params(axis='x', length=0, pad=8, labelsize=12)

plt.show()
Advanced version with automated formatting
If you plan on using this plot quite often, it can be useful to add some automated formatting parameters to get appropriate figure dimensions and marker size. In the following example, the parameters are defined in a way that works best with the kind of data for which this type of plot is typically useful (integer data with a range of up to a few dozen units and no more than a few hundred data points).
# Create random data
rng = np.random.default_rng(1)  # random number generator
data = rng.integers(0, 21, size=100)
values, counts = np.unique(data, return_counts=True)

# Set formatting parameters based on data
data_range = max(values) - min(values)
if data_range < 30:
    width = data_range / 2
else:
    width = 15
if data_range < 50:
    height = max(counts) / 3
    marker_size = 10
else:
    height = max(counts) / 4
    marker_size = np.ceil(30 / (data_range // 10))

# Create dot plot with appropriate format
fig, ax = plt.subplots(figsize=(width, height))
for value, count in zip(values, counts):
    # One column per distinct value: `count` dots at heights 0..count-1.
    heights = list(range(count))
    ax.plot([value] * count, heights, marker='o', color='tab:blue',
            ms=marker_size, linestyle='')
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_visible(False)
ax.set_ylim(-1, max(counts))
ax.set_xticks(range(min(values), max(values) + 1))
ax.tick_params(axis='x', length=0, pad=10)
plt.show()
Pass your dataset to this function:
def dot_diagram(dataset):
    """Draw a dot plot of ``dataset`` on a new figure.

    Every distinct value gets a column of blue dots, one dot per occurrence.
    Figure size and marker size are derived from the value range and the
    largest count. The x ticks are built with ``range``, so the values are
    expected to be integers.

    Args:
        dataset: array-like of observations; must not be empty.

    Returns:
        The ``(fig, ax)`` pair of the new figure, so the caller can show,
        save or tweak it. (The function used to return ``None``.)

    Raises:
        ValueError: if ``dataset`` is empty.
    """
    values, counts = np.unique(dataset, return_counts=True)
    if values.size == 0:
        raise ValueError("dot_diagram() requires at least one observation")
    lowest, highest = values.min(), values.max()
    tallest = counts.max()
    data_range = highest - lowest
    # A single distinct value gives data_range == 0, which used to request a
    # zero-width figure; never go below one inch.
    width = max(data_range / 2, 1) if data_range < 30 else 15
    height = tallest / 3 if data_range < 50 else tallest / 4
    marker_size = 10 if data_range < 50 else np.ceil(30 / (data_range // 10))
    fig, ax = plt.subplots(figsize=(width, height))
    for value, count in zip(values, counts):
        ax.plot([value] * count, list(range(count)), marker='o',
                color='tab:blue', ms=marker_size, linestyle='')
    for spine in ['top', 'right', 'left']:
        ax.spines[spine].set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_ylim(-1, tallest)
    ax.set_xticks(range(lowest, highest + 1))
    ax.tick_params(axis='x', length=0, pad=10)
    return fig, ax
Let's say this is my data:
data = [5,8,3,7,1,5,3,2,3,3,8,5]
In order to plot a "dot plot", I will need the data (x-axis) and frequency (y-axis)
pos = []   # pos[i]: how many times data[i] has occurred up to and including i
keys = {}  # running count per distinct value
for num in data:
    keys[num] = keys.get(num, 0) + 1
    # The original appended to an undefined name `apos` in the repeat branch
    # (NameError); every occurrence must land in `pos`.
    pos.append(keys[num])
print(pos)
[1, 1, 1, 1, 1, 2, 2, 1, 3, 4, 2, 3]
plt.scatter(data, pos)
plt.show()
Recently, I have also come up with something like this. And I have made the following for my case.
Hope this is helpful.
We first generate the frequency table, then generate points from it and draw them as a scatter plot. That's all — super simple.
For example, in your case, we have for 0 minutes, 6 people. This frequency can be converted into
[(0,1),(0,2),(0,3),(0,4),(0,5),(0,6)]
Then these points simply have to be plotted using pyplot.scatter.
import numpy as np
import matplotlib.pyplot as plt
def generate_points_for_dotplot(arr):
    """Return the dot coordinates for a dot plot of ``arr``.

    Each distinct value that occurs ``count`` times contributes the points
    ``(value, 0), (value, 1), ..., (value, count - 1)``, in ascending order
    of value.

    Args:
        arr: array-like of observations; may be empty.

    Returns:
        A NumPy array of shape ``(2, len(arr))``: row 0 holds the x
        coordinates, row 1 the y coordinates.
    """
    values, counts = np.unique(arr, return_counts=True)
    # Repeat every value once per occurrence instead of building and then
    # unzipping a Python list of tuples.
    x = np.repeat(values, counts)
    y = [level for count in counts for level in range(count)]
    return np.array([x, y])
Of course, this function return an array of two arrays, one for x co-ordinates and the other for y co-ordinates (Just because, thats how pyplot needs the points!). Now, we have the function to generate the points required to us, let us plot it then.
arr = np.random.randint(1, 21, size=100)
x, y = generate_points_for_dotplot(arr)

# Plotting
fig_width = max(x) / 3  # feel free to use Patricks answer to make it more dynamic
fig, ax = plt.subplots(figsize=(fig_width, 3))
ax.scatter(x, y, s=100, facecolors='none', edgecolors='black')
ax.set_xticks(np.unique(x))
ax.yaxis.set_visible(False)

# removing the spines
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
plt.show()
Output:
If the x ticks become overwhelming, you can rotate them. However, with a larger number of values that also becomes clumsy.

How to make pyplot have evenly spread y ticks with values [0, 1/2, 3/4, 7/8, ...]

I would like to compare a few algorithms by a graph of their convergence probability curves.
Currently, my graph looks as follows:
which does not allow to see the difference in many of the curves.
I want to have y axis to be "logarithmic", but in its difference from the value 1, i.e. I want the y values to be [0, 1/2, 3/4, 7/8, 15/16, ... 1023/1024], but so every tick would still have the same distance from the last (i.e., the distance from 1/2 to 3/4 is the same as the one from 15/16 to 31/32).
I've tried using the yticks() function, but it doesn't place the ticks evenly:
How do I make this axis look right?
My current code:
def plotCDFs(CDFs, names = []):
    """Plot every CDF as a curve over a log-scaled x axis and show the figure.

    Each item of ``CDFs`` is a mapping; its sorted keys are plotted against
    its sorted values. ``names[i]`` becomes the legend entry of ``CDFs[i]``.
    The y axis is fixed to the linear range [0, 1].
    """
    labels = []
    for index, cdf in enumerate(CDFs):
        plt.plot(sorted(cdf), sorted(cdf.values()))
        labels.append(str(names[index]))
    plt.title('Cumulative Distribution')
    plt.legend(labels, loc='lower right')
    plt.xscale('log')
    axes = plt.gca()
    axes.set_ylim([0,1])
    #plt.yticks([1-2**-i for i in xrange(11)])
    plt.show()
There are two possibilities: You can plot 1-cumulative Distribution in an ordinary log-log plot which is what I usually do or you (probably) have to create your own log-plot as you describe above. At least I have never seen a builtin function which achieves this.
This code should work
import numpy as np
import matplotlib.pyplot as plt
def ToLog(x):
    """Map probabilities to an axis that is logarithmic in the distance to 1.

    Computes ``1 - log10(1 - x)`` element-wise, so 0 maps to 1, 0.9 to 2,
    0.99 to 3, and so on; ``x == 1`` maps to ``+inf``.

    Args:
        x: a number or any array-like (plain lists included) of numbers.

    Returns:
        A NumPy scalar or array with the same shape as ``x``.
    """
    # Plain lists do not support `1. - x`; convert first so that callers may
    # pass the result of sorted(...) directly.
    x = np.asarray(x, dtype=float)
    return 1. - np.log10(1. - x)
def plotCDFs(CDFs, names=None):
    """Plot CDF curves with y ticks 0, 1/2, 3/4, 7/8, ... spaced evenly.

    The y data are transformed with ``ToLog`` before plotting, which makes
    the values 1 - 2**-k equidistant; the ticks are then labelled with the
    untransformed fractions. The x axis is log-scaled.

    Args:
        CDFs: iterable of mappings; the sorted keys of each mapping are
            plotted against its sorted values (probabilities in [0, 1]).
        names: optional sequence of legend labels, one per CDF. A CDF
            without a name is labelled with its index.
    """
    names = [] if names is None else names
    legend = []
    max_val = 0.0  # largest plotted probability that is strictly below 1
    for i, CDF in enumerate(CDFs):
        keys = sorted(CDF)
        # An ndarray (not the list returned by sorted) is needed for the
        # masking below and for the arithmetic inside ToLog.
        vals = np.array(sorted(CDF.values()), dtype=float)
        below_one = vals[vals < 1.0]
        if below_one.size:
            max_val = max(max_val, float(below_one.max()))
        # NOTE(review): a probability of exactly 1 transforms to +inf and is
        # expected to be left out of the drawn line - confirm on your data.
        plt.plot(keys, ToLog(vals))
        legend.append(str(names[i]) if i < len(names) else str(i))
    plt.title('Cumulative Distribution')
    plt.legend(legend, loc='lower right')
    plt.xscale('log')
    # handling the yaxis ticks and ticklabels
    # Number of halvings of the distance to 1 needed to cover max_val; at
    # least one, so that the axis limits differ.
    n_halvings = int(np.ceil(-np.log(1. - max_val) / np.log(2.)))
    n_halvings = max(n_halvings, 1)
    denominators = [2 ** k for k in range(n_halvings + 1)]
    yticks = np.array([1. - 1. / d for d in denominators])
    tick_positions = 1. - np.log10(1. - yticks)  # same transform as ToLog
    tick_labels = ['0'] + [str(d - 1) + '/' + str(d) for d in denominators[1:]]
    ax = plt.gca()
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(tick_labels)
    # The limits live in transformed space: ToLog(0) == 1 at the bottom.
    ax.set_ylim([tick_positions[0], tick_positions[-1]])
    plt.show()
Note that ToLog must be applied on all ydata before plotting.

Is there a function to make scatterplot matrices in matplotlib?

Example of scatterplot matrix
Is there such a function in matplotlib.pyplot?
For those who do not want to define their own functions, there is a great data analysis library in Python called Pandas, which provides the scatter_matrix() function:
# The snippet used `pd` and `np` without importing them.
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

# 1000 samples of four independent standard-normal variables.
df = pd.DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd'])
scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde')
Generally speaking, matplotlib doesn't usually contain plotting functions that operate on more than one axes object (subplot, in this case). The expectation is that you'd write a simple function to string things together however you'd like.
I'm not quite sure what your data looks like, but it's quite simple to just build a function to do this from scratch. If you're always going to be working with structured or rec arrays, then you can simplify this a touch. (i.e. There's always a name associated with each data series, so you can omit having to specify names.)
As an example:
import itertools
import numpy as np
import matplotlib.pyplot as plt
def main():
    """Show a scatterplot matrix of four random 10-sample variables."""
    np.random.seed(1977)
    numvars, numdata = 4, 10
    data = 10 * np.random.random((numvars, numdata))
    labels = ['mpg', 'disp', 'drat', 'wt']
    # Unfilled black circles, no connecting lines.
    marker_style = dict(linestyle='none', marker='o', color='black',
                        mfc='none')
    fig = scatterplot_matrix(data, labels, **marker_style)
    fig.suptitle('Simple Scatterplot Matrix')
    plt.show()
def scatterplot_matrix(data, names, **kwargs):
    """Plots a scatterplot matrix of subplots. Each row of "data" is plotted
    against other rows, resulting in a nrows by nrows grid of subplots with the
    diagonal subplots labeled with "names". Additional keyword arguments are
    passed on to matplotlib's "plot" command. Returns the matplotlib figure
    object containing the subplot grid.

    "data" must be a 2-D array with one row per variable. The subplot in grid
    row r and column c shows variable c on its x axis and variable r on its
    y axis, so subplots in the same row share y data and subplots in the same
    column share x data."""
    numvars, _ = data.shape
    # squeeze=False keeps "axes" a 2-D array even for a single variable.
    fig, axes = plt.subplots(nrows=numvars, ncols=numvars, figsize=(8,8),
                             squeeze=False)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    last = numvars - 1
    for row, axes_row in enumerate(axes):
        for col, ax in enumerate(axes_row):
            # Hide all ticks and labels
            ax.xaxis.set_visible(False)
            ax.yaxis.set_visible(False)

            # Set up ticks only on one side for the "edge" subplots. Grid
            # indices are used because Axes.is_first_col() and its siblings
            # were deprecated in Matplotlib 3.4 and later removed.
            if col == 0:
                ax.yaxis.set_ticks_position('left')
            if col == last:
                ax.yaxis.set_ticks_position('right')
            if row == 0:
                ax.xaxis.set_ticks_position('top')
            if row == last:
                ax.xaxis.set_ticks_position('bottom')

    # Plot the data.
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for x, y in [(i,j), (j,i)]:
            # Column variable on the x axis, row variable on the y axis;
            # the reverse order leaves the ticks of a row/column misaligned.
            axes[x,y].plot(data[y], data[x], **kwargs)

    # Label the diagonal subplots...
    for i, label in enumerate(names):
        axes[i,i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                ha='center', va='center')

    # Turn on the proper x or y axes ticks.
    for i, j in zip(range(numvars), itertools.cycle((-1, 0))):
        axes[j,i].xaxis.set_visible(True)
        axes[i,j].yaxis.set_visible(True)

    # With an odd number of variables the loop above shows the ticks of the
    # bottom-right diagonal subplot, which holds no data and would keep the
    # default 0..1 limits; copy the limits of its column and row instead.
    if numvars % 2:
        axes[-1,-1].set_xlim(axes[0,-1].get_xlim())
        axes[-1,-1].set_ylim(axes[-1,0].get_ylim())

    return fig
main()
You can also use Seaborn's pairplot function:
import seaborn as sns

sns.set()  # apply seaborn's default theme
# Pairwise scatter plots of the iris columns, coloured by species.
iris = sns.load_dataset("iris")
sns.pairplot(iris, hue="species")
Thanks for sharing your code! You figured out all the hard stuff for us. As I was working with it, I noticed a few little things that didn't look quite right.
[FIX #1] The axis ticks weren't lining up as I would expect: in your example above, you should be able to draw a vertical and a horizontal line through any point across all plots, and the lines should pass through the corresponding point in the other plots, but as it stands this doesn't happen.
[FIX #2] If you have an odd number of variables you are plotting with, the bottom right corner axes doesn't pull the correct xtics or ytics. It just leaves it as the default 0..1 ticks.
Not a fix, but I made it optional to explicitly input names, so that it puts a default xi for variable i in the diagonal positions.
Below you'll find an updated version of your code that addresses these two points, otherwise preserving the beauty of your code.
import itertools
import numpy as np
import matplotlib.pyplot as plt
def scatterplot_matrix(data, names=None, **kwargs):
    """
    Plots a scatterplot matrix of subplots. Each row of "data" is plotted
    against other rows, resulting in a nrows by nrows grid of subplots with the
    diagonal subplots labeled with "names". Additional keyword arguments are
    passed on to matplotlib's "plot" command. Returns the matplotlib figure
    object containing the subplot grid.

    "data" must be a 2-D array with one row per variable. When "names" is
    omitted or empty, the variables are labeled x0, x1, ... The subplot in
    grid row r and column c shows variable c on its x axis and variable r on
    its y axis.
    """
    numvars, _ = data.shape
    # squeeze=False keeps "axes" a 2-D array even for a single variable.
    fig, axes = plt.subplots(nrows=numvars, ncols=numvars, figsize=(8,8),
                             squeeze=False)
    fig.subplots_adjust(hspace=0.0, wspace=0.0)

    last = numvars - 1
    for row, axes_row in enumerate(axes):
        for col, ax in enumerate(axes_row):
            # Hide all ticks and labels
            ax.xaxis.set_visible(False)
            ax.yaxis.set_visible(False)

            # Set up ticks only on one side for the "edge" subplots. Grid
            # indices are used because Axes.is_first_col() and its siblings
            # were deprecated in Matplotlib 3.4 and later removed.
            if col == 0:
                ax.yaxis.set_ticks_position('left')
            if col == last:
                ax.yaxis.set_ticks_position('right')
            if row == 0:
                ax.xaxis.set_ticks_position('top')
            if row == last:
                ax.xaxis.set_ticks_position('bottom')

    # Plot the data.
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for x, y in [(i,j), (j,i)]:
            # FIX #1: this needed to be changed from ...(data[x], data[y],...)
            axes[x,y].plot(data[y], data[x], **kwargs)

    # Label the diagonal subplots...
    if not names:
        names = ['x'+str(i) for i in range(numvars)]

    for i, label in enumerate(names):
        axes[i,i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                ha='center', va='center')

    # Turn on the proper x or y axes ticks.
    for i, j in zip(range(numvars), itertools.cycle((-1, 0))):
        axes[j,i].xaxis.set_visible(True)
        axes[i,j].yaxis.set_visible(True)

    # FIX #2: if numvars is odd, the bottom right corner plot doesn't have the
    # correct axes limits, so we pull them from other axes
    if numvars%2:
        xlimits = axes[0,-1].get_xlim()
        ylimits = axes[-1,0].get_ylim()
        axes[-1,-1].set_xlim(xlimits)
        axes[-1,-1].set_ylim(ylimits)

    return fig
if __name__ == '__main__':
    # Demo run only when executed as a script: four random variables with
    # ten samples each, drawn as unfilled black circles.
    np.random.seed(1977)
    numvars, numdata = 4, 10
    data = 10 * np.random.random((numvars, numdata))
    marker_style = dict(linestyle='none', marker='o', color='black',
                        mfc='none')
    fig = scatterplot_matrix(data, ['mpg', 'disp', 'drat', 'wt'],
                             **marker_style)
    fig.suptitle('Simple Scatterplot Matrix')
    plt.show()
Thanks again for sharing this with us. I have used it many times! Oh, and I re-arranged the main() part of the code so that it can be a formal example code or not get called if it is being imported into another piece of code.
While reading the question I expected to see an answer including rpy. I think this is a nice option taking advantage of two beautiful languages. So here it is:
import rpy
import numpy as np
def main():
    """Draw an R pairs plot of four random variables as EPS and as PNG."""
    np.random.seed(1977)
    numvars, numdata = 4, 10
    data = 10 * np.random.random((numvars, numdata))

    mpg = data[0,:]
    disp = data[1,:]
    drat = data[2,:]
    wt = data[3,:]

    rpy.set_default_mode(rpy.NO_CONVERSION)
    R_data = rpy.r.data_frame(mpg=mpg,disp=disp,drat=drat,wt=wt)

    plot_title = "Simple Scatterplot Matrix Via RPy"
    # Same plot on two R graphics devices: first EPS, then PNG.
    for device_name, path in (('postscript', 'pairsPlot.eps'),
                              ('png', 'pairsPlot.png')):
        open_device = getattr(rpy.r, device_name)
        open_device(path)
        rpy.r.pairs(R_data, main=plot_title)
        rpy.r.dev_off()

    rpy.set_default_mode(rpy.BASIC_CONVERSION)
if __name__ == '__main__': main()
I can't post an image to show the result :( sorry!

Categories