I use a bar graph to indicate the data of each group. Some of these bars differ significantly from each other. How can I indicate the significant difference in the bar plot?
import numpy as np
import matplotlib.pyplot as plt
menMeans = (5, 15, 30, 40)
menStd = (2, 3, 4, 5)
ind = np.arange(4) # the x locations for the groups
width=0.35
p1 = plt.bar(ind, menMeans, width=width, color='r', yerr=menStd)
plt.xticks(ind+width/2., ('A', 'B', 'C', 'D') )
I am aiming for
The answer above inspired me to write a small but flexible function myself:
def barplot_annotate_brackets(num1, num2, data, center, height, yerr=None, dh=.05, barh=.05, fs=None, maxasterix=None):
"""
Annotate barplot with p-values.
:param num1: number of left bar to put bracket over
:param num2: number of right bar to put bracket over
:param data: string to write or number for generating asterixes
:param center: centers of all bars (like plt.bar() input)
:param height: heights of all bars (like plt.bar() input)
:param yerr: yerrs of all bars (like plt.bar() input)
:param dh: height offset over bar / bar + yerr in axes coordinates (0 to 1)
:param barh: bar height in axes coordinates (0 to 1)
:param fs: font size
:param maxasterix: maximum number of asterixes to write (for very small p-values)
"""
if type(data) is str:
text = data
else:
# * is p < 0.05
# ** is p < 0.005
# *** is p < 0.0005
# etc.
text = ''
p = .05
while data < p:
text += '*'
p /= 10.
if maxasterix and len(text) == maxasterix:
break
if len(text) == 0:
text = 'n. s.'
lx, ly = center[num1], height[num1]
rx, ry = center[num2], height[num2]
if yerr:
ly += yerr[num1]
ry += yerr[num2]
ax_y0, ax_y1 = plt.gca().get_ylim()
dh *= (ax_y1 - ax_y0)
barh *= (ax_y1 - ax_y0)
y = max(ly, ry) + dh
barx = [lx, lx, rx, rx]
bary = [y, y+barh, y+barh, y]
mid = ((lx+rx)/2, y+barh)
plt.plot(barx, bary, c='black')
kwargs = dict(ha='center', va='bottom')
if fs is not None:
kwargs['fontsize'] = fs
plt.text(*mid, text, **kwargs)
which allows me to get some nice annotations relatively simple, e.g.:
heights = [1.8, 2, 3]
bars = np.arange(len(heights))
plt.figure()
plt.bar(bars, heights, align='center')
plt.ylim(0, 5)
barplot_annotate_brackets(0, 1, .1, bars, heights)
barplot_annotate_brackets(1, 2, .001, bars, heights)
barplot_annotate_brackets(0, 2, 'p < 0.0075', bars, heights, dh=.2)
I've done a couple of things here that I suggest when working with complex plots. Pull out the custom formatting into a dictionary, it makes life simple when you want to change a parameter - and you can pass this dictionary to multiple plots. I've also written a custom function to annotate the itervalues, as a bonus it can annotate between (A,C) if you really want to (I stand by my comment that this isn't the right visual approach however). It may need some tweaking once the data changes but this should put you on the right track.
import numpy as np
import matplotlib.pyplot as plt
menMeans = (5, 15, 30, 40)
menStd = (2, 3, 4, 5)
ind = np.arange(4) # the x locations for the groups
width= 0.7
labels = ('A', 'B', 'C', 'D')
# Pull the formatting out here
bar_kwargs = {'width':width,'color':'y','linewidth':2,'zorder':5}
err_kwargs = {'zorder':0,'fmt':None,'linewidth':2,'ecolor':'k'} #for matplotlib >= v1.4 use 'fmt':'none' instead
fig, ax = plt.subplots()
ax.p1 = plt.bar(ind, menMeans, **bar_kwargs)
ax.errs = plt.errorbar(ind, menMeans, yerr=menStd, **err_kwargs)
# Custom function to draw the diff bars
def label_diff(i,j,text,X,Y):
x = (X[i]+X[j])/2
y = 1.1*max(Y[i], Y[j])
dx = abs(X[i]-X[j])
props = {'connectionstyle':'bar','arrowstyle':'-',\
'shrinkA':20,'shrinkB':20,'linewidth':2}
ax.annotate(text, xy=(X[i],y+7), zorder=10)
ax.annotate('', xy=(X[i],y), xytext=(X[j],y), arrowprops=props)
# Call the function
label_diff(0,1,'p=0.0370',ind,menMeans)
label_diff(1,2,'p<0.0001',ind,menMeans)
label_diff(2,3,'p=0.0025',ind,menMeans)
plt.ylim(ymax=60)
plt.xticks(ind, labels, color='k')
plt.show()
If you are using matplotlib and seeking boxplot annotation, use my code as a function:
statistical annotation
def AnnoMe(x1, x2, ARRAY, TXT):
y, h, col = max(max(ARRAY[x1-1]),max(ARRAY[x2-1])) + 2, 2, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, TXT, ha='center', va='bottom', color=col)
where 'x1' and 'x2' are two columns you want to compare, 'ARRAY' is the list of lists you are using for illustrating the boxplot. And, 'TXT' is your text like p-value or significant/not significant in string format.
Accordingly, call it with:
AnnoMe(1, 2, MyArray, "p-value=0.02")
Grouped bar plot from pandas dataframe
Annotate significant difference between bars
I have modified the solution of #cheersmate in order to receive in input also pandas dataframes. This function is tested with matplotlib 3.5.1
def annotate_barplot_dataframe(bar0, bar1, text, patches, dh=0.2):
"""Annotate a grouped barplot from a pandas dataframe
An annotation is added to the figure from bar0 to bar1
Args:
bar0 (int): index of first bar
bar1 (int): index of second bar
text (string): what to write on the annotation
patches (matplotlib.patches): data source
df (float): height of the annotation bar
"""
patches.sort(key=lambda x: x.xy[0])
left = patches[bar0]
right = patches[bar1]
y = max(left._height, right._height) + dh
l_bbox = left.get_bbox()
l_mid = l_bbox.x1 - left._width / 2
r_bbox = right.get_bbox()
r_mid = r_bbox.x1 - right._width / 2
barh = 0.07
# lower-left, upper-left, upper-right, lower-right
barx = [l_mid, l_mid, r_mid, r_mid]
bary = [
y,
y + barh,
y + barh,
y,
]
plt.plot(barx, bary, c="black")
kwargs = dict(ha="center", va="bottom")
mid = ((l_mid + r_mid) / 2, y + 0.01)
plt.text(*mid, text, **kwargs)
def prepare_df(filename):
"""load filename is exists and prepare it for the plot
Args:
filename (string): must be a .xlsx file
Returns:
pandas.df: grouped dataframe
"""
assert filename.endswith("xlsx"), "Check file extension"
try:
df = pd.read_excel(filename, sheet_name=0, usecols="H:W", engine="openpyxl")
except Exception as e:
raise ValueError(e)
# Columnkey is the variable by which we want to group
# e.g. in this example columnskey's entries have 3 different values
grouped = df.groupby(df["Columnkey"])
df_group1 = grouped.get_group(1)
df_group2 = grouped.get_group(2)
df_group3 = grouped.get_group(3)
g = pd.concat(
[
df_group1.mean().rename("C1"),
df_group2.mean().rename("C2"),
df_group3.mean().rename("C3"),
],
axis=1,
)
return g
So the input to the function should look something like this.
if __name__ == "__main__":
filename = "Data.xlsx"
dataframe = prepare_df(filename)
width = 0.7
ax = dataframe.plot.bar(width=width, figsize=(9, 2))
# this plot will group in sets of 3
patches = ax.patches._axes.axes.containers[0].patches
patches.extend(ax.patches._axes.axes.containers[1].patches)
patches.extend(ax.patches._axes.axes.containers[2].patches)
annotate_barplot_dataframe(0, 1, "*", patches, 0.1)
annotate_barplot_dataframe(1, 2, "*", patches, 0.1)
plt.savefig(fname="filename.pdf", bbox_inches="tight")
plt.show()
The outcome will save to disk a picture like
Related
I've got a script wherein I have two functions, makeplots() which makes a figure of blank subplots arranged in a particular way (depending on the number of subplots to be drawn), and drawplots() which is called later, drawing the plots (obviously). The functions are posted below.
The script does some analysis of data for a given number of 'targets' (which can number anywhere from one to nine) and creates plots of the linear regression for each target. When there are multiple targets, this works great. But when there's a single target (i.e. a single 'subplot' in the figure), the Y-axis label overlaps the axis itself (this does not happen when there are multiple targets).
Ideally, each subplot would be square, no labels would overlap, and it would work the same for one target as for multiple targets. But when I tried to decrease the size of the y-axis label and shift it over a bit, it appears that the actual axes object was drawn over the previously blank, square plot (whose axes ranged from 0 to 1), and the old tick mark labels are still visible. I'd like to have those old tick marks removed when calling drawplots(). I've tried changing the subplot_kw={} arguments in makeplots, as well as removing ax.set_aspect('auto') from drawplots, both to no avail. Note that there are also screenshots of various behaviors at the end, also.
def makeplots(targets, active=actwindow):
def rowcnt(y):
rownumb = y//3 if (y%3 == 0) else y//3+1
return rownumb
def colcnt(x):
if x <= 3: colnumb = x
elif x == 4: colnumb = 2
else: colnumb = 3
return colnumb
numsubs = len(targets)
numrow, numcol = rowcnt(numsubs), colcnt(numsubs)
if numsubs >= 1:
if numsubs == 1:
fig, axs = plt.subplots(num='LOD-95 Plots', nrows=1, ncols=1, figsize = [8,6], subplot_kw={'adjustable': 'box', 'aspect': 1})
# changed 'box' to 'datalim'
fig, axs = plt.subplots(num='LOD-95 Plots', nrows=numrow, ncols=numcol, figsize = [numcol*6,numrow*6], subplot_kw={'adjustable': 'box', 'aspect': 1})
fig.text(0.02, 0.5, 'Probit score\n $(\sigma + 5)$', va='center', rotation='vertical', size='16')
else:
raise ValueError(f'Error generating plots [call: makeplots({targets},{active}) - invalid numsubs value]')
axs = np.ravel(axs)
for i, ax in enumerate(axs):
ax.set_title(f'Limit of Detection: {targets[i]}', size=11)
ax.grid()
return fig, axs
and
def drawplots(ax, dftables, color1, color2):
y = dftables.probit
y95 = 6.6448536269514722
logreg = False
regfun = lambda m, x, b : (m*x) + b
regq = scipy.stats.linregress(dftables.qty,y)
regl = scipy.stats.linregress(dftables.log_qty,y)
if regq.rvalue**2 >= regl.rvalue**2:
regression = regq
x_label = 'input quantity'
x = dftables.qty
elif regq.rvalue**2 < regl.rvalue**2:
regression = regl
x_label = '$log_{10}$(input quantity)'
x = dftables.log_qty
logreg = True
slope, intercept, r = regression.slope, regression.intercept, regression.rvalue
r2 = r**2
lod = (y95-intercept)/slope
xr = [0, lod*1.2]
yr = [intercept, regfun(slope, xr[1], intercept)]
regeqn = "y = "+str(f"{slope:.2e}")+"x + "+str(f"{intercept:.3f}")
if logreg:
lodstr = f'log(LOD) = {lod:.2f}' if lod <= 100 else f'log(LOD) = {lod:.2e}'
elif not logreg:
lodstr = f'LOD = {lod:.2f}' if lod <= 100 else f'LOD = {lod:.2e}'
# raise ValueError(f'Error raised calling drawplots()')
ax.set_xlabel(x_label, fontweight='bold')
ax.plot(xr, yr, color=color1, linestyle='--') # plot regression line
ax.plot(lod, y95, marker='D', color=color2, markersize=7) # plot point for LoD
ax.plot(xr, [y95,y95], color=color2, linestyle=':') # horizontal crosshair
ax.plot([lod,lod],[0, 7.1], color=color2, linestyle=':') # vertical crosshair
ax.scatter(x, y, s=81, color=color1, marker='.') # actual data points
ax.annotate(f"{lodstr}", xy=(lod,0.1),
xytext=(0.9*lod,0.5), fontsize=8, arrowprops = dict(facecolor='black', headlength=5, width=2, headwidth=5))
ax.set_aspect('auto')
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
ax.plot()
if logreg: lod = 10 ** lod
return r2, lod, regeqn, logreg
The context they're called in:
fig, axs = makeplots(targets)
wg.SetForegroundWindow(actwindow)
with open(outName, 'a+') as f:
print(f"Lower Limit of Detection Analysis on {dt} at {tm}\n", file=f)
for i, tars in enumerate(targets):
data[tars] = stripThousands(data[tars])
# logans = checkyn(f"Analyze {tars} using log10(concentration/quantity)? (y/n): ")
for idx, val in enumerate(qtys):
tables[i,idx,2] = hitrate(val,data,tars)
tables[i,idx,3] = norm.ppf(tables[i,idx,2])+5
printtables[tars] = pd.DataFrame(tables[i,:,:], columns=["qty","log_qty","probability","probit"])
# construct dataframes from np.arrays and drop
# rows with infinite probit values:
dftables[tars] = pd.DataFrame(tables[i,:,:], columns=["qty","log_qty","probability","probit"])
dftables[tars].probit.replace([np.inf,-np.inf],np.nan, inplace=True)
dftables[tars].dropna(inplace=True)
r2, lod, eqn, logreg = drawplots(axs[i], dftables[tars], cbcolors[i], cbcolors[i+5])
You should clear the axes in each iteration using pyplot.cla().
You posted a lot of code, so I'm not 100% sure of the best location to place it in your code, but the general idea is to clear the axes before each new plot.
Here is a minimal demo without cla():
x = [[1,2,3], [3,2,1]]
fig, ax = plt.subplots()
for index, data in enumerate(x):
ax.plot(data)
And with cla():
for index, data in enumerate(x):
ax.cla()
ax.plot(data)
On Python, using the Pandas library, I'm trying to generate the scatter plot of a DataFrame using scatter_matrix as follows:
scatter_matrix(df, alpha=0.5, figsize=(14,14), diagonal='kde')
That program takes very long to run and eventually crashes, possibly because there are too many (26) columns, and the resulting image would be to big. Nether less, I noticed I'm able to render 13 variables just fine. That way, one solution would be to generate 4 plots instead, one for each quadrant of the resulting scatter matrix, i.e., the ranges [[0,0],[13,13]], [[13,0],[26,13]], [[0,13],[13,26]], [[13,13],[26,26]]. Notice those don't refer to ranges on the source DataFrame, but of the target scatter matrix I'm rendering. Is it possible?
I couldn't find any official way to do it, so I modified the scatter_matrix implementation to receive 2 additional params, cols and rows, which are arrays with the labels you want to compare:
"""
This is a modification of the scatter_matrix method;
it allows plotting sub-sections of the scatter plot
matrix. With the official method you can only plot
the entire matrix. This method allows selecting the
`cols` and `rows` you're interested in plotting.
Note that this wouldn't be possible even by calling
scatter_matrix on a subset of the dataframe, because
this wouldn't allow comparing different `cols`/`rows'.
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas
import pandas.tools.plotting
from pandas.compat import range, lrange, lmap, map, zip, string_types
def scatter_matrix(frame, cols, rows, figsize=None, ax=None, grid=False,
diagonal='hist', marker='.', density_kwds=None,
hist_kwds=None, range_padding=0.05, **kwds):
"""
Draw a matrix of scatter plots.
Parameters
----------
frame : DataFrame
cols : [str]
labels of the columns to be rendered
rows: [str]
labels of the rows to be rendered
alpha : float, optional
amount of transparency applied
figsize : (float,float), optional
a tuple (width, height) in inches
ax : Matplotlib axis object, optional
grid : bool, optional
setting this to True will show the grid
diagonal : {'hist', 'kde'}
pick between 'kde' and 'hist' for
either Kernel Density Estimation or Histogram
plot in the diagonal
marker : str, optional
Matplotlib marker type, default '.'
hist_kwds : other plotting keyword arguments
To be passed to hist function
density_kwds : other plotting keyword arguments
To be passed to kernel density estimate plot
range_padding : float, optional
relative extension of axis range in x and y
with respect to (x_max - x_min) or (y_max - y_min),
default 0.05
kwds : other plotting keyword arguments
To be passed to scatter function
Examples
--------
>>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
>>> scatter_matrix(df, alpha=0.2)
"""
import matplotlib.pyplot as plt
df = frame._get_numeric_data()
w = len(cols)
h = len(rows)
naxes = w * h
fig, axes = pandas.tools.plotting._subplots(naxes=naxes, figsize=figsize, ax=ax,
squeeze=False)
# no gaps between subplots
fig.subplots_adjust(wspace=0, hspace=0)
#mask = pandas.tools.plotting.notnull(df)
marker = pandas.tools.plotting._get_marker_compat(marker)
hist_kwds = hist_kwds or {}
density_kwds = density_kwds or {}
# workaround because `c='b'` is hardcoded in matplotlibs scatter method
#kwds.setdefault('c', plt.rcParams['patch.facecolor'])
cols_boundaries_list = []
for a in cols:
values = df[a]#.values[mask[a].values]
rmin_, rmax_ = np.min(values), np.max(values)
rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
cols_boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))
rows_boundaries_list = []
for a in rows:
values = df[a]#.values[mask[a].values]
rmin_, rmax_ = np.min(values), np.max(values)
rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
rows_boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))
for i, a in zip(lrange(w), cols):
for j, b in zip(lrange(h), rows):
ax = axes[i, j]
if cols[i] == rows[j]:
values = df[a]#.values[mask[a].values]
# Deal with the diagonal by drawing a histogram there.
if diagonal == 'hist':
ax.hist(values, **hist_kwds)
elif diagonal in ('kde', 'density'):
from scipy.stats import gaussian_kde
y = values
gkde = gaussian_kde(y)
ind = np.linspace(y.min(), y.max(), 1000)
ax.plot(ind, gkde.evaluate(ind), **density_kwds)
ax.set_xlim(cols_boundaries_list[i])
else:
#common = (mask[a] & mask[b]).values
ax.scatter(df[b], df[a], marker=marker, **kwds)
ax.set_xlim(rows_boundaries_list[j])
ax.set_ylim(cols_boundaries_list[i])
ax.set_xlabel(b)
ax.set_ylabel(a)
if j != 0:
ax.yaxis.set_visible(False)
if i != w - 1:
ax.xaxis.set_visible(False)
# what is that for?
#if len(df.columns) > 1:
#lim1 = cols_boundaries_list[0]
#locs = axes[0][1].yaxis.get_majorticklocs()
#locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
#adj = (locs - lim1[0]) / (lim1[1] - lim1[0])
#lim0 = axes[0][0].get_ylim()
#adj = adj * (lim0[1] - lim0[0]) + lim0[0]
#axes[0][0].yaxis.set_ticks(adj)
#if np.all(locs == locs.astype(int)):
## if all ticks are int
#locs = locs.astype(int)
#axes[0][0].yaxis.set_ticklabels(locs)
pandas.tools.plotting._set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)
return axes
I was quite careless modifying that code so it is probably broken but served my needs.
The 4 quadrants will be:
mid_ind = len(df.index)//2
mid_col = len(df.columns)//2
df.iloc[:mid_ind,:mid_col]
df.iloc[mid_ind:,:mid_col]
df.iloc[mid_ind:,mid_col:]
df.iloc[:mid_ind,mid_col:]
Working on a personal project that draws two lines, each dashed (ls='--') for the first two x-axis markings, then it is a solid line...thinking about writing a tutorial since I've found no information on this. Anyhow, the trick I'm stumped at, is to figure how many points are used to make the line for the first two x-axis markings, so I can properly turn off the solid line up to that point. I'm using the Line.set_dashes() method to turn off the solid line, and I'm making an individual (non-connected) copy and setting the linestyle to dash. This causes the lines to be drawn on top of each other, and the solid to take precedence when ON. However, the Line.set_dashes() takes "points" as arguments. I figured out where, but as you see, the second line has different angles, thus length, so this point is further along the line. Maybe there's a better way to set the line to two styles?
Here is an example plot --> https://flic.kr/p/rin6Z5
r = getPostData(wall)
if len(newTimes) < LIMIT:
LIMIT = len(newTimes)
yLim = int(round(max(r['Likes'].max(), r['Shares'].max()) * 1.2))
xLim = LIMIT
L1A = plt.Line2D(range(LIMIT), r['Likes'], color='b', ls='--')
L1B = plt.Line2D(range(LIMIT), r['Likes'], label='Likes', color='b')
L2A = plt.Line2D(range(LIMIT), r['Shares'], color='r', ls='--')
L2B = plt.Line2D(range(LIMIT), r['Shares'], label='Shares', color='r')
LNull = plt.Line2D(range(LIMIT), r['Shares'], ls='--', label='Recent Data\n(Early collection)', color='k')
dashes = [1,84,7000,1]
dashesNull=[1,7000]
fig = plt.figure()
ax = fig.add_subplot(111, ylim=(0,yLim), xlim=(0,xLim))
ax.add_line(L1A)
ax.add_line(L1B)
ax.add_line(L2A)
ax.add_line(L2B)
ax.add_line(LNull)
ax.legend(bbox_to_anchor=(1.5,1))
L1B.set_dashes(dashes)
L2B.set_dashes(dashes)
LNull.set_dashes(dashesNull)
I would write your self a helper function, something like:
import numpy as np
import matplotlib.pyplot as plt
def split_plot(ax, x, y, low, high, inner_style, outer_style):
"""
Split styling of line based on the x-value
Parameters
----------
x, y : ndarray
Data, must be same length
low, high : float
The low and high threshold values, points for `low < x < high` are
styled using `inner_style` and points for `x < low or x > high` are
styled using `outer_style`
inner_style, outer_style : dict
Dictionary of styles that can be passed to `ax.plot`
Returns
-------
lower, mid, upper : Line2D
The artists for the lower, midddle, and upper ranges
vline_low, vline_hi : Line2D
Vertical lines at the thresholds
hspan : Patch
Patch over middle region
"""
low_mask = x < low
high_mask = x > high
mid_mask = ~np.logical_or(low_mask, high_mask)
low_mask[1:] |= low_mask[:-1]
high_mask[:-1] |= high_mask[1:]
lower, = ax.plot(x[low_mask], y[low_mask], **outer_style)
mid, = ax.plot(x[mid_mask], y[mid_mask], **inner_style)
upper, = ax.plot(x[high_mask], y[high_mask], **outer_style)
# add vertical lines
vline_low = ax.axvline(low, color='k', ls='--')
vline_high = ax.axvline(high, color='k', ls='--')
hspan = ax.axvspan(low, high, color='b', alpha=.25)
return lower, mid, upper, vline_low, vline_high, hspan
Which can obviously be generalized to take 3 line style dictionaries and style information for the vertical lines and the span. You use it like:
inner_style = {'color': 'r', 'lw': 5, 'ls':'--'}
outer_style = {'color': 'r', 'lw': 1, 'ls':'-'}
x = np.linspace(0, 2*np.pi, 1024)
y = np.sin(x)
low = np.pi / 2
high = 3*np.pi / 2
fig, ax = plt.subplots()
lower, mid, upper, vl_low, vl_high, hsp = split_plot(ax, x, y, low, high, inner_style, outer_style)
plt.show()
i thought so too, anyhow, the resulting code is now...
def splitLine(ax, x, y, splitNum, style1, style2):
'''Creates a two styled line given;
ax = an axis
x = an array of x coordinates for 2D Line
y = an array of y coordinates for 2D Line
splitNum = index number to split Line by x tick
style1 = dictionary for left part of Line
style2 = dictionary for right part of Line
'''
split = x[splitNum]
low_mask = x <= split
upper_mask = x >= split
lower, = ax.plot(x[low_mask], y[low_mask], **style1)
upper, = ax.plot(x[upper_mask], y[upper_mask], **style2)
return lower, upper
r = getPostData(wall)
earlyLike = {'color': 'r', 'lw': 1, 'ls': '--'}
agedLike = {'color': 'r', 'lw': 2, 'ls': '-', 'label': 'Likes'}
earlyShare = {'color': 'b', 'lw': 1, 'ls': '--'}
agedShare = {'color': 'b', 'lw': 2, 'ls': '-', 'label': 'Shares'}
fig, ax = plt.subplots()
splitLine(ax, np.array(range(LIMIT)), np.array(r['Likes']), 1, earlyLike, agedLike)
splitLine(ax, np.array(range(LIMIT)), np.array(r['Shares']), 1, earlyShare, agedShare)
I have a bar plot drawn in matplotlib as such:
The x-ticks do not span the entire range of x axis. How do I make that happen?
My code is here:
def counter_proportions(counter):
total = sum(counter.values())
proportions = dict()
for key, value in counter.items():
proportions[key] = float(value)/float(total)
return proportions
def categorical_counter_xlabels(counter):
idxs = dict()
for i, key in enumerate(counter.keys()):
idxs[key] = i
return idxs
# Use this dummy data
detailed_hosts = ['Species1' * 3, 'Species2' * 1000, 'Species3' * 20, 'Species4' * 20]
# Create a detailed version of the counter, which includes the exact species represented.
detailed_hosts = []
counts = Counter(detailed_hosts)
props = counter_proportions(counts)
xpos = categorical_counter_xlabels(counts)
fig = plt.figure(figsize=(16,10))
ax = fig.add_subplot(111)
plt.bar(xpos.values(), props.values(), align='center')
plt.xticks(xpos.values(), xpos.keys(), rotation=90)
plt.xlabel('Host Species')
plt.ylabel('Proportion')
plt.title("Proportion of Reassortant Viruses' Host Species")
plt.savefig('Proportion of Reassortant Viruses Host Species.pdf', bbox_inches='tight')
Manual bar spacing
You can gain manual control over where the locations of your bars are positioned (e.g. spacing between them), you did that but with a dictionary - instead try doing it with a list of integers.
Import scipy
xticks_pos = scipy.arange( len( counts.keys() )) +1
plt.bar( xticks_pos, props.values(), align='center')
If you lack scipy and cannot be bothered to install it, this is what arange() produces:
In [5]: xticks_pos
Out[5]: array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
Controlling the margins
Above deals with spacing between bars, and as #JoeKington mentioned in comments the other parts you can control (e.g. if you do not want to control spacing and instead want to restrict margins, etc.):
plt.axis('tight')
plt.margins(0.05, 0)
plt.xlim(x.min() - width, x.max() + width))
I know pandas supports a secondary Y axis, but I'm curious if anyone knows a way to put a tertiary Y axis on plots. Currently I am achieving this with numpy+pyplot, but it is slow with large data sets.
This is to plot different measurements with distinct units on the same graph for easy comparison (eg: Relative Humidity/Temperature/ and Electrical Conductivity).
So really just curious if anyone knows if this is possible in pandas without too much work.
[Edit] I doubt that there is a way to do this(without too much overhead) however I hope to be proven wrong, as this may be a limitation of matplotlib.
I think this might work:
import matplotlib.pyplot as plt
import numpy as np
from pandas import DataFrame
df = DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
fig, ax = plt.subplots()
ax3 = ax.twinx()
rspine = ax3.spines['right']
rspine.set_position(('axes', 1.15))
ax3.set_frame_on(True)
ax3.patch.set_visible(False)
fig.subplots_adjust(right=0.7)
df.A.plot(ax=ax, style='b-')
# same ax as above since it's automatically added on the right
df.B.plot(ax=ax, style='r-', secondary_y=True)
df.C.plot(ax=ax3, style='g-')
# add legend --> take advantage of pandas providing us access
# to the line associated with the right part of the axis
ax3.legend([ax.get_lines()[0], ax.right_ax.get_lines()[0], ax3.get_lines()[0]],\
['A','B','C'], bbox_to_anchor=(1.5, 0.5))
Output:
A simpler solution without plt:
ax1 = df1.plot()
ax2 = ax1.twinx()
ax2.spines['right'].set_position(('axes', 1.0))
df2.plot(ax=ax2)
ax3 = ax1.twinx()
ax3.spines['right'].set_position(('axes', 1.1))
df3.plot(ax=ax3)
....
Using function to achieve this:
def plot_multi(data, cols=None, spacing=.1, **kwargs):
from pandas.plotting._matplotlib.style import get_standard_colors
# Get default color style from pandas - can be changed to any other color list
if cols is None: cols = data.columns
if len(cols) == 0: return
colors = get_standard_colors(num_colors=len(cols))
# First axis
ax = data.loc[:, cols[0]].plot(label=cols[0], color=colors[0], **kwargs)
ax.set_ylabel(ylabel=cols[0])
lines, labels = ax.get_legend_handles_labels()
for n in range(1, len(cols)):
# Multiple y-axes
ax_new = ax.twinx()
ax_new.spines['right'].set_position(('axes', 1 + spacing * (n - 1)))
data.loc[:, cols[n]].plot(ax=ax_new, label=cols[n], color=colors[n % len(colors)], **kwargs)
ax_new.set_ylabel(ylabel=cols[n])
# Proper legend position
line, label = ax_new.get_legend_handles_labels()
lines += line
labels += label
ax.legend(lines, labels, loc=0)
return ax
Example:
from random import randrange
data = pd.DataFrame(dict(
s1=[randrange(-1000, 1000) for _ in range(100)],
s2=[randrange(-100, 100) for _ in range(100)],
s3=[randrange(-10, 10) for _ in range(100)],
))
plot_multi(data.cumsum(), figsize=(10, 5))
Output:
I modified the above answer a bit to make it accept custom x column, well-documented, and more flexible.
You can copy this snippet and use it as a function:
from typing import List, Union
import matplotlib.axes
import pandas as pd
def plot_multi(
data: pd.DataFrame,
x: Union[str, None] = None,
y: Union[List[str], None] = None,
spacing: float = 0.1,
**kwargs
) -> matplotlib.axes.Axes:
"""Plot multiple Y axes on the same chart with same x axis.
Args:
data: dataframe which contains x and y columns
x: column to use as x axis. If None, use index.
y: list of columns to use as Y axes. If None, all columns are used
except x column.
spacing: spacing between the plots
**kwargs: keyword arguments to pass to data.plot()
Returns:
a matplotlib.axes.Axes object returned from data.plot()
Example:
>>> plot_multi(df, figsize=(22, 10))
>>> plot_multi(df, x='time', figsize=(22, 10))
>>> plot_multi(df, y='price qty value'.split(), figsize=(22, 10))
>>> plot_multi(df, x='time', y='price qty value'.split(), figsize=(22, 10))
>>> plot_multi(df[['time price qty'.split()]], x='time', figsize=(22, 10))
See Also:
This code is mentioned in https://stackoverflow.com/q/11640243/2593810
"""
from pandas.plotting._matplotlib.style import get_standard_colors
# Get default color style from pandas - can be changed to any other color list
if y is None:
y = data.columns
# remove x_col from y_cols
if x:
y = [col for col in y if col != x]
if len(y) == 0:
return
colors = get_standard_colors(num_colors=len(y))
if "legend" not in kwargs:
kwargs["legend"] = False # prevent multiple legends
# First axis
ax = data.plot(x=x, y=y[0], color=colors[0], **kwargs)
ax.set_ylabel(ylabel=y[0])
lines, labels = ax.get_legend_handles_labels()
for i in range(1, len(y)):
# Multiple y-axes
ax_new = ax.twinx()
ax_new.spines["right"].set_position(("axes", 1 + spacing * (i - 1)))
data.plot(
ax=ax_new, x=x, y=y[i], color=colors[i % len(colors)], **kwargs
)
ax_new.set_ylabel(ylabel=y[i])
# Proper legend position
line, label = ax_new.get_legend_handles_labels()
lines += line
labels += label
ax.legend(lines, labels, loc=0)
return ax
Here's one way to use it:
plot_multi(df, x='time', y='price qty value'.split(), figsize=(22, 10))