Categorical data in subplots - python

I have a pandas dataframe containing 20 columns containing a mixture of numeric and categorical data. I want to plot a 5x4 matrix of plots of the data. Using matplotlib and subplots I now have plots for all the numeric data but for the life of me I can't work out how to include the categorical data.
I want something like
df['RBC'].value_counts().plot(kind='bar')
But in a subplot.
Here is some of the code (I've omitted some of the repetitions for brevity).
from rdkit.Chem import AllChem as Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
import pybel
import pandas as pd
import matplotlib
import matplotlib.pyplot as p
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
%matplotlib inline
#commandline application to calculate properties
output = !/Applications/ChemAxon/MarvinBeans/bin/evaluate /Users/username/Desktop/SampleFiles/Fragments.sdf -g -e "field('IDNUMBER'); molString('smiles'); logp(); logd('7.4'); apka('1'); bpka('1'); atomCount(); mass(); acceptorcount(); donorcount(); topologicalPolarSurfaceArea(); rotatablebondcount(); refractivity(); ASAHydrophobic('7.4'); ASAPolar('7.4'); atomCount()-atomCount('1');aromaticAtomCount()/(atomCount()-atomCount('1'))"
[line.split(';') for line in output]
cols = ['ID', 'smiles', 'logP', 'logD', 'apKa', 'bpKa', 'atomCount', 'mass', 'HBA', 'HBD', 'TPSA', 'RBC', 'MR', 'ASAh', 'ASAp', 'HAC', 'FractionAromatic']
df = pd.DataFrame([line.split(';') for line in output], columns=cols)
df = df.convert_objects(convert_numeric=True)
#series of calculations using the calculated data to add several categorical numeric and text fields to dataframe.
myLogP = df['logP']
myLogD = df['logD']
myMass = df['mass']
myTPSA = df['TPSA']
myRBC = df['RBC']
myRBCmax = max(myRBC) +1
myHBA = df['HBA']
myHBAmax = max(myHBA) +1
myHBD = df['HBD']
myHBDmax = max(myHBD) +1
myHAC = df['HAC']
myHACmax= range(min(myHAC), max(myHAC) + 1)
myFraromatic = df['FractionAromatic']
# Build the 5 x 4 grid of subplots; axes.flat yields its 20 Axes row by row.
fig, axes = plt.subplots(nrows=5, ncols=4)
# The unpacking target must name exactly as many variables as there are Axes
# (20); leaving one out (ax16) raises "ValueError: too many values to unpack".
(ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9,
 ax10, ax11, ax12, ax13, ax14, ax15, ax16, ax17, ax18, ax19) = axes.flat
# Font settings shared by the axis labels and the subplot titles.
axis_font = {'fontname':'Arial', 'size':'14'}
title_font = {'fontname':'Arial', 'size':'14', 'color' :'blue'}
# Tick locator placing a tick at every multiple of 1.0.
loc = plticker.MultipleLocator(base=1.0)
ax0.hist(myLogP, histtype='bar')
ax0.set_title('LogP', title_font)
ax0.set_xlabel('Range of LogP', axis_font)
ax0.set_ylabel('Count')
ax1.hist(myLogD, histtype='bar')
ax1.set_title('LogD', title_font)
ax1.set_xlabel('Range of LogD', axis_font)
ax1.set_ylabel('Count', axis_font)
ax2.hist(myMass, histtype='bar', color = 'red')
ax2.set_title('Mass', title_font)
ax2.set_xlabel('Range of MWt', axis_font)
ax2.set_ylabel('Count', axis_font)
ax3.hist(myTPSA, histtype='bar', color = 'yellow')
ax3.set_title('TPSA', title_font)
ax3.set_xlabel('Range of TPSA', axis_font)
ax3.set_ylabel('Count', axis_font)
#etc.
#ax8 'AZBN' is a categorical text field
ax9.hist(myFraromatic, bins= 10, histtype='bar')
ax9.set_title('Aromatic', title_font)
ax9.set_xlabel('Fraction of Aromatic atoms', axis_font)
ax9.set_ylabel('Count', axis_font)
#further categorical plots
fig.set_size_inches(20, 15)
plt.tight_layout()
plt.show()

You should really post the code you've tried and some sample data. It's impossible to know the best approach otherwise. However, I think you might want to try the following approach, which uses the matplotlib API rather than pandas and gives you more control over what goes into each plot:
from matplotlib import pyplot as plt

fig, axes = plt.subplots(5, 4)  # axes is a numpy array of pyplot Axes
axes = iter(axes.ravel())  # set up an iterator for the set of axes.

# Split the column labels by dtype: 'category' columns vs. everything else.
categoricals = df.columns[df.dtypes == 'category']
numeric = df.columns[df.dtypes != 'category']

for col in categoricals:
    # Use the builtin next(): the iterator method .next() only exists in
    # Python 2 and raises AttributeError on Python 3.
    ax = df[col].value_counts().plot(kind='bar', ax=next(axes))
    # do other stuff with ax, formatting etc.
    # the plot method returns the axis used for the plot for further manipulation

for col in numeric:
    ax = df[col].plot(ax=next(axes))
    # etc.
This is just to give you ideas as I don't know the specifics of your data and how you want to plot each column, what kind of data types you have etc.

Related

Seaborn boxplot : set median color and set tick label colors to boxes color

I'm using this nice boxplot graph, answer from @Parfait.
I got an out of bound error on j and had to use range(i*5,i*5+5). Why?
I'd like to set the median to a particular color, let's say red. medianprops=dict(color="red") won't work. How to do it?
How to set the y-axis tick labels to the same color as the boxes?
Disclaimer: I don't know what I'm doing.
Here's the code using random data :
# import the required library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import matplotlib.colors as mc
import colorsys
# data
df = pd.DataFrame(np.random.normal(np.random.randint(5,15),np.random.randint(1,5),size=(100, 16)), columns=list(string.ascii_uppercase)[:16])
# Boxplot
fig, ax = plt.subplots(figsize=(9, 10))
medianprops=dict(color="red")
ax = sns.boxplot(data=df, orient="h", showfliers=False, palette = "husl")
ax = sns.stripplot(data=df, orient="h", jitter=True, size=7, alpha=0.5, palette = "husl") # show data points
ax.set_title("Title")
plt.xlabel("X label")
def lighten_color(color, amount=0.5):
    """Return *color* as an RGB tuple with its HLS lightness rescaled.

    The new lightness is ``1 - amount * (1 - lightness)``, so ``amount=1``
    leaves the colour unchanged, values below 1 lighten it (``amount=0``
    gives white) and values above 1 darken it.

    Parameters
    ----------
    color : a key of ``mc.cnames`` or anything ``mc.to_rgb`` accepts
    amount : float, scaling factor described above

    Returns
    -------
    tuple of three floats ``(r, g, b)``
    """
    # --------------------- SOURCE: #IanHincks ---------------------
    try:
        c = mc.cnames[color]
    except (KeyError, TypeError):
        # KeyError: not a named colour; TypeError: unhashable input such as a
        # list or array. Either way let to_rgb() interpret the raw value.
        c = color
    hue, lightness, saturation = colorsys.rgb_to_hls(*mc.to_rgb(c))
    # Clamp to the valid lightness range so a large (or negative) amount
    # cannot produce RGB components outside [0, 1].
    new_lightness = max(0.0, min(1.0, 1 - amount * (1 - lightness)))
    return colorsys.hls_to_rgb(hue, new_lightness, saturation)
for i,artist in enumerate(ax.artists):
# Set the linecolor on the artist to the facecolor, and set the facecolor to None
col = lighten_color(artist.get_facecolor(), 1.2)
artist.set_edgecolor(col)
# Each box has 6 associated Line2D objects (to make the whiskers, fliers, etc.)
# Loop over them here, and use the same colour as above
for j in range(i*5,i*5+5):
line = ax.lines[j]
line.set_color(col)
line.set_mfc(col)
line.set_mec(col)
#line.set_linewidth(0.5)
To change the color of the median, you can use the medianprops in sns.boxplot(..., medianprops=...). If you also set a unique label, that label can be tested again when iterating through the lines.
To know how many lines belong to each boxplot, you can divide the number of lines by the number of artists (just after the boxplot has been created, before other elements have been added to the plot). Note that a line potentially has 3 colors: the line color, the marker face color and the marker edge color. Matplotlib creates the fliers as an invisible line with markers. The code below thus also changes these colors to make it more robust to different options and possible future changes.
Looping simultaneously through the boxes and the y tick labels allows copying the color. Making them a bit larger and darker helps for readability.
import matplotlib.pyplot as plt
from matplotlib.colors import rgb_to_hsv, hsv_to_rgb, to_rgb
import seaborn as sns
import pandas as pd
import numpy as np
def enlighten(color, factor=0.5):
    """Return *color* with its HSV value moved towards 1.

    The new value is ``1 - factor * (1 - value)``: ``factor=1`` keeps the
    colour as is, ``factor=0`` pushes the value all the way to 1.
    Hue and saturation are left untouched.
    """
    hue, saturation, value = rgb_to_hsv(to_rgb(color))
    brighter_value = 1 - factor * (1 - value)
    return hsv_to_rgb((hue, saturation, brighter_value))
def endarken(color, factor=0.5):
    """Return *color* with its HSV value multiplied by *factor*.

    ``factor=1`` keeps the colour as is, ``factor=0`` gives a value of 0.
    Hue and saturation are left untouched.
    """
    hue, saturation, value = rgb_to_hsv(to_rgb(color))
    darker_value = factor * value
    return hsv_to_rgb((hue, saturation, darker_value))
df = pd.DataFrame(np.random.normal(1, 5, size=(100, 16)).cumsum(axis=0),
columns=['Hydrogen', 'Helium', 'Lithium', 'Beryllium', 'Boron', 'Carbon', 'Nitrogen', 'Oxygen',
'Fluorine', 'Neon', 'Sodium', 'Magnesium', 'Aluminum', 'Silicon', 'Phosphorus', 'Sulfur'])
sns.set_style('white')
fig, ax = plt.subplots(figsize=(9, 10))
colors = sns.color_palette("husl", len(df.columns))
sns.boxplot(data=df, orient="h", showfliers=False, palette='husl',
medianprops=dict(color="yellow", label='median'), ax=ax)
lines_per_boxplot = len(ax.lines) // len(ax.artists)
for i, (box, ytick) in enumerate(zip(ax.artists, ax.get_yticklabels())):
ytick.set_color(endarken(box.get_facecolor()))
ytick.set_fontsize(20)
color = enlighten(box.get_facecolor())
box.set_color(color)
for lin in ax.lines[i * lines_per_boxplot: (i + 1) * lines_per_boxplot]:
if lin.get_label() != 'median':
lin.set_color(color)
lin.set_markerfacecolor(color)
lin.set_markeredgecolor(color)
sns.stripplot(data=df, orient="h", jitter=True, size=7, alpha=0.5, palette='husl', ax=ax)
sns.despine(ax=ax)
ax.set_title("Title")
ax.set_xlabel("X label")
plt.tight_layout()
plt.show()
I just answer point 2. of my question.
After tinkering, I found this to work :
# Each box has 5 associated Line2D objects (the whiskers and median)
# Loop over them here, and use the same colour as above.
# This fragment belongs inside the outer `for i, artist in enumerate(ax.artists)`
# loop, which supplies `i` and `col`.
n=5 # this was for tinkering
for j in range(i*n, i*n+n):
    if j == i*n+4:
        # not the median: skip it explicitly instead of re-colouring whatever
        # `line` still held from the previous iteration
        continue
    ax.lines[j].set_color(col)
Again, I don't know what I'm doing. So someone more knowledgeable may provide a more valuable answer.
I removed the stripplot for better clarity.

Pointplot and Scatterplot in one figure but X axis is shifting

Hi I'm trying to plot a pointplot and scatterplot on one graph with the same dataset so I can see the individual points that make up the pointplot.
Here is the code I am using:
xlPath = r'path to data here'
df = pd.concat(pd.read_excel(xlPath, sheet_name=None),ignore_index=True)
sns.pointplot(data=df, x='ID', y='HM (N/mm2)', palette='bright', capsize=0.15, alpha=0.5, ci=95, join=True, hue='Layer')
sns.scatterplot(data=df, x='ID', y='HM (N/mm2)')
plt.show()
When I plot, for some reason the points from the scatterplot are offsetting one ID spot right on the x-axis. When I plot the scatter or the point plot separately, they each are in the correct ID spot. Why would plotting them on the same plot cause the scatterplot to offset one right?
Edit: Tried to make the ID column categorical, but that didn't work either.
Seaborn's pointplot creates a categorical x-axis while here the scatterplot uses a numerical x-axis.
Explicitly making the x-values categorical: df['ID'] = pd.Categorical(df['ID']), isn't sufficient, as the scatterplot still sees numbers. Changing the values to strings does the trick. To get them in the correct order, sorting might be necessary.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# first create some test data
df = pd.DataFrame({'ID': np.random.choice(np.arange(1, 49), 500),
'HM (N/mm2)': np.random.uniform(1, 10, 500)})
df['Layer'] = ((df['ID'] - 1) // 6) % 4 + 1
df['HM (N/mm2)'] += df['Layer'] * 8
df['Layer'] = df['Layer'].map(lambda s: f'Layer {s}')
# sort the values and convert the 'ID's to strings
df = df.sort_values('ID')
df['ID'] = df['ID'].astype(str)
fig, ax = plt.subplots(figsize=(12, 4))
sns.pointplot(data=df, x='ID', y='HM (N/mm2)', palette='bright',
capsize=0.15, alpha=0.5, ci=95, join=True, hue='Layer', ax=ax)
sns.scatterplot(data=df, x='ID', y='HM (N/mm2)', color='purple', ax=ax)
ax.margins(x=0.02)
plt.tight_layout()
plt.show()

Pandas hist subplots - adding colour bar for the colours of each histogram

I have the columns of a dataframe plotted as separate histogram subplots. For each subplot, I want the bars coloured according to the value in a separate list. I have managed this by making a cmap of it and manually cycling those colours, however, is there a way to add a colorbar to the side to show what values these colours belong to? This is what I have right now:
import pandas as pd
import matplotlib as mpl
from matplotlib.colors import rgb2hex
#reading in the data
df = pd.read_csv( "shortlist_temp.dat", sep='\t',header=(0), usecols=(range(1,13)))
#separate list of values
orig_star_teff = [4308.0, 5112.0, 4240.0, 4042.0, 4411.0, 4100.0, 4511.0, 4738.0, 4630.0, 4870.0, 4442.0, 4845.0]
#Colormapping the values. I did not like the result from the original values so I reduced by 4000.
orig_star_teff_norm = [i - 4000 for i in orig_star_teff]
orig_star_teff_norm = [float(i)/max(orig_star_teff_norm) for i in orig_star_teff_norm]
cmap = mpl.cm.plasma
color_list = cmap(orig_star_teff_norm)
color_list2 = [ rgb2hex(color_list[i,:]) for i in range(color_list.shape[0]) ]
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color = color_list2)
ax = df.plot.hist(subplots=True, bins = 12, legend=False, layout=(3, 4), figsize = (15,10), sharey = True)
ax[0,0].set_title('ABOO')
ax[0,1].set_title('EpsVIR')
ax[0,2].set_title('HIP 96014')
ax[0,3].set_title('2M16113361')
ax[1,0].set_title('KIC 3955590')
ax[1,1].set_title('KIC 5113061')
ax[1,2].set_title('KIC 5859492')
ax[1,3].set_title('KIC 6547007')
ax[2,0].set_title('KIC 11444313')
ax[2,1].set_title('KIC 11657684')
ax[2,2].set_title('HD102328-K3III')
ax[2,3].set_title('HD142091-K0III')
Resulting plot
Instead of doing all the normalization steps manually, it probably is easier to create a norm. In this case a norm that maps the values from 4000 till max to the range 0,1 needed for the colormap. Note that converting to hex isn't necessary.
With the norm and the colormap a ScalarMappable can be created with all the necessary information for a colorbar:
import matplotlib as mpl
import matplotlib.pyplot as plt  # used below for Normalize, colorbar and show
import numpy as np  # used below to generate the dummy data
import pandas as pd
from matplotlib.cm import ScalarMappable

# reading in the data
# df = pd.read_csv("shortlist_temp.dat", sep='\t', header=(0), usecols=(range(1, 13)))
# generating some dummy data
df = pd.DataFrame(np.random.randn(100, 12))

# separate list of values
orig_star_teff = [4308.0, 5112.0, 4240.0, 4042.0, 4411.0, 4100.0, 4511.0, 4738.0, 4630.0, 4870.0, 4442.0, 4845.0]

# The norm maps 4000..max(orig_star_teff) onto 0..1, the range the colormap expects.
norm = plt.Normalize(4000, max(orig_star_teff))
cmap = mpl.cm.plasma
color_list = cmap(norm(orig_star_teff))

# One colour per column: the histograms pick their colours from the prop cycle.
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=color_list)
axs = df.plot.hist(subplots=True, bins=12, legend=False, layout=(3, 4), figsize=(15, 10), sharey=True)

titles = ['ABOO', 'EpsVIR', 'HIP 96014', '2M16113361',
          'KIC 3955590', 'KIC 5113061', 'KIC 5859492', 'KIC 6547007',
          'KIC 11444313', 'KIC 11657684', 'HD102328-K3III', 'HD142091-K0III']
for ax, title in zip(axs.flat, titles):
    ax.set_title(title)

# The ScalarMappable carries the cmap and norm the colorbar needs; the
# colorbar is attached to the last column of subplots.
plt.colorbar(ScalarMappable(cmap=cmap, norm=norm), ax=axs[:, -1])
plt.show()

Wrap xlabels in Seaborn Plot

Have been trying to modify my plot such that the xlabels can be wrapped.
Have looked at few suggestions from similar questions.
But am unable to use them on this.
The ax.set_xticklabels code does not wrap the labels.
The plt.xticks code throws an error -
AttributeError: 'Text' object has no attribute 'expandtabs'
plt.figure(figsize = (7,5))
ax = sns.countplot(data = df3, x = df3.PaymentMethod, hue = df3.Churn)
#ax.set_xticklabels(ax.get_xticklabels(), ha="right", horizontalalignment = 'center', wrap = True)
plt.xticks([textwrap.fill(label, 10) for label in ax.get_xticklabels()],
rotation = 10, fontsize=8, horizontalalignment="center")
Image of plot with overlapping xlabels
textwrap works as expected with the code suggested in the comments:
import numpy as np # v 1.19.2
import pandas as pd # v 1.1.3
import seaborn as sns # v 0.11.0
import textwrap
# Create sample dataset
rng = np.random.default_rng(seed=1)
cat_names = ['Short name', 'Slightly longer name', 'Rather much longer name',
'Longest name of them all by far']
counts = rng.integers(10, 100, len(cat_names))
var_cat = np.repeat(cat_names, counts)
var_bool = rng.choice(['True', 'False'], size=len(var_cat))
df = pd.DataFrame(dict(vcat=var_cat, vbool=var_bool))
# Plot seaborn countplot with wrapped tick labels
ax = sns.countplot(data=df, x='vcat', hue='vbool')
labels = [textwrap.fill(label.get_text(), 12) for label in ax.get_xticklabels()]
ax.set_xticklabels(labels);

Seaborn (time series) boxplot using hue and different scale axes

I have a dataframe which has a number of values per date (datetime field). These values are classified as U (users) and S (session) by using a column Group. Seaborn is used to visualize two boxplots per date, where the hue is set to Group.
The problem comes when considering that the values corresponding to U (users) are much bigger than those corresponding to S (session), making the S data illegible. Thus, I need to come up with a solution that allows me to plot both series (U and S) in the same figure in an understandable manner.
I wonder if independent Y axes (with different scales) can be set to each hue, so that both Y axes are shown (as when using twinx but without losing hue visualization capabilities).
Any other alternative would be welcome =)
The S boxplot time series boxplot:
The combined boxplot time series using hue. Obviously it's not possible to see any information about the S group because of the scale of the Y axis:
The columns of the dataframe:
| Day (datetime) | n_data (numeric) | Group (S or U)|
The code line generating the combined boxplot:
seaborn.boxplot(ax=ax,x='Day', y='n_data', hue='Group', data=df,
palette='PRGn', showfliers=False)
Managed to find a solution by using twinx:
fig, ax = plt.subplots(figsize=(50, 10))

# Make one copy of the data per group and blank out (NaN) the other group's
# values, so each copy keeps both hue levels but only draws one of them.
# NOTE(review): the source frame was spelled both `groups` and `grupos`, and the
# masks indexed an undefined `tmp`; assumed all refer to the same DataFrame - confirm.
tmpU = groups.copy()
tmpU.loc[tmpU['Group'] != 'U', 'n_data'] = np.nan
tmpS = groups.copy()
tmpS.loc[tmpS['Group'] != 'S', 'n_data'] = np.nan

# U goes on the left y-axis, S on a twin axis with its own scale.
ax = seaborn.boxplot(ax=ax, x='Day', y='n_data', hue='Group', data=tmpU, palette='PRGn', showfliers=False)
ax2 = ax.twinx()
seaborn.boxplot(ax=ax2, x='Day', y='n_data', hue='Group', data=tmpS, palette='PRGn', showfliers=False)

# Rebuild the legend from the first two handles/labels of the left axis.
handles, labels = ax.get_legend_handles_labels()
l = plt.legend(handles[0:2], labels[0:2], loc=1)

plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
# Hide every other tick label to reduce crowding on the x-axis.
for label in ax.get_xticklabels()[::2]:
    label.set_visible(False)
plt.show()
plt.close('all')
The code above generates the following figure:
Which in this case turns out to be too dense to be published. Therefore I would adopt a visualization based on subplots, as Parfait suggested in his/her answer.
It wasn't an obvious solution to me so I would like to thank Parfait for his/her answer.
Consider building separate plots on same figure with y-axes ranges tailored to subsetted data. Below demonstrates with random data seeded for reproducibility (for readers of this post).
Data (with U values higher than S values)
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt
np.random.seed(2018)
u_df = pd.DataFrame({'Day': pd.date_range('2016-10-01', periods=10)\
.append(pd.date_range('2016-10-01', periods=10)),
'n_data': np.random.uniform(0,800,20),
'Group': 'U'})
s_df = pd.DataFrame({'Day': pd.date_range('2016-10-01', periods=10)\
.append(pd.date_range('2016-10-01', periods=10)),
'n_data': np.random.uniform(0,200,20),
'Group': 'S'})
df = pd.concat([u_df, s_df], ignore_index=True)
df['Day'] = df['Day'].astype('str')
Plot
fig = plt.figure(figsize=(10,5))
for i, (group_name, group_df) in enumerate(df.groupby('Group')):
    # Select the subplot first: plt.title() acts on the current axes, so
    # calling it before plt.subplot() puts each title on the wrong axes.
    plt.subplot(2, 1, i+1)
    plt.title('N_data of {}'.format(group_name))
    seaborn.boxplot(x="Day", y="n_data", data=group_df, palette="PRGn", showfliers=False)
plt.tight_layout()
plt.show()
plt.clf()
plt.close('all')
To retain original hue and grouping, render all non-group n_data to np.nan:
fig = plt.figure(figsize=(10,5))
for i,g in enumerate(df.Group.unique()):
plt.subplot(2, 1, i+1)
tmp = df.copy()
tmp.loc[tmp['Group']!=g, 'n_data'] = np.nan
seaborn.boxplot(x="Day", y="n_data", hue="Group", data=tmp,
palette="PRGn", showfliers=False)
plt.tight_layout()
plt.show()
plt.clf()
plt.close('all')
So one option to do a grouped box plot with two separate axes is to use hue_order=['value', np.nan] in your argument for sns.boxplot:
# NOTE(review): relies on names defined outside this snippet - `plt`, `sns`, `np`,
# `mpatches` (presumably matplotlib.patches), the DataFrame `m` and `customPalette`.
fig = plt.figure(figsize=(14,8))
mean_marker = {"marker":"s","markerfacecolor":"black", "markeredgecolor":"black"}

# Left axis: hue_order=['co2', np.nan] keeps two hue slots but only 'co2' is drawn.
ax = sns.boxplot(x="lon_bucketed", y="value", data=m, hue='name', hue_order=['co2',np.nan],
                 width=0.75, showmeans=True, meanprops=mean_marker, linewidth=0.5, palette=customPalette)
# Twin axis with its own y-scale: only 'g_xco2' is drawn, in the second hue slot.
ax2 = ax.twinx()
ax2 = sns.boxplot(ax=ax2, x="lon_bucketed", y="value", data=m, hue='name', hue_order=[np.nan,'g_xco2'],
                  width=0.75, showmeans=True, meanprops=mean_marker, linewidth=0.5, palette=customPalette)

# The snippet only defines `ax` and `ax2`; `ax1` was an undefined name.
ax.grid(alpha=0.5, which = 'major')
plt.tight_layout()

# Replace the automatically generated legend with one built from explicit patches.
ax.legend_.remove()
GW = mpatches.Patch(color='seagreen', label='$CO_2$')
WW = mpatches.Patch(color='mediumaquamarine', label='$XCO_2$')
# `fontsize` is dropped: matplotlib only uses it when `prop` is not given.
ax2.legend(handles=[GW,WW], loc='upper right', prop={'size': 14})

ax.set_title("$XCO_2$ vs. $CO_2$",fontsize=18)
ax.set_xlabel('Longitude [\u00b0]',fontsize=14)
ax.set_ylabel('$CO_2$ [ppm]',fontsize=14)
ax2.set_ylabel('$XCO_2$ [ppm]',fontsize=14)
ax.tick_params(labelsize=14)

Categories