matplotlib subplot - colors are alternate - python

I have a code here which prints various subplots. However, the second subplot is always alternate in color. How do I fix this such that all colors are consistent?
As you can see, the second subplot has its colors opposite of the first and third. This is consistent for every column
hrlist = [hrdata2015, hrdata2016, hrdata2017]
titles = ["2015", "2016", "2017"]
columns = ["Sex","Education Level","Salary Plan","Grade",
"Contract Type","Citizenship", "Division"]
for h in columns:
plt.figure(figsize=(40,40))
j = 0
for i in range(len(hrlist)):
j +=1
plt.subplot(2,2,j)
ax1 = sns.countplot(data=hrlist[i],x= h,hue="HR Status", order = hrlist[i][h].value_counts().index)
ax1.set_title(titles[i])
ax1.legend(loc = "upper right", prop={'size': 12})
if(h=="Education Level" or h=="Grade"):
plt.xticks(fontsize = 9)
elif (h == "Division"):
plt.xticks(rotation = 60, fontsize = 8)
else:
plt.xticks(fontsize = 12)
for p in ax1.patches:
height = p.get_height()
ax1.text(p.get_x()+p.get_width()/2,
height + 1,
'{:1.0f}'.format(height,0),
ha="center",rotation=0)
plt.tight_layout()
plt.subplots_adjust(top=0.948,
bottom=0.115,
left=0.052,
right=0.986,
hspace=0.533,
wspace=0.128)
plt.show()

My guess is your hrdata2016 happen to have the first row being "Inactive", while your hrdata2015 and hrdata2017 both have the first row being "Active". Since you didn't define the hue(color) order, the order in the DataFrame was used. Define hue(color) order by hue_order argument like this:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
df1 = pd.DataFrame([['F', 'Active'],
['M', 'Inactive'],
['F', 'Inactive'],
['M', 'Active'],
['F', 'Inactive'],
['M', 'Inactive'],
['F', 'Active']], columns=['Sex', 'HR Status'])
df2 = df1.drop(0)
df3 = df2.drop(1)
hrlist = [df1, df2, df3]
h = 'Sex'
for i in range(len(hrlist)):
plt.subplot(2,2,i+1)
# ax1 = sns.countplot(data=hrlist[i], x=h, hue="HR Status", order=hrlist[i][h].value_counts().index)
ax1 = sns.countplot(data=hrlist[i], x=h, hue="HR Status",
order=hrlist[i][h].value_counts().index,
hue_order=hrlist[i]["HR Status"].value_counts().index)
plt.show()

Related

Line chart to surface chart

[UPDATE: Sorry for not providing the piece where the author of the codes create example data. I have updated the codes]
I found an example of a 3D mesh line chart that satisfied what I need (colouring change with level on z dimension). However, instead of line, I want surface plot. How can I change the codes to have the 3d surface plot?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import animation, rc
from matplotlib.cm import get_cmap
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.font_manager import FontProperties
from matplotlib.collections import LineCollection
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d.art3d import Line3DCollection
index_returns = np.random.normal(loc=1e-4, scale=5e-3, size=(783, 9))
index_returns = np.vstack((np.zeros(shape=(1, 9)) + 100, index_returns))
index_prices = np.cumprod(1 + index_returns, axis=0)
window = 261
df = np.zeros(shape=(index_prices.shape[0]-window, 9))
for i in range(window, index_prices.shape[0], 1):
df[i-window] = (index_prices[i]/index_prices[i-window]) - 1
index = pd.date_range('2019-01-01', periods=index_prices.shape[0]-window, freq='B')
columns = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
df = pd.DataFrame(df, index=index, columns=columns)
# create the figure
fig = plt.figure(figsize=(14.4, 9))
ax = fig.add_subplot(111, projection='3d')
fig.patch.set_alpha(1)
# get the cmap to use
cmap = get_cmap('RdYlGn')
# get the slice based on data frame
current_slice = df.values[:261, :]
index_names = df.columns
index_dates = df.index
# list holding the lines
lines = []
# for each index...
for i in range(current_slice.shape[1]):
# get the coordinates
x = np.array(np.arange(current_slice.shape[0]))
y = np.tile(i, current_slice.shape[0])
z = np.array(current_slice[:, i])
# crete points and segments to color
points = np.array([x, y, z]).T.reshape(-1, 1, 3)
segments = np.concatenate([points[:-1], points[1:]], axis=1)
# Create a continuous norm to map from data points to colors
norm = plt.Normalize(-0.19, 0.19)
lc = Line3DCollection(segments, cmap=cmap, norm=norm, zorder=current_slice.shape[1]-i)
# Set the values used for colormapping
lc.set_array(z)
lc.set_linewidth(2)
lc.set_color(cmap(z[-1] * 2.5 + 0.5))
lc.set_label(index_names[i])
lines.append(ax.add_collection(lc))
# add the grids
ax.legend(loc='center right', bbox_to_anchor=(1.1, 0.46), fancybox=True, facecolor=(.95,.95,.95,1), framealpha=1, shadow=False, frameon=True, ncol=1, columnspacing=0, prop={'family': 'DejaVu Sans Mono'})
ax.set_zlabel('Rolling Equity 1Y', labelpad=10)
ax.set_zlim(-0.39, 0.39)
ax.set_zticklabels([' '* 3 + '{:.0%}'.format(val) for val in ax.get_zticks()], fontdict={'verticalalignment': 'center', 'horizontalalignment': 'center'})
ax.set_xlabel('Date', labelpad=30)
ax.set_xlim(0, current_slice.shape[0]-1)
ax.set_xticklabels([index_dates[int(val)].strftime('%m/%y') for val in ax.get_xticks()[:-1]] + [''], rotation=0, fontdict={'verticalalignment': 'top', 'horizontalalignment': 'center'})
ax.set_yticks(np.arange(current_slice.shape[1]))
ax.set_yticklabels([index_names[i] for i in range(current_slice.shape[1])], rotation=-15, fontdict={'verticalalignment': 'center', 'horizontalalignment': 'left'})
# show the plot
plt.show()

How to create a grouped scatter plot in python [duplicate]

This question already has answers here:
Color by Column Values in Matplotlib
(6 answers)
Closed 1 year ago.
I am trying to make a simple scatter plot in pyplot using a Pandas DataFrame object, but want an efficient way of plotting two variables but have the symbols dictated by a third column (key). I have tried various ways using df.groupby, but not successfully. A sample df script is below. This colours the markers according to 'key1', but Id like to see a legend with 'key1' categories. Am I close? Thanks.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.normal(10,1,30).reshape(10,3), index = pd.date_range('2010-01-01', freq = 'M', periods = 10), columns = ('one', 'two', 'three'))
df['key1'] = (4,4,4,6,6,6,8,8,8,8)
fig1 = plt.figure(1)
ax1 = fig1.add_subplot(111)
ax1.scatter(df['one'], df['two'], marker = 'o', c = df['key1'], alpha = 0.8)
plt.show()
You can use scatter for this, but that requires having numerical values for your key1, and you won't have a legend, as you noticed.
It's better to just use plot for discrete categories like this. For example:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
np.random.seed(1974)
# Generate Data
num = 20
x, y = np.random.random((2, num))
labels = np.random.choice(['a', 'b', 'c'], num)
df = pd.DataFrame(dict(x=x, y=y, label=labels))
groups = df.groupby('label')
# Plot
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=name)
ax.legend()
plt.show()
If you'd like things to look like the default pandas style, then just update the rcParams with the pandas stylesheet and use its color generator. (I'm also tweaking the legend slightly):
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
np.random.seed(1974)
# Generate Data
num = 20
x, y = np.random.random((2, num))
labels = np.random.choice(['a', 'b', 'c'], num)
df = pd.DataFrame(dict(x=x, y=y, label=labels))
groups = df.groupby('label')
# Plot
plt.rcParams.update(pd.tools.plotting.mpl_stylesheet)
colors = pd.tools.plotting._get_standard_colors(len(groups), color_type='random')
fig, ax = plt.subplots()
ax.set_color_cycle(colors)
ax.margins(0.05)
for name, group in groups:
ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=name)
ax.legend(numpoints=1, loc='upper left')
plt.show()
This is simple to do with Seaborn (pip install seaborn) as a oneliner
sns.scatterplot(x_vars="one", y_vars="two", data=df, hue="key1")
:
import seaborn as sns
import pandas as pd
import numpy as np
np.random.seed(1974)
df = pd.DataFrame(
np.random.normal(10, 1, 30).reshape(10, 3),
index=pd.date_range('2010-01-01', freq='M', periods=10),
columns=('one', 'two', 'three'))
df['key1'] = (4, 4, 4, 6, 6, 6, 8, 8, 8, 8)
sns.scatterplot(x="one", y="two", data=df, hue="key1")
Here is the dataframe for reference:
Since you have three variable columns in your data, you may want to plot all pairwise dimensions with:
sns.pairplot(vars=["one","two","three"], data=df, hue="key1")
https://rasbt.github.io/mlxtend/user_guide/plotting/category_scatter/ is another option.
With plt.scatter, I can only think of one: to use a proxy artist:
df = pd.DataFrame(np.random.normal(10,1,30).reshape(10,3), index = pd.date_range('2010-01-01', freq = 'M', periods = 10), columns = ('one', 'two', 'three'))
df['key1'] = (4,4,4,6,6,6,8,8,8,8)
fig1 = plt.figure(1)
ax1 = fig1.add_subplot(111)
x=ax1.scatter(df['one'], df['two'], marker = 'o', c = df['key1'], alpha = 0.8)
ccm=x.get_cmap()
circles=[Line2D(range(1), range(1), color='w', marker='o', markersize=10, markerfacecolor=item) for item in ccm((array([4,6,8])-4.0)/4)]
leg = plt.legend(circles, ['4','6','8'], loc = "center left", bbox_to_anchor = (1, 0.5), numpoints = 1)
And the result is:
You can use df.plot.scatter, and pass an array to c= argument defining the color of each point:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.normal(10,1,30).reshape(10,3), index = pd.date_range('2010-01-01', freq = 'M', periods = 10), columns = ('one', 'two', 'three'))
df['key1'] = (4,4,4,6,6,6,8,8,8,8)
colors = np.where(df["key1"]==4,'r','-')
colors[df["key1"]==6] = 'g'
colors[df["key1"]==8] = 'b'
print(colors)
df.plot.scatter(x="one",y="two",c=colors)
plt.show()
From matplotlib 3.1 onwards you can use .legend_elements(). An example is shown in Automated legend creation. The advantage is that a single scatter call can be used.
In this case:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.normal(10,1,30).reshape(10,3),
index = pd.date_range('2010-01-01', freq = 'M', periods = 10),
columns = ('one', 'two', 'three'))
df['key1'] = (4,4,4,6,6,6,8,8,8,8)
fig, ax = plt.subplots()
sc = ax.scatter(df['one'], df['two'], marker = 'o', c = df['key1'], alpha = 0.8)
ax.legend(*sc.legend_elements())
plt.show()
In case the keys were not directly given as numbers, it would look as
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.normal(10,1,30).reshape(10,3),
index = pd.date_range('2010-01-01', freq = 'M', periods = 10),
columns = ('one', 'two', 'three'))
df['key1'] = list("AAABBBCCCC")
labels, index = np.unique(df["key1"], return_inverse=True)
fig, ax = plt.subplots()
sc = ax.scatter(df['one'], df['two'], marker = 'o', c = index, alpha = 0.8)
ax.legend(sc.legend_elements()[0], labels)
plt.show()
You can also try Altair or ggpot which are focused on declarative visualisations.
import numpy as np
import pandas as pd
np.random.seed(1974)
# Generate Data
num = 20
x, y = np.random.random((2, num))
labels = np.random.choice(['a', 'b', 'c'], num)
df = pd.DataFrame(dict(x=x, y=y, label=labels))
Altair code
from altair import Chart
c = Chart(df)
c.mark_circle().encode(x='x', y='y', color='label')
ggplot code
from ggplot import *
ggplot(aes(x='x', y='y', color='label'), data=df) +\
geom_point(size=50) +\
theme_bw()
It's rather hacky, but you could use one1 as a Float64Index to do everything in one go:
df.set_index('one').sort_index().groupby('key1')['two'].plot(style='--o', legend=True)
Note that as of 0.20.3, sorting the index is necessary, and the legend is a bit wonky.
seaborn has a wrapper function scatterplot that does it more efficiently.
sns.scatterplot(data = df, x = 'one', y = 'two', data = 'key1'])

boxplot show max and min fliers results in TypeError: 'AxesSubplot' object is not subscriptable

I am preparing box plots with a whisker interval of [2,98]. The issue is that I am working with air quality data and have a large range of data points, so the outliers take up the entire figure and overshadow the boxplots. I would like to plot the max and min outliers only and have tried the method from Matplotlib boxplot show only max and min fliers, however, I get an error message that says TypeError: 'AxesSubplot' object is not subscriptable.
Here is my code:
fig,ax = plt.subplots(1, figsize=(8,6))
g = sns.boxplot(data=mda8, orient='v', width = 0.7, whis = (2,98))
fliers = g['fliers']
for fly in fliers:
fdata=fly.get_data
fly.set_data([fdata[0][0],fdata[0][-1],fdata[1][0],fdata[1][-1]])
xvalues = ['Niland', 'El Centro', 'Calexico']
plt.xticks(np.arange(3), xvalues, fontsize=12)
ax.set_ylabel('Ozone MDA8 (ppb)',fontsize=15)
ax.set_ylim(0,105)
plt.show()
Here's some sample data:
mda8 = pd.DataFrame({
'T1':[35.000000, 32.125000, 32.000000, 35.250000, 28.875000, 28.500000, 29.375000, 25.125000, 34.166667, 35.250000],
'T2':[28.375, 30.750, 33.250, 34.000, 32.875, 30.250, 29.875, 100.409, 29.625, 1.232],
'T3':[34.250, 102.232, 28.250, 33.000, 27.625, 21.500, 28.375, 30.250, 3.454, 33.750]})
I need help with plotting the max and min outliers only and am open to doing another method besides the one that I tried here.
EDIT here's the link to my csv file https://drive.google.com/file/d/1E3A0UAYCbSN53JXtfsbrA4i_Phci_JWf/view?usp=sharing
A possible approach could be:
hide the outliers plotted by seaborn.boxplot by passing showfliers = False parameter:
sns.boxplot(data=mda8, orient='v', width = 0.7, whis = (2,98), showfliers = False)
get the list of outliers for each column, find maximum and minimum and plot only them:
outliers = {col: list(stat['fliers']) for col in mda8.columns for stat in boxplot_stats(mda8[col])}
min_max_outliers = {key: [np.min(value), np.max(value)] if value != [] else [] for key, value in outliers.items()}
i = 0
for key, value in min_max_outliers.items():
if value != []:
ax.scatter([i, i], value, marker = 'd', facecolor = 'black')
i += 1
Complete Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.cbook import boxplot_stats
mda8 = pd.DataFrame({'T1': [35.000000, 32.125000, 32.000000, 35.250000, 28.875000, 28.500000, 29.375000, 25.125000, 34.166667, 35.250000],
'T2': [28.375, 30.750, 33.250, 34.000, 32.875, 30.250, 29.875, 100.409, 29.625, 1.232],
'T3': [34.250, 102.232, 28.250, 33.000, 27.625, 21.500, 28.375, 30.250, 3.454, 33.750]})
fig,ax = plt.subplots(1, figsize=(8,6))
sns.boxplot(data=mda8, orient='v', width = 0.7, whis = (2,98), showfliers = False)
outliers = {col: list(stat['fliers']) for col in mda8.columns for stat in boxplot_stats(mda8[col])}
min_max_outliers = {key: [np.min(value), np.max(value)] if value != [] else [] for key, value in outliers.items()}
i = 0
for key, value in min_max_outliers.items():
if value != []:
ax.scatter([i, i], value, marker = 'd', facecolor = 'black')
i += 1
xvalues = ['Niland', 'El Centro', 'Calexico']
plt.xticks(np.arange(3), xvalues, fontsize=12)
ax.set_ylabel('Ozone MDA8 (ppb)',fontsize=15)
ax.set_ylim(0,105)
plt.show()
EDIT
Working on the data your provided, if I plot them as they are:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
mda8 = pd.read_csv(r'data/MDA8_allregions.csv')
mda8 = mda8.drop(['date', 'date.1', 'date.2'], axis = 1)
fig, ax = plt.subplots(1, figsize = (8, 6))
sns.boxplot(data = mda8, orient = 'v', width = 0.7, whis = (2, 98), showfliers = True)
plt.show()
I get:
In the code above I change the parameter showfliers = False, in order to hide outliers.
Then, as suggested by JohanC in the comment, a simpler way to plot outliers is to plot min and max for each column:
for i, col in enumerate(mda8.columns, 0):
ax.scatter([i, i], [mda8[col].min(), mda8[col].max()], marker = 'd', facecolor = 'black')
Complete Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
mda8 = pd.read_csv(r'data/MDA8_allregions.csv')
mda8 = mda8.drop(['date', 'date.1', 'date.2'], axis = 1)
fig, ax = plt.subplots(1, figsize = (8, 6))
sns.boxplot(data = mda8, orient = 'v', width = 0.7, whis = (2, 98), showfliers = False)
for i, col in enumerate(mda8.columns, 0):
ax.scatter([i, i], [mda8[col].min(), mda8[col].max()], marker = 'd', facecolor = 'black')
plt.show()

Plot multiple subplots as animations

I have two separate subplots that I'm hoping to display as animations. For the subplots below, ax1 displays an animated scatter plot, while ax2 is a scatter now, I'm hoping to alter this to a line plot.
Please note: I've simplified the question to only display relevant info. However I'm hoping to keep the code similar to what it is now.
Below is my attempt:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import pandas as pd
DATA_LIMITS = [0, 15]
def datalimits(*data):
return DATA_LIMITS
fig = plt.figure(figsize=(10,18))
grid = plt.GridSpec(1, 3, wspace=0.4, hspace=0.3)
gridsize = (3, 2)
ax1 = plt.subplot2grid(gridsize, (0, 0), colspan=2, rowspan=2)
ax2 = plt.subplot2grid(gridsize, (2, 0), colspan=2, rowspan=2)
ax1.grid(False)
ax2.grid(False)
ax1.set_xlim(DATA_LIMITS)
ax1.set_ylim(DATA_LIMITS)
line_a, = ax1.plot([], [], 'o', c='red', alpha = 0.5, markersize=5,zorder=3)
line_b, = ax1.plot([], [], 'o', c='blue', alpha = 0.5, markersize=5,zorder=3)
lines=[line_a,line_b]
scat = ax1.scatter([], [], s=20, marker='o', c='white', alpha = 1,zorder=3)
scats=[scat]
line_d = ax2.plot([], [], 'o', c = 'k')
ax2.set_ylim(-6,6)
ax2.set_xlim(0,15)
def plots(tdf, xlim=None, ylim=None, fig=fig, ax=ax1):
df = tdf[1]
if xlim is None: xlim = datalimits(df['X'])
if ylim is None: ylim = datalimits(df['Y'])
for (group, gdf), group_line in zip(df.groupby('group'), lines+scats+line_d):
if group in ['A','B','D']:
group_line.set_data(*gdf[['X','Y']].values.T)
elif group in ['C']:
gdf['X'].values, gdf['Y'].values
scat.set_offsets(gdf[['X','Y']].values)
return [scat] + [line_a,line_b] + [line_d]
n = 9
time = range(n)
d = ({
'A1_X' : [13,14,12,13,11,12,13,12,11,10],
'A1_Y' : [6,6,7,7,7,8,8,8,9,10],
'A2_X' : [7,6,5,7,6,3,4,5,6,6],
'A2_Y' : [11,12,11,10,11,12,10,11,10,9],
'B1_X' : [8,9,8,7,6,7,5,6,7,6],
'B1_Y' : [3,4,3,2,3,4,2,1,2,3],
'B2_X' : [13,14,14,14,13,13,13,12,12,12],
'B2_Y' : [5,4,3,2,4,5,4,6,3,3],
'C1_X' : [5,6,7,5,6,5,6,5,6,5],
'C1_Y' : [10,11,10,11,12,11,10,8,7,6],
'D1_X' : [0,1,2,3,4,5,6,7,8,9],
'D1_Y' : [0,1,2,3,4,3,2,1,0,-1],
})
tuples = [((t, k.split('_')[0][0], int(k.split('_')[0][1:]), k.split('_')[1]), v[i])
for k,v in d.items() for i,t in enumerate(time) ]
df = pd.Series(dict(tuples)).unstack(-1)
df.index.names = ['time', 'group', 'id']
interval_ms = 1000
delay_ms = 2000
ani = animation.FuncAnimation(fig, plots, frames=df.groupby('time'), interval=interval_ms, repeat_delay=delay_ms,)
plt.show()
Edit 3: I've deleted all previous updates to keep things clean; you can still check them out in the edit history.
See if this code does what you want, changes are marked via comments:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import pandas as pd
import numpy as np #<< a new import is required
DATA_LIMITS = [0, 15]
def datalimits(*data):
return DATA_LIMITS
fig = plt.figure(figsize=(10,18))
grid = plt.GridSpec(1, 3, wspace=0.4, hspace=0.3)
gridsize = (3, 2)
ax1 = plt.subplot2grid(gridsize, (0, 0), colspan=2, rowspan=2)
ax2 = plt.subplot2grid(gridsize, (2, 0), colspan=2, rowspan=2)
ax1.grid(False)
ax2.grid(False)
ax1.set_xlim(DATA_LIMITS)
ax1.set_ylim(DATA_LIMITS)
line_a, = ax1.plot([], [], 'o', c='red', alpha = 0.5, markersize=5,zorder=3)
line_b, = ax1.plot([], [], 'o', c='blue', alpha = 0.5, markersize=5,zorder=3)
lines=[line_a,line_b]
scat = ax1.scatter([], [], s=20, marker='o', c='white', alpha = 1,zorder=3)
scats=[scat]
line_d = ax2.plot([], [], '-', c = 'k') ##<< using '-' makes this a line plot
ax2.set_ylim(-6,6)
ax2.set_xlim(0,15)
def plots(tdf, xlim=None, ylim=None, fig=fig, ax=ax1):
df = tdf[1]
if xlim is None: xlim = datalimits(df['X'])
if ylim is None: ylim = datalimits(df['Y'])
for (group, gdf), group_line in zip(df.groupby('group'), lines+scats+line_d):
if group in ['A','B']: #<< 'D' is moved to a new if case
group_line.set_data(*gdf[['X','Y']].values.T)
elif group in ['D']:
if tdf[0]==0: #<< use this to "reset the line" when the animation restarts
## or remove the if/else part here if you want continuous (over-)plotting
group_line.set_data([0,0])
else:
_x,_y=group_line.get_data()
_x=np.append(_x,gdf['X'].values)
_y=np.append(_y,gdf['Y'].values)
group_line.set_data([_x,_y])
elif group in ['C']:
gdf['X'].values, gdf['Y'].values
scat.set_offsets(gdf[['X','Y']].values)
return [scat] + [line_a,line_b] + [line_d]
n = 9
time = range(n)
d = ({
'A1_X' : [13,14,12,13,11,12,13,12,11,10],
'A1_Y' : [6,6,7,7,7,8,8,8,9,10],
'A2_X' : [7,6,5,7,6,3,4,5,6,6],
'A2_Y' : [11,12,11,10,11,12,10,11,10,9],
'B1_X' : [8,9,8,7,6,7,5,6,7,6],
'B1_Y' : [3,4,3,2,3,4,2,1,2,3],
'B2_X' : [13,14,14,14,13,13,13,12,12,12],
'B2_Y' : [5,4,3,2,4,5,4,6,3,3],
'C1_X' : [5,6,7,5,6,5,6,5,6,5],
'C1_Y' : [10,11,10,11,12,11,10,8,7,6],
'D1_X' : [0,1,2,3,4,5,6,7,8,9],
'D1_Y' : [0,1,2,3,4,3,2,1,0,-1],
})
tuples = [((t, k.split('_')[0][0], int(k.split('_')[0][1:]), k.split('_')[1]), v[i])
for k,v in d.items() for i,t in enumerate(time) ]
df = pd.Series(dict(tuples)).unstack(-1)
df.index.names = ['time', 'group', 'id']
interval_ms = 1000
delay_ms = 2000
ani = animation.FuncAnimation(fig, plots, frames=df.groupby('time'), interval=interval_ms, repeat_delay=delay_ms,)
plt.show()

Scatter plots in Pandas/Pyplot: How to plot by category [duplicate]

This question already has answers here:
Color by Column Values in Matplotlib
(6 answers)
Closed 1 year ago.
I am trying to make a simple scatter plot in pyplot using a Pandas DataFrame object, but want an efficient way of plotting two variables but have the symbols dictated by a third column (key). I have tried various ways using df.groupby, but not successfully. A sample df script is below. This colours the markers according to 'key1', but Id like to see a legend with 'key1' categories. Am I close? Thanks.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.normal(10,1,30).reshape(10,3), index = pd.date_range('2010-01-01', freq = 'M', periods = 10), columns = ('one', 'two', 'three'))
df['key1'] = (4,4,4,6,6,6,8,8,8,8)
fig1 = plt.figure(1)
ax1 = fig1.add_subplot(111)
ax1.scatter(df['one'], df['two'], marker = 'o', c = df['key1'], alpha = 0.8)
plt.show()
You can use scatter for this, but that requires having numerical values for your key1, and you won't have a legend, as you noticed.
It's better to just use plot for discrete categories like this. For example:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
np.random.seed(1974)
# Generate Data
num = 20
x, y = np.random.random((2, num))
labels = np.random.choice(['a', 'b', 'c'], num)
df = pd.DataFrame(dict(x=x, y=y, label=labels))
groups = df.groupby('label')
# Plot
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=name)
ax.legend()
plt.show()
If you'd like things to look like the default pandas style, then just update the rcParams with the pandas stylesheet and use its color generator. (I'm also tweaking the legend slightly):
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
np.random.seed(1974)
# Generate Data
num = 20
x, y = np.random.random((2, num))
labels = np.random.choice(['a', 'b', 'c'], num)
df = pd.DataFrame(dict(x=x, y=y, label=labels))
groups = df.groupby('label')
# Plot
plt.rcParams.update(pd.tools.plotting.mpl_stylesheet)
colors = pd.tools.plotting._get_standard_colors(len(groups), color_type='random')
fig, ax = plt.subplots()
ax.set_color_cycle(colors)
ax.margins(0.05)
for name, group in groups:
ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=name)
ax.legend(numpoints=1, loc='upper left')
plt.show()
This is simple to do with Seaborn (pip install seaborn) as a oneliner
sns.scatterplot(x_vars="one", y_vars="two", data=df, hue="key1")
:
import seaborn as sns
import pandas as pd
import numpy as np
np.random.seed(1974)
df = pd.DataFrame(
np.random.normal(10, 1, 30).reshape(10, 3),
index=pd.date_range('2010-01-01', freq='M', periods=10),
columns=('one', 'two', 'three'))
df['key1'] = (4, 4, 4, 6, 6, 6, 8, 8, 8, 8)
sns.scatterplot(x="one", y="two", data=df, hue="key1")
Here is the dataframe for reference:
Since you have three variable columns in your data, you may want to plot all pairwise dimensions with:
sns.pairplot(vars=["one","two","three"], data=df, hue="key1")
https://rasbt.github.io/mlxtend/user_guide/plotting/category_scatter/ is another option.
With plt.scatter, I can only think of one: to use a proxy artist:
df = pd.DataFrame(np.random.normal(10,1,30).reshape(10,3), index = pd.date_range('2010-01-01', freq = 'M', periods = 10), columns = ('one', 'two', 'three'))
df['key1'] = (4,4,4,6,6,6,8,8,8,8)
fig1 = plt.figure(1)
ax1 = fig1.add_subplot(111)
x=ax1.scatter(df['one'], df['two'], marker = 'o', c = df['key1'], alpha = 0.8)
ccm=x.get_cmap()
circles=[Line2D(range(1), range(1), color='w', marker='o', markersize=10, markerfacecolor=item) for item in ccm((array([4,6,8])-4.0)/4)]
leg = plt.legend(circles, ['4','6','8'], loc = "center left", bbox_to_anchor = (1, 0.5), numpoints = 1)
And the result is:
You can use df.plot.scatter, and pass an array to c= argument defining the color of each point:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.normal(10,1,30).reshape(10,3), index = pd.date_range('2010-01-01', freq = 'M', periods = 10), columns = ('one', 'two', 'three'))
df['key1'] = (4,4,4,6,6,6,8,8,8,8)
colors = np.where(df["key1"]==4,'r','-')
colors[df["key1"]==6] = 'g'
colors[df["key1"]==8] = 'b'
print(colors)
df.plot.scatter(x="one",y="two",c=colors)
plt.show()
From matplotlib 3.1 onwards you can use .legend_elements(). An example is shown in Automated legend creation. The advantage is that a single scatter call can be used.
In this case:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.normal(10,1,30).reshape(10,3),
index = pd.date_range('2010-01-01', freq = 'M', periods = 10),
columns = ('one', 'two', 'three'))
df['key1'] = (4,4,4,6,6,6,8,8,8,8)
fig, ax = plt.subplots()
sc = ax.scatter(df['one'], df['two'], marker = 'o', c = df['key1'], alpha = 0.8)
ax.legend(*sc.legend_elements())
plt.show()
In case the keys were not directly given as numbers, it would look as
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.normal(10,1,30).reshape(10,3),
index = pd.date_range('2010-01-01', freq = 'M', periods = 10),
columns = ('one', 'two', 'three'))
df['key1'] = list("AAABBBCCCC")
labels, index = np.unique(df["key1"], return_inverse=True)
fig, ax = plt.subplots()
sc = ax.scatter(df['one'], df['two'], marker = 'o', c = index, alpha = 0.8)
ax.legend(sc.legend_elements()[0], labels)
plt.show()
You can also try Altair or ggpot which are focused on declarative visualisations.
import numpy as np
import pandas as pd
np.random.seed(1974)
# Generate Data
num = 20
x, y = np.random.random((2, num))
labels = np.random.choice(['a', 'b', 'c'], num)
df = pd.DataFrame(dict(x=x, y=y, label=labels))
Altair code
from altair import Chart
c = Chart(df)
c.mark_circle().encode(x='x', y='y', color='label')
ggplot code
from ggplot import *
ggplot(aes(x='x', y='y', color='label'), data=df) +\
geom_point(size=50) +\
theme_bw()
It's rather hacky, but you could use one1 as a Float64Index to do everything in one go:
df.set_index('one').sort_index().groupby('key1')['two'].plot(style='--o', legend=True)
Note that as of 0.20.3, sorting the index is necessary, and the legend is a bit wonky.
seaborn has a wrapper function scatterplot that does it more efficiently.
sns.scatterplot(data = df, x = 'one', y = 'two', data = 'key1'])

Categories