I've spent hours on trying to do what I thought was a simple task, which is to add labels onto an XY plot while using seaborn.
Here's my code
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
df_iris=sns.load_dataset("iris")
sns.lmplot('sepal_length', # Horizontal axis
'sepal_width', # Vertical axis
data=df_iris, # Data source
fit_reg=False, # Don't fix a regression line
size = 8,
aspect =2 ) # size and dimension
plt.title('Example Plot')
# Set x-axis label
plt.xlabel('Sepal Length')
# Set y-axis label
plt.ylabel('Sepal Width')
I would like to add to each dot on the plot the text in "species" column.
I've seen many examples using matplotlib but not using seaborn.
Any ideas? Thank you.
One way you can do this is as follows:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
df_iris=sns.load_dataset("iris")
ax = sns.lmplot('sepal_length', # Horizontal axis
'sepal_width', # Vertical axis
data=df_iris, # Data source
fit_reg=False, # Don't fix a regression line
size = 10,
aspect =2 ) # size and dimension
plt.title('Example Plot')
# Set x-axis label
plt.xlabel('Sepal Length')
# Set y-axis label
plt.ylabel('Sepal Width')
def label_point(x, y, val, ax):
a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
for i, point in a.iterrows():
ax.text(point['x']+.02, point['y'], str(point['val']))
label_point(df_iris.sepal_length, df_iris.sepal_width, df_iris.species, plt.gca())
Here's a more up-to-date answer that doesn't suffer from the string issue described in the comments.
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
df_iris=sns.load_dataset("iris")
plt.figure(figsize=(20,10))
p1 = sns.scatterplot(x='sepal_length', # Horizontal axis
y='sepal_width', # Vertical axis
data=df_iris, # Data source
size = 8,
legend=False)
for line in range(0,df_iris.shape[0]):
p1.text(df_iris.sepal_length[line]+0.01, df_iris.sepal_width[line],
df_iris.species[line], horizontalalignment='left',
size='medium', color='black', weight='semibold')
plt.title('Example Plot')
# Set x-axis label
plt.xlabel('Sepal Length')
# Set y-axis label
plt.ylabel('Sepal Width')
Thanks to the 2 other answers, here is a function scatter_text that makes it possible to reuse these plots several times.
import seaborn as sns
import matplotlib.pyplot as plt
def scatter_text(x, y, text_column, data, title, xlabel, ylabel):
"""Scatter plot with country codes on the x y coordinates
Based on this answer: https://stackoverflow.com/a/54789170/2641825"""
# Create the scatter plot
p1 = sns.scatterplot(x, y, data=data, size = 8, legend=False)
# Add text besides each point
for line in range(0,data.shape[0]):
p1.text(data[x][line]+0.01, data[y][line],
data[text_column][line], horizontalalignment='left',
size='medium', color='black', weight='semibold')
# Set title and axis labels
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
return p1
Use the function as follows:
df_iris=sns.load_dataset("iris")
plt.figure(figsize=(20,10))
scatter_text('sepal_length', 'sepal_width', 'species',
data = df_iris,
title = 'Iris sepals',
xlabel = 'Sepal Length (cm)',
ylabel = 'Sepal Width (cm)')
See also this answer on how to have a function that returns a plot:
https://stackoverflow.com/a/43926055/2641825
Below is a solution that does not iterate over rows in the data frame using the dreaded for loop.
There are many issues regarding iterating over a data frame.
The answer is don't iterate! See this link.
The solution below relies on a function (plotlabel) within the petalplot function, which is called by df.apply.
Now, I know readers will comment on the fact that I use scatter and not lmplot, but that is a bit besides the point.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
df_iris=sns.load_dataset("iris")
def petalplot(df):
def plotlabel(xvar, yvar, label):
ax.text(xvar+0.002, yvar, label)
fig = plt.figure(figsize=(30,10))
ax = sns.scatterplot(x = 'sepal_length', y = 'sepal_width', data=df)
# The magic starts here:
df.apply(lambda x: plotlabel(x['sepal_length'], x['sepal_width'], x['species']), axis=1)
plt.title('Example Plot')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
petalplot(df_iris)
Same idea with Scott Boston's answer, however with Seaborn v0.12+, you can leverage seaborn.FacetGrid.apply to add labels on plots and set up your figure in one go:
import seaborn as sns
import pandas as pd
%matplotlib inline
sns.set_theme()
df_iris = sns.load_dataset("iris")
(
sns.lmplot(
data=df_iris,
x="sepal_length",
y="sepal_width",
fit_reg=False,
height=8,
aspect=2
)
.apply(lambda grid: [
grid.ax.text(r["sepal_length"]+.02, r["sepal_width"], r["species"])
for r in df_iris.to_dict(orient="records")
])
.set(title="Example Plot")
.set_axis_labels("Sepal Length", "Sepal Width")
)
Or, if you don't need to use lmplot, also from v0.12, you can use the seaborn.objects interface. This way we don't need to manually iterate over the Iris dataframe nor refer to df_iris or column names sepal_... multiple times.
import seaborn.objects as so
(
so.Plot(df_iris, x="sepal_length", y="sepal_width", text="species")
.add(so.Dot())
.add(so.Text(halign="left"))
.label(title="Example plot", x="Sepal Length", y="Sepal Width")
.layout(size=(20, 10))
)
This produces the below figure:
Use the powerful declarative API to avoid loops (seaborn>=0.12).
Specifically, put x,y, and annotations into a pandas data frame and call plotting.
Here is an example from my own research work.
import seaborn.objects as so
import pandas as pd
df = pd.DataFrame(..,columns=['phase','P(X=1)','text'])
fig,ax = plt.subplots()
p = so.Plot(df,x='phase',y='P(X=1)',text='text').add(so.Dot(marker='+')).add(so.Text(halign='left'))
p.on(ax).show()
Related
I want to create a heatmap with seaborn, similar to this (with the following code):
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Create data
df = pd.DataFrame(np.random.random((5,5)), columns=["a","b","c","d","e"])
# Default heatmap
ax = sns.heatmap(df)
plt.show()
I'd also like to add a new variable (lets say new_var = pd.DataFrame(np.random.random((5,1)), columns=["new variable"])), such as that the values (and possibly the spine and ticks as well) of the y-axis are colored according to the new variable and a second color bar plotted in the same plot to represent the colors of the y-axis values. How can I do that?
This uses the new values to color the y-ticks and the y-tick labels and adds the associated colorbar.
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import pandas as pd
import numpy as np
# Create data
df = pd.DataFrame(np.random.random((5,5)), columns=["a","b","c","d","e"])
# Default heatmap
ax = sns.heatmap(df)
new_var = pd.DataFrame(np.random.random((5,1)), columns=["new variable"])
# Create the colorbar for y-ticks and labels
norm = plt.Normalize(new_var.min(), new_var.max())
cmap = matplotlib.cm.get_cmap('turbo')
yticks_locations = ax.get_yticks()
yticks_labels = df.index.values
#hide original ticks
ax.tick_params(axis='y', left=False)
ax.set_yticklabels([])
for var, ytick_loc, ytick_label in zip(new_var.values, yticks_locations, yticks_labels):
color = cmap(norm(float(var)))
ax.annotate(ytick_label, xy=(1, ytick_loc), xycoords='data', xytext=(-0.4, ytick_loc),
arrowprops=dict(arrowstyle="-", color=color, lw=1), zorder=0, rotation=90, color=color)
# Add colorbar for y-tick colors
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
cb = ax.figure.colorbar(sm)
# Match the seaborn style
cb.outline.set_visible(False)
I found your problem interesting, and inspired by the unanswered comment above:
How do you change the second colorbar position? For example, one on top the other on bottom sides. - Py-ser
I decided to spend a while doing some tests. After a little digging i find that cbar_kws={"orientation": "horizontal"} is the argument for sns.heatmap that makes the colorbars horizontal.
Borrowing the code from the solution and making some changes, you can format your plot the way you want as in:
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import pandas as pd
import numpy as np
# Create data
df = pd.DataFrame(np.random.random((5,5)), columns=["a","b","c","d","e"])
# Default heatmap
ax = sns.heatmap(df, cbar_kws={"orientation": "horizontal"}, square = False, annot = True)
new_var = pd.DataFrame(np.random.random((5,1)), columns=["new variable"])
# Create the colorbar for y-ticks and labels
norm = plt.Normalize(new_var.min(), new_var.max())
cmap = matplotlib.cm.get_cmap('turbo')
yticks_locations = ax.get_yticks()
yticks_labels = df.index.values
#hide original ticks
ax.tick_params(axis='y', left=False)
ax.set_yticklabels([])
for var, ytick_loc, ytick_label in zip(new_var.values, yticks_locations, yticks_labels):
color = cmap(norm(float(var)))
ax.annotate(ytick_label, xy=(1, ytick_loc), xycoords='data', xytext=(-0.4, ytick_loc),
arrowprops=dict(arrowstyle="-", color=color, lw=1), zorder=0, rotation=90, color=color)
# Add colorbar for y-tick colors
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
cb = ax.figure.colorbar(sm)
# Match the seaborn style
cb.outline.set_visible(False)
Also, you will notice that I listed the values related to each cell in the heatmap, but just out of curiosity to make it clearer to check that everything was working as expected.
I'm still not very happy with the shape/size of the horizontal colorbar, but I'll keep testing and update any progress by editing this answer!
==========================================
EDIT
just to keep track of the updates, first i tried to change just some parameters of seaborn's heatmap function but wouldn't consider this a major improvement on the task... by adding
ax = sns.heatmap(df, cbar_kws = dict(use_gridspec=True, location="top", shrink =0.6), square = True, annot = True)
I end up with:
I did get to separate the colormap using the matplotlib subplot routine and honestly i believe this is the right way given the parameter control that is possible to get here, by:
# Define two rows for subplots
fig, (cax, ax) = plt.subplots(nrows=2, figsize=(5,5.025), gridspec_kw={"height_ratios":[0.025, 1]})
# Default heatmap
ax = sns.heatmap(df, cbar=False, annot = True)
# colorbar
fig.colorbar(ax.get_children()[0], cax=cax, orientation="horizontal")
plt.show()
I obtained:
Which is still not the prettiest graph I've ever made, but now the position and size of the heatmap can be edited normally within the plt.subplots subroutines that give absolute control over these parameters.
How do I add custom legends on seaborn chart? I am new to seaborn and matplotlib, and custom legends with handles and labels are really confusing.
for target in targets:
sns.distplot(target[['sepal length (cm)']], hist=False, rug=True, label = target)
plt.legend(unique_vals)
sns.distplot(target[['sepal length (cm)']==1], hist=False, rug=True)
I have added the above function to create the chart and label. And then I have added one more distribution plot. How do I add label for the second distribution plot?
I think you need to give more details about your program and data, since the below example plots the legend rightly even without fig.legend().
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
targets = []
for i in range(3):
target = np.random.rand(10)
targets.append(target)
fig, ax = plt.subplots()
for i, target in enumerate(targets):
sns.distplot(target, hist=False, rug=True, label=i)
sns.distplot([ i for i in targets[0] if i > 0.3], hist=False, rug=True, label=len(targets))
#fig.legend()
plt.show()
I am working on trying to add Jitter to my plots using seaborn and matplot plots. I am getting mixed information form what I am reading online. Some information is saying coding needs to be done and other information show it as being as simple as jitter = True. I there another library or something that I should be importing that I am not aware of? Below is the code that I am running and trying to add jitter to:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
filename = 'https://library.startlearninglabs.uw.edu/DATASCI410/Datasets/JitteredHeadCount.csv'
headcount_df = pd.read_csv(filename)
headcount_df.describe()
%matplotlib inline
ax = plt.figure(figsize=(12, 6)).gca() # define axis
headcount_df.plot.scatter(x = 'Hour', y = 'TablesOpen', ax = ax, alpha = 0.2)
# auto_price.plot(kind = 'scatter', x = 'city-mpg', y = 'price', ax = ax)
ax.set_title('Hour vs TablesOpen') # Give the plot a main title
ax.set_ylabel('TablesOpen')# Set text for y axis
ax.set_xlabel('Hour')
ax = sns.kdeplot(headcount_df.loc[:, ['TablesOpen', 'Hour']], shade = True, cmap = 'PuBu')
headcount_df.plot.scatter(x = 'Hour', y = 'TablesOpen', ax = ax, jitter = True)
ax.set_title('Hour vs TablesOpen') # Give the plot a main title
ax.set_ylabel('TablesOpen')# Set text for y axis
ax.set_xlabel('Hour')
I receive the error: AttributeError: 'PathCollection' object has no property 'jitter' when trying to add the jitter. Any help or more information on this would be much appreciated
To add jitter to a scatter plot, first get a handle to the collection that contains the scatter dots. When a scatter plot is just created on an ax, ax.collections[-1] will be the desired collection.
Calling get_offsets() on the collection gets all the xy coordinates of the dots. Add some small random number to each of them. As in this case all coordinates are integers, adding a random number between 0 and 1 spreads the dots out evenly.
In this case the number of dots is very huge. To better see where the dots are concentrated, they can be made very small (marker=',', linewidth=0, s=1,) and be very transparent (e.g.alpha=0.1).
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
filename = 'https://library.startlearninglabs.uw.edu/DATASCI410/Datasets/JitteredHeadCount.csv'
headcount_df = pd.read_csv(filename)
fig, ax = plt.subplots(figsize=(12, 6))
headcount_df.plot.scatter(x='Hour', y='TablesOpen', marker=',', linewidth=0, s=1, alpha=.1, color='crimson', ax=ax)
dots = ax.collections[-1]
offsets = dots.get_offsets()
jittered_offsets = offsets + np.random.uniform(0, 1, offsets.shape)
dots.set_offsets(jittered_offsets)
ax.set_title('Hour vs TablesOpen') # Give the plot a main title
ax.set_ylabel('TablesOpen') # Set text for y axis
ax.set_xlabel('Hour')
ax.set_xticks(range(25))
ax.autoscale(enable=True, tight=True)
plt.tight_layout()
plt.show()
As there are a huge number of points, drawing the 2D kde takes a long time. The time can be reduced by taking a random sample from the rows. Note that to draw a 2D kde, the latest versions of Seaborn want each column as a separate parameter.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
filename = 'https://library.startlearninglabs.uw.edu/DATASCI410/Datasets/JitteredHeadCount.csv'
headcount_df = pd.read_csv(filename)
fig, ax = plt.subplots(figsize=(12, 6))
N = 5000
rand_sel_df = headcount_df.iloc[np.random.choice(range(len(headcount_df)), N)]
ax = sns.kdeplot(rand_sel_df['Hour'], rand_sel_df['TablesOpen'], shade=True, cmap='PuBu', ax=ax)
ax.set_title('Hour vs TablesOpen')
ax.set_xticks(range(25))
plt.tight_layout()
plt.show()
I've spent hours on trying to do what I thought was a simple task, which is to add labels onto an XY plot while using seaborn.
Here's my code
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
df_iris=sns.load_dataset("iris")
sns.lmplot('sepal_length', # Horizontal axis
'sepal_width', # Vertical axis
data=df_iris, # Data source
fit_reg=False, # Don't fix a regression line
size = 8,
aspect =2 ) # size and dimension
plt.title('Example Plot')
# Set x-axis label
plt.xlabel('Sepal Length')
# Set y-axis label
plt.ylabel('Sepal Width')
I would like to add to each dot on the plot the text in "species" column.
I've seen many examples using matplotlib but not using seaborn.
Any ideas? Thank you.
One way you can do this is as follows:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
df_iris=sns.load_dataset("iris")
ax = sns.lmplot('sepal_length', # Horizontal axis
'sepal_width', # Vertical axis
data=df_iris, # Data source
fit_reg=False, # Don't fix a regression line
size = 10,
aspect =2 ) # size and dimension
plt.title('Example Plot')
# Set x-axis label
plt.xlabel('Sepal Length')
# Set y-axis label
plt.ylabel('Sepal Width')
def label_point(x, y, val, ax):
a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
for i, point in a.iterrows():
ax.text(point['x']+.02, point['y'], str(point['val']))
label_point(df_iris.sepal_length, df_iris.sepal_width, df_iris.species, plt.gca())
Here's a more up-to-date answer that doesn't suffer from the string issue described in the comments.
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
df_iris=sns.load_dataset("iris")
plt.figure(figsize=(20,10))
p1 = sns.scatterplot(x='sepal_length', # Horizontal axis
y='sepal_width', # Vertical axis
data=df_iris, # Data source
size = 8,
legend=False)
for line in range(0,df_iris.shape[0]):
p1.text(df_iris.sepal_length[line]+0.01, df_iris.sepal_width[line],
df_iris.species[line], horizontalalignment='left',
size='medium', color='black', weight='semibold')
plt.title('Example Plot')
# Set x-axis label
plt.xlabel('Sepal Length')
# Set y-axis label
plt.ylabel('Sepal Width')
Thanks to the 2 other answers, here is a function scatter_text that makes it possible to reuse these plots several times.
import seaborn as sns
import matplotlib.pyplot as plt
def scatter_text(x, y, text_column, data, title, xlabel, ylabel):
"""Scatter plot with country codes on the x y coordinates
Based on this answer: https://stackoverflow.com/a/54789170/2641825"""
# Create the scatter plot
p1 = sns.scatterplot(x, y, data=data, size = 8, legend=False)
# Add text besides each point
for line in range(0,data.shape[0]):
p1.text(data[x][line]+0.01, data[y][line],
data[text_column][line], horizontalalignment='left',
size='medium', color='black', weight='semibold')
# Set title and axis labels
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
return p1
Use the function as follows:
df_iris=sns.load_dataset("iris")
plt.figure(figsize=(20,10))
scatter_text('sepal_length', 'sepal_width', 'species',
data = df_iris,
title = 'Iris sepals',
xlabel = 'Sepal Length (cm)',
ylabel = 'Sepal Width (cm)')
See also this answer on how to have a function that returns a plot:
https://stackoverflow.com/a/43926055/2641825
Below is a solution that does not iterate over rows in the data frame using the dreaded for loop.
There are many issues regarding iterating over a data frame.
The answer is don't iterate! See this link.
The solution below relies on a function (plotlabel) within the petalplot function, which is called by df.apply.
Now, I know readers will comment on the fact that I use scatter and not lmplot, but that is a bit besides the point.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
df_iris=sns.load_dataset("iris")
def petalplot(df):
def plotlabel(xvar, yvar, label):
ax.text(xvar+0.002, yvar, label)
fig = plt.figure(figsize=(30,10))
ax = sns.scatterplot(x = 'sepal_length', y = 'sepal_width', data=df)
# The magic starts here:
df.apply(lambda x: plotlabel(x['sepal_length'], x['sepal_width'], x['species']), axis=1)
plt.title('Example Plot')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
petalplot(df_iris)
Same idea with Scott Boston's answer, however with Seaborn v0.12+, you can leverage seaborn.FacetGrid.apply to add labels on plots and set up your figure in one go:
import seaborn as sns
import pandas as pd
%matplotlib inline
sns.set_theme()
df_iris = sns.load_dataset("iris")
(
sns.lmplot(
data=df_iris,
x="sepal_length",
y="sepal_width",
fit_reg=False,
height=8,
aspect=2
)
.apply(lambda grid: [
grid.ax.text(r["sepal_length"]+.02, r["sepal_width"], r["species"])
for r in df_iris.to_dict(orient="records")
])
.set(title="Example Plot")
.set_axis_labels("Sepal Length", "Sepal Width")
)
Or, if you don't need to use lmplot, also from v0.12, you can use the seaborn.objects interface. This way we don't need to manually iterate over the Iris dataframe nor refer to df_iris or column names sepal_... multiple times.
import seaborn.objects as so
(
so.Plot(df_iris, x="sepal_length", y="sepal_width", text="species")
.add(so.Dot())
.add(so.Text(halign="left"))
.label(title="Example plot", x="Sepal Length", y="Sepal Width")
.layout(size=(20, 10))
)
This produces the below figure:
Use the powerful declarative API to avoid loops (seaborn>=0.12).
Specifically, put x,y, and annotations into a pandas data frame and call plotting.
Here is an example from my own research work.
import seaborn.objects as so
import pandas as pd
df = pd.DataFrame(..,columns=['phase','P(X=1)','text'])
fig,ax = plt.subplots()
p = so.Plot(df,x='phase',y='P(X=1)',text='text').add(so.Dot(marker='+')).add(so.Text(halign='left'))
p.on(ax).show()
I would like to overplot a swarmplot and regplot in seaborn, so that I can have a y=x line through my swarmplot.
Here is my code:
import matplotlib.pyplot as plt
import seaborn as sns
sns.regplot(y=y, x=x, marker=' ', color='k')
sns.swarmplot(x=x_data, y=y_data)
I don't get any errors when I plot, but the regplot never shows on the plot. How can I fix this?
EDIT: My regplot and swarmplot don't overplot and instead, plot in the same frame but separated by some unspecified y amount. If I flip them so regplot is above the call to swarmplot, regplot doesn't show up at all.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
df = pd.DataFrame({"x":x_data,"y":y_data} )
sns.regplot(y="y", x="x", data= df, color='k', scatter_kws={"alpha" : 0.0})
sns.swarmplot(y="y", x="x", data= df)
SECOND EDIT: The double axis solution from below works beautifully!
In principle the approach of plotting a swarmplot and a regplot simulatneously works fine.
The problem here is that you set an empty marker (marker = " "). This destroys the regplot, such that it's not shown. Apparently this is only an issue when plotting several things to the same graph; plotting a single regplot with empty marker works fine.
The solution would be not to specify the marker argument, but instead set the markers invisible by using the scatter_kws argument: scatter_kws={"alpha" : 0.0}.
Here is a complete example:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
## generate some data
n=19; m=9
y_data = []
for i in range(m):
a = (np.random.poisson(lam=0.99-float(i)/m,size=n)+i*.9+np.random.rand(1)*2)
a+=(np.random.rand(n)-0.5)*2
y_data.append(a*m)
y_data = np.array(y_data).flatten()
x_data = np.floor(np.sort(np.random.rand(n*m))*m)
## put them into dataframe
df = pd.DataFrame({"x":x_data,"y":y_data} )
## plotting
sns.regplot(y="y", x="x", data= df, color='k', scatter_kws={"alpha" : 0.0})
sns.swarmplot(x="x", y="y", data= df)
plt.show()
Concerning the edited part of the question:
Since swarmplot is a categorical plot, the axis in the plot still goes from -0.5 to 8.5 and not as the labels suggest from 10 to 18.
A possible workaround is to use two axes and twiny.
fig, ax = plt.subplots()
ax2 = ax.twiny()
sns.swarmplot(x="x", y="y", data= df, ax=ax)
sns.regplot(y="y", x="x", data= df, color='k', scatter_kws={"alpha" : 0.0}, ax=ax2)
ax2.grid(False) #remove grid as it overlays the other plot