Adding counts to Plotly boxplots - python

I have a relatively simple issue, but cannot find any answer online that addresses it. Starting from a simple boxplot:
import plotly.express as px
df = px.data.iris()
fig = px.box(
df, x='species', y='sepal_length'
)
val_counts = df['species'].value_counts()
I would now like to add val_counts (in this dataset, 50 for each species) to the plots, preferably on either of the following places:
On top of the median line
On top of the max/min line
Inside the hoverbox
How can I achieve this?

The snippet below will set count = 50 for all unique values of df['species'] on top of the max line using fig.add_annotation like this:
# Label every box with its number of observations, just above the box's
# maximum value.
#
# A single groupby pass yields both the annotation height (max) and the row
# count, instead of re-filtering the whole frame twice per species.
# sort=False keeps the species in order of first appearance, the same order
# df.species.unique() produces.
species_stats = df.groupby('species', sort=False)['sepal_length'].agg(
    top='max',   # y position of the label
    n='size',    # rows per species (the count to display)
)
for row in species_stats.itertuples():
    fig.add_annotation(
        x=row.Index,         # category position on the x axis
        y=row.top,
        text=str(row.n),
        yshift=10,           # nudge the label 10 px above the max line
        showarrow=False,
    )
Plot:
Complete code:
import plotly.express as px

df = px.data.iris()
fig = px.box(df, x='species', y='sepal_length')

# Label every box with its number of observations, just above the box's
# maximum value. One groupby pass provides both the label height (max) and
# the row count; sort=False keeps species in order of first appearance.
species_stats = df.groupby('species', sort=False)['sepal_length'].agg(
    top='max',
    n='size',
)
for row in species_stats.itertuples():
    fig.add_annotation(
        x=row.Index,
        y=row.top,
        text=str(row.n),
        yshift=10,           # nudge the label 10 px above the max line
        showarrow=False,
    )

# NOTE: the earlier call to fig.full_figure_for_development(warn=False) was
# removed; its result was never used, so it was not needed to draw the plot.
fig.show()

Using the same approach that I presented in this answer: Change Plotly Boxplot Hover Data
Calculate all the measures a box plot calculates, plus the additional measure you want (count).
Overlay bar traces on the box plot traces so that the hover shows all the required measures.
import plotly.express as px

df = px.data.iris()

# Summarize sepal_length per species with the statistics a box plot shows
# (max, upper quartile, median, lower quartile, min) plus the row count.
# Named aggregation keeps the output columns in exactly this order.
df2 = (
    df.groupby("species")
    .agg(
        max=("sepal_length", "max"),
        q75=("sepal_length", lambda s: s.quantile(0.75)),
        median=("sepal_length", "median"),
        q25=("sepal_length", lambda s: s.quantile(0.25)),
        min=("sepal_length", "min"),
        count=("sepal_length", "count"),
    )
    .reset_index()
    # Bar height: each bar starts at "min" (see base= below) and spans to "max".
    .assign(y=lambda d: d["max"] - d["min"])
)

# Overlay a nearly transparent bar on each box so hovering the bar shows all
# summary columns. "y" and "species" are hidden from the hover body because
# "y" is only a drawing helper and "species" is already the hover title.
fig = (
    px.bar(
        df2,
        x="species",
        y="y",
        base="min",
        hover_data={c: c not in ("y", "species") for c in df2.columns},
        hover_name="species",
    )
    .update_traces(opacity=0.1)
    .add_traces(px.box(df, x="species", y="sepal_length").data)
)
# Explicit show(): a bare expression only renders inside a notebook.
fig.show()

Related

How to show only color coding in the legend of my plotly scatterplot in python

I'm plotting some PCAs with the plotly.express scatter function, coding the samples by region (color) and breed (symbol). When I plot it, the legend shows all 67 different breeds in their combinations of symbols and colors. Is there a way to show only the color categories instead?
My data looks like this:
PC1
PC2
PC3
Breed
Region
Sample1
value
value
value
breed1
Region1
Sample2
value
value
value
breed2
Region1
Sample3
value
value
value
breed3
Region2
Sample4
value
value
value
breed1
Region1
Right now my code is just the basic command:
fig=px.scatter(pca, x="PC2",y="PC1", color="Region", symbol="Breed", labels={
"PC2":"PC2-{}%".format(eigen[1]),
"PC1":"PC1-{}%".format(eigen[0])
})
fig.layout.update(showlegend=True)
fig['layout']['height'] = 800
fig['layout']['width'] = 800
fig.show()
Any ideas?
You can add these lines:
# Collapse the "<region>, <breed>" legend into a single entry per region.
#
# The first trace seen for each region keeps its legend entry; every later
# trace of that region is hidden from the legend. The earlier version also
# required that first trace to use the 'circle' symbol, which silently left a
# region without any legend entry when none of its traces used a circle.
# NOTE: the legend swatch therefore shows the symbol of that first trace.
seen_regions = set()
for trace in fig.data:
    # Trace names are split on "," and the first part is used as the region.
    region = trace.name.split(",")[0]
    trace.name = region
    # Share one legend group per region so clicking the entry toggles all of
    # that region's traces, not just the one that owns the entry.
    trace.legendgroup = region
    trace.showlegend = region not in seen_regions
    seen_regions.add(region)

fig.update_layout(legend_title="region")
fig.show()
Before adding the lines of code:
After adding the code:
I used this dataframe:
import plotly.express as px
df = px.data.medals_long()
fig = px.scatter(df, y="nation", x="count", color="medal", symbol="count")
Specifying both color and symbol results in a legend that contains every combination of the values in the two columns.
To have the legend show only the values of one column, use just color.
To still represent the second column as symbols, update each trace to use a list of symbols (one per point).
I have synthesized data based on your description.
full code
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.validators.scatter.marker import SymbolValidator
eigen = [0.5, 0.7]
# simulate data
n = 1000
pca = pd.DataFrame(
{
**{f"PC{c}": np.random.uniform(1, 5, n) for c in range(1, 4, 1)},
**{
"Breed": np.random.choice([f"breed{x}" for x in range(67)], n),
"Region": np.random.choice([f"Region{x}" for x in range(10)], n),
},
}
)
# just color by Region
fig = px.scatter(
pca,
x="PC2",
y="PC1",
color="Region",
labels={"PC2": "PC2-{}%".format(eigen[1]), "PC1": "PC1-{}%".format(eigen[0])},
)
# build dict that maps as Breed to a symbol
symbol_map = {
t: s for t, s in zip(np.sort(pca["Breed"].unique()), SymbolValidator().values[2::3])
}
# For each trace, replace the single marker symbol with a per-point list of
# symbols derived from Breed.
#
# Traces are matched to their Region by trace name rather than by position:
# groupby() iterates regions in sorted order, and that is not guaranteed to
# be the order in which the traces appear in fig.data, so zipping the two
# could hand one region's symbols to another region's trace.
symbols_by_region = {
    region: [symbol_map[breed] for breed in breeds]
    for region, breeds in pca.groupby("Region")["Breed"]
}
for t in fig.data:
    # assumes each trace is named after its Region value (color="Region")
    t.update(marker_symbol=symbols_by_region[t.name])
fig

"Complex" plotting with plotly.express

What I am trying to do is something like this using plotly.express:
It partly worked, but I wish each part of the bars would be different colors
and that it showed the value in the columns 'CBK_total' and 'Estorno_total'
on each individual part of each bar. Don't know if it's possible.
My code:
performance_mes_CBK = px.bar(dados
, x='Ano_Mes_Solicitacao'
, y=['Prop_CBK', 'Prop_Estorno']
, color='Regra'
, barmode='group'
, height=600
, title='Performance Regras')
When asking questions, provide your data as marked-up text, not a screenshot. Doing OCR on data is not straightforward.
This can be achieved by encoding opacity into rgba() colors, given that marker_color can be either a single value or an array.
I have restructured the dataframe to stack the y-values into one column, with another column showing which measure each value is.
You can then use for_each_trace() to update marker_color from the assigned color and the measure column, which is included in customdata through hover_data.
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.colors
# simulate data frame... data as images in questions is unusable
s = 100
dados = pd.DataFrame(
{
"Ano_Mes_Solicitacao": np.random.choice(
pd.date_range("1-oct-2021", freq="MS", periods=4), s
),
"Prop_CBK": np.random.randint(20, 50, s),
"Prop_Estorno": np.random.randint(20, 50, s),
"Regra": np.random.choice([0.0, 1.0, 2.0], 100).astype(str),
}
)
dados = dados.groupby(["Ano_Mes_Solicitacao", "Regra"], as_index=False).sum()
# OP code, from simulated dataframe
performance_mes_CBK = px.bar(
dados,
x="Ano_Mes_Solicitacao",
y=["Prop_CBK", "Prop_Estorno"],
color="Regra",
barmode="group",
height=600,
title="Performance Regras",
)
performance_mes_CBK.show()
# restruct dataframe so that data is stacked
d2 = (
dados.set_index(["Ano_Mes_Solicitacao", "Regra"])
.stack()
.to_frame()
.reset_index()
.rename(columns={"level_2": "column", 0: "value"})
)
# utility function to set transparency based on which measure is being displayed
def color_array(t):
    """Return one ``rgba()`` color string per bar of trace *t*.

    The trace's own hex color is reused for every bar; only the alpha
    differs. Bars whose measure (first ``customdata`` column) equals the
    measure of the trace's first bar are fully opaque (alpha 1), all other
    bars get alpha 0.6.

    Args:
        t: A bar trace whose ``marker.color`` is a single hex color string
            and whose ``customdata`` holds the measure name in column 0.

    Returns:
        A list of ``"rgba(r,g,b,a)"`` strings, one per row of ``customdata``.
    """
    r, g, b = plotly.colors.hex_to_rgb(t.marker.color)
    measures = t.customdata.T[0]
    # Compare against the scalar measure of the first bar. Comparing against
    # the whole first row (t.customdata[0]) only works while customdata has
    # exactly one column; with more columns the truth value of the resulting
    # array is ambiguous and raises.
    reference = measures[0] if len(measures) else None
    return [
        f"rgba({r},{g},{b},{1 if v == reference else .6})"
        for v in measures
    ]
# use hover_data to create custom data so that measures are identifable
# update marker_color to use transparency function
fig = px.bar(
d2,
x="Ano_Mes_Solicitacao",
y="value",
color="Regra",
barmode="group",
height=600,
hover_data=["column"],
title="Performance Regras",
).for_each_trace(lambda t: t.update(marker_color=color_array(t)))
fig

How to highlight a single data point on a scatter plot using plotly express

In a scatter plot created using px.scatter, how do I mark one data point with a red star?
fig = px.scatter(df, x="sepal_width", y="sepal_length")
# Now set a single data point to color="red", symbol="star".
This isn't really highlighting an already existing data point within a trace you've already produced, but rather adding another one with a different visual appearance. But it does exactly what you're looking for:
fig.add_trace(go.Scatter(x=[3.5], y=[6.5], mode = 'markers',
marker_symbol = 'star',
marker_size = 15))
Plot:
Complete code:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
df = px.data.iris() # iris is a pandas DataFrame
fig = px.scatter(df, x="sepal_width", y="sepal_length")
fig.add_trace(go.Scatter(x=[3.5], y=[6.5], mode = 'markers',
marker_symbol = 'star',
marker_size = 15))
fig.show()
This directly modifies the Scatter trace's Marker itself:
import plotly.express as px
df = px.data.iris()
fig = px.scatter(df, x="sepal_width", y="sepal_length")
# Take the first trace of the figure; its marker is edited in place below.
trace = next(fig.select_traces())
# Modify kth point.
# Build one entry per point for color, size and symbol: every point keeps the
# trace's current value (or size 8), and only index k is overridden.
n = len(trace.x)  # number of points in the trace
k = 136  # index of the point to highlight; must be < n
color = [trace.marker.color] * n
color[k] = "red"
size = [8] * n
size[k] = 15
symbol = [trace.marker.symbol] * n
symbol[k] = "star"
# Update trace.
# Assigning lists instead of scalars styles each point individually.
trace.marker.color = color
trace.marker.size = size
trace.marker.symbol = symbol
# Alternatively, call:
# fig.update_traces(marker=dict(color=color, size=size, symbol=symbol))
fig.show()

Plotly: How to annotate end of multiple lines with text and marker colors that match the lines?

The post Plotly: Annotate marker at the last value in line chart
shows how to annotate end of lines with text and an individual marker. But how can you do the same thing for multiple lines and at the same time set the associated text and markers to match the color of all lines?
Example plot:
Code with sample dataset:
# imports
import pandas as pd
import plotly.express as px
# data
df = px.data.stocks()
colors = px.colors.qualitative.T10
# plotly
fig = px.line(df,
x = 'date',
y = [c for c in df.columns if c != 'date'],
template = 'plotly_dark',
color_discrete_sequence = colors,
title = 'Stocks',
)
fig.show()
You can address the features of each trace and build new traces for your end markers and text through:
for i, d in enumerate(fig.data):
fig.add_scatter(x=[d.x[-1]], y = [d.y[-1]], [...])
If you've specified colors when building your figure you can also retrieve trace colors and set colors for markers and fonts like this:
textfont = dict(color=d.line.color),
marker = dict(color = d.line.color, size = 12)
Plot:
The figure was a bit crowded, so I dropped one of the stocks. I also made room for the annotations by changing the position of the legend through fig.layout.legend.x = -0.3
Complete code:
# imports
import pandas as pd
import plotly.express as px
# data
df = px.data.stocks()
df = df.drop('AMZN', axis = 1)
colors = px.colors.qualitative.T10
# plotly
fig = px.line(df,
x = 'date',
y = [c for c in df.columns if c != 'date'],
template = 'plotly_dark',
color_discrete_sequence = colors,
title = 'Stocks',
)
# move legend
fig.layout.legend.x = -0.3
# For every line, add a one-point scatter trace at the line's last sample
# that shows a marker and the final value as text, both in the line's color.
for line_trace in fig.data:
    last_x = line_trace.x[-1]
    last_y = line_trace.y[-1]
    line_color = line_trace.line.color
    fig.add_scatter(
        x=[last_x],
        y=[last_y],
        mode='markers+text',
        text=last_y,
        textfont=dict(color=line_color),
        textposition='middle right',
        marker=dict(color=line_color, size=12),
        # Same legend group as the line, but without a legend entry of its own.
        legendgroup=line_trace.name,
        showlegend=False,
    )
fig.show()

how to plot a range with a line in the center with Plotly, in Python [duplicate]

How can I use Plotly to produce a line plot with a shaded standard deviation? I am trying to achieve something similar to seaborn.tsplot. Any help is appreciated.
The following approach is fully flexible with regard to the number of columns in a pandas dataframe and uses the default color cycle of plotly. If the number of lines exceeds the number of colors, the colors will be reused from the start. As of now, px.colors.qualitative.Plotly can be replaced with any hex color sequence that you can find using px.colors.qualitative:
Alphabet = ['#AA0DFE', '#3283FE', '#85660D', '#782AB6', '#565656', '#1...
Alphabet_r = ['#FA0087', '#FBE426', '#B00068', '#FC1CBF', '#C075A6', '...
[...]
Complete code:
# imports
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd
import numpy as np
# sample data in a pandas dataframe
np.random.seed(1)
df=pd.DataFrame(dict(A=np.random.uniform(low=-1, high=2, size=25).tolist(),
B=np.random.uniform(low=-4, high=3, size=25).tolist(),
C=np.random.uniform(low=-1, high=3, size=25).tolist(),
))
df = df.cumsum()
# define colors as a list
colors = px.colors.qualitative.Plotly
# convert plotly hex colors to rgba to enable transparency adjustments
def hex_rgba(hex, transparency):
    """Convert a hex color string to an ``(r, g, b, alpha)`` tuple.

    Args:
        hex: Color as ``'#RRGGBB'`` or shorthand ``'#RGB'``; the leading
            ``#`` is optional. Digits after the sixth are ignored. (The
            parameter name shadows the builtin ``hex``; it is kept so that
            existing keyword callers continue to work.)
        transparency: Alpha value, appended unchanged as the fourth element.

    Returns:
        Tuple ``(red, green, blue, transparency)`` where the three color
        channels are ints in the range 0-255.

    Raises:
        ValueError: If the string is too short or contains non-hex digits.
    """
    col_hex = hex.lstrip('#')
    # Expand CSS-style shorthand: 'abc' -> 'aabbcc'.
    if len(col_hex) == 3:
        col_hex = ''.join(digit * 2 for digit in col_hex)
    if len(col_hex) < 6:
        raise ValueError(f"expected a 3- or 6-digit hex color, received {hex!r}")
    channels = tuple(int(col_hex[i:i + 2], 16) for i in (0, 2, 4))
    return channels + (transparency,)
rgba = [hex_rgba(c, transparency=0.2) for c in colors]
colCycle = ['rgba'+str(elem) for elem in rgba]
# Make sure the colors run in cycles if there are more lines than colors
def next_col(cols):
    """Yield the items of *cols* endlessly, restarting after the last one.

    Args:
        cols: Iterable of values (here: color strings) to cycle through.

    Yields:
        The items of *cols* in order, repeated indefinitely. If *cols* is
        empty the generator simply ends, rather than spinning forever
        without ever producing a value.
    """
    import itertools

    yield from itertools.cycle(cols)
line_color=next_col(cols=colCycle)
# plotly figure
fig = go.Figure()
# add line and shaded area for each series and standards deviation
for col in df:
    new_col = next(line_color)
    x = list(df.index.values+1)
    y1 = df[col]
    # Standard deviation of the whole column, computed once per column
    # instead of once per data point.
    col_std = np.std(df[col])
    y1_upper = [(y + col_std) for y in df[col]]
    # The lower bound is reversed so that upper + lower traces the outline
    # of the band: left-to-right along the top, right-to-left along the bottom.
    y1_lower = [(y - col_std) for y in df[col]][::-1]

    # standard deviation area
    fig.add_traces(go.Scatter(x=x+x[::-1],
                              y=y1_upper+y1_lower,
                              fill='tozerox',
                              fillcolor=new_col,
                              # fully transparent outline: only the fill is visible
                              line=dict(color='rgba(255,255,255,0)'),
                              showlegend=False,
                              name=col))

    # line trace
    fig.add_traces(go.Scatter(x=x,
                              y=y1,
                              line=dict(color=new_col, width=2.5),
                              mode='lines',
                              name=col)
                   )
# set x-axis
fig.update_layout(xaxis=dict(range=[1,len(df)]))
fig.show()
I was able to come up with something similar. I post the code here to be used by someone else or for any suggestions for improvements.
import matplotlib
import random
import plotly.graph_objects as go
import numpy as np
#random color generation in plotly
# Lookup tables built from matplotlib's named colors:
#   hex_colors_dic  - color name -> hex code
#   hex_colors_only - the hex codes alone, in the same order
#   rgb_colors_dic  - color name -> result of matplotlib.colors.to_rgb
# Built without a loop variable named `hex`, which shadowed the builtin.
hex_colors_dic = dict(matplotlib.colors.cnames)
hex_colors_only = list(hex_colors_dic.values())
rgb_colors_dic = {
    name: matplotlib.colors.to_rgb(hex_code)
    for name, hex_code in hex_colors_dic.items()
}
data = [[1, 3, 5, 4],
[2, 3, 5, 4],
[1, 1, 4, 5],
[2, 3, 5, 4]]
#calculating mean and standard deviation
mean=np.mean(data,axis=0)
std=np.std(data,axis=0)
#draw figure
fig = go.Figure()
c = random.choice(hex_colors_only)
fig.add_trace(go.Scatter(x=np.arange(4), y=mean+std,
mode='lines',
line=dict(color=c,width =0.1),
name='upper bound'))
fig.add_trace(go.Scatter(x=np.arange(4), y=mean,
mode='lines',
line=dict(color=c),
fill='tonexty',
name='mean'))
fig.add_trace(go.Scatter(x=np.arange(4), y=mean-std,
mode='lines',
line=dict(color=c, width =0.1),
fill='tonexty',
name='lower bound'))
fig.show()
Great custom responses posted by others. In case someone is interested in code from the official plotly website, see here: https://plotly.com/python/continuous-error-bars/
I wrote a function that extends plotly.express.line while keeping the same high-level interface as Plotly Express. The line function (source code below) is used in exactly the same way as plotly.express.line, but allows for continuous error bands through the argument error_y_mode, which can be either 'band' or 'bar'. In the second case it produces the same result as the original plotly.express.line. Here is a usage example:
import plotly.express as px

df = px.data.gapminder().query('continent=="Americas"')
# .copy() makes the filtered frame independent, so adding a column below does
# not write into a slice of the query result.
df = df[df['country'].isin({'Argentina','Brazil','Colombia'})].copy()
df['lifeExp std'] = df['lifeExp']*.1 # Invent some error data...

# A tuple rather than a set, so the two figures are always shown in this order.
for error_y_mode in ('band', 'bar'):
    fig = line(
        data_frame = df,
        x = 'year',
        y = 'lifeExp',
        error_y = 'lifeExp std',
        error_y_mode = error_y_mode, # Here you say `band` or `bar`.
        color = 'country',
        title = f'Using error {error_y_mode}',
        markers = True, # boolean switch: draw a marker on every data point
    )
    fig.show()
which produces the following two plots:
The source code of the line function that extends plotly.express.line is this:
import plotly.express as px
import plotly.graph_objs as go
def line(error_y_mode=None, **kwargs):
    """Extension of `plotly.express.line` to use error bands.

    Args:
        error_y_mode: How to draw the y error. ``'bar'``/``'bars'``/``None``
            returns the plain ``px.line`` figure; ``'band'``/``'bands'``
            replaces the error bars with a shaded band around each line.
        **kwargs: Passed through to ``plotly.express.line``. In band mode
            ``error_y`` is required.

    Returns:
        The figure produced by ``px.line``; in band mode with one extra
        filled trace per line, placed directly before its line.

    Raises:
        ValueError: If ``error_y_mode`` is not a supported value, or band
            mode is requested without ``error_y``.
    """
    ERROR_MODES = {'bar','band','bars','bands',None}
    if error_y_mode not in ERROR_MODES:
        raise ValueError(f"'error_y_mode' must be one of {ERROR_MODES}, received {repr(error_y_mode)}.")
    if error_y_mode in {'bar','bars',None}:
        return px.line(**kwargs)

    # --- band mode ---
    if 'error_y' not in kwargs:
        raise ValueError("If you provide argument 'error_y_mode' you must also provide 'error_y'.")
    # Built only to obtain each line's error arrays.
    figure_with_error_bars = px.line(**kwargs)
    # The figure that is returned carries no error arguments at all, so no
    # error bars are drawn next to the bands. Previously only 'error_y' was
    # dropped, which let a caller-supplied 'error_y_minus' through.
    # NOTE(review): confirm against px.line how a lone error_y_minus was handled.
    fig = px.line(**{
        arg: val for arg, val in kwargs.items()
        if arg not in ('error_y', 'error_y_minus')
    })
    line_traces = fig.data  # snapshot taken before any band is added
    for data in figure_with_error_bars.data:
        x = list(data['x'])
        error_plus = data['error_y']['array']
        error_minus = data['error_y']['arrayminus']
        if error_minus is None:
            # symmetric error: reuse the upper error for the lower bound
            error_minus = error_plus
        y_upper = list(data['y'] + error_plus)
        y_lower = list(data['y'] - error_minus)
        # assumes the line color is a '#RRGGBB' hex string, as the original did
        hex_digits = data['line']['color'].lstrip('#')
        r, g, b = (int(hex_digits[i:i+2], 16) for i in (0, 2, 4))
        color = f"rgba({r},{g},{b},.3)"
        fig.add_trace(
            go.Scatter(
                # closed outline: along the upper bound, back along the lower
                x = x+x[::-1],
                y = y_upper+y_lower[::-1],
                fill = 'toself',
                fillcolor = color,
                line = dict(
                    color = 'rgba(255,255,255,0)'
                ),
                hoverinfo = "skip",
                showlegend = False,
                legendgroup = data['legendgroup'],
                xaxis = data['xaxis'],
                yaxis = data['yaxis'],
            )
        )
    # Reorder data as said here: https://stackoverflow.com/a/66854398/8849755
    # Interleave as band, line, band, line, ... so each band comes directly
    # before the line it belongs to.
    band_traces = fig.data[len(line_traces):]
    fig.data = tuple(
        trace
        for band, line_trace in zip(band_traces, line_traces)
        for trace in (band, line_trace)
    )
    return fig

Categories