Adding outliers to plotly boxplot properly - python

I am building a series of boxplots with pre-calculated data using plotly graph_objects. My problem comes when I need to pass the list of outliers for each plot. I did not find a proper way of sending them.
My code looks like this:
from plotly import graph_objects as go
# Question code: one box trace built from dataframe columns, passing the raw
# samples (y) together with the pre-calculated statistics (mean, quartiles,
# median and fences).
# NOTE(review): `df` is created outside this snippet.
fig = go.Figure()
fig.add_trace(go.Box(x = df.mes, y = df.json_agg, mean = df.media, q1 = df.p25, median = df.mediana, q3 = df.p75, lowerfence = df.li, upperfence = df.ls))
# Date x axis: one tick per month ("M1"), labels formatted as month-year.
fig.update_xaxes(
dtick="M1",
tickformat="%m-%Y",
ticklabelmode="period")
fig.show()
And my final plot:
What I need is for the outliers to be properly shown above or below each boxplot, not side by side.
Thanks, you all help a lot.

I have simulated data to make your code sample work.
Whenever I try passing q3, the plot fails to build.
The parameter you need to show outliers is boxpoints: https://plotly.com/python/box-plots/#styling-outliers
import plotly.graph_objects as go
import pandas as pd
import numpy as np
# Number of simulated observations.
S = 1000
# Simulated raw data: each observation gets a random month-end date out of
# ten months and a random value.
df = pd.DataFrame(
{
"mes": np.random.choice(pd.date_range("1-jan-2021", freq="M", periods=10), S),
"json_agg": np.random.uniform(-0.4, 0.5, S) * np.random.uniform(0.1, 1, S),
}
)
# Compute the summary statistics per month and attach them to every row of
# that month (the values are repeated on each row, not reduced to one row per
# month), then order the rows by month.
df = (
df.groupby("mes", as_index=False)
.apply(
lambda d: d.assign(
media=d["json_agg"].mean(),
p25=np.percentile(d["json_agg"], 25),
p75=np.percentile(d["json_agg"], 75),
mediana=np.percentile(d["json_agg"], 50),
# The fences are simulated as the 20th and 80th percentiles.
li=np.percentile(d["json_agg"], 20),
ls=np.percentile(d["json_agg"], 80),
)
)
.sort_values("mes")
)
fig = go.Figure()
fig.add_trace(
go.Box(
x=df.mes,
y=df.json_agg,
mean=df.media,
q1=df.p25,
# q3 is deliberately left out: the plot failed to build when it was passed.
# NOTE(review): presumably supplying q1, median and q3 together switches
# plotly to its precomputed-quartiles mode, which expects one entry per box
# rather than one per sample row; confirm against the go.Box reference.
# q3=df.p75,
median=df.mediana,
lowerfence=df.li,
upperfence=df.ls,
# Show only the sample points that lie outside the whiskers.
boxpoints="outliers",
)
)
# fig.update_xaxes(dtick="M1", tickformat="%m-%Y", ticklabelmode="period")

Related

"Complex" plotting with plotly.express

What I am trying to do is something like this using plotly.express:
It partly worked, but I wish each part of the bars would be different colors
and that it showed the value in the columns 'CBK_total' and 'Estorno_total'
on each individual part of each bar. Don't know if it's possible.
My code:
# Question code: grouped bar chart of both proportion columns per month,
# coloured by the 'Regra' column.
# NOTE(review): `dados` and `px` are defined outside this snippet.
performance_mes_CBK = px.bar(dados
, x='Ano_Mes_Solicitacao'
, y=['Prop_CBK', 'Prop_Estorno']
, color='Regra'
, barmode='group'
, height=600
, title='Performance Regras')
When asking questions, provide your data as marked-up text, not a screenshot. Doing OCR on data is not straightforward.
This can be achieved using opacity encoded into rgba(), given that marker_color can be a single value or an array.
I have restructured the dataframe to stack the y-values into one column, with another column showing which measure each value belongs to.
You can then use for_each_trace() to update marker_color using the assigned color and the column that has been included in customdata through the use of hover_data.
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.colors
# Simulate the question's data frame (the question only supplied it as an image).
s = 100  # number of simulated rows; every column below is sized from it
dados = pd.DataFrame(
    {
        "Ano_Mes_Solicitacao": np.random.choice(
            pd.date_range("1-oct-2021", freq="MS", periods=4), s
        ),
        "Prop_CBK": np.random.randint(20, 50, s),
        "Prop_Estorno": np.random.randint(20, 50, s),
        # Sized with `s` (previously a hard-coded 100) so that changing `s`
        # cannot produce columns of different lengths.
        "Regra": np.random.choice([0.0, 1.0, 2.0], s).astype(str),
    }
)
# Collapse to one row per (month, rule), summing both measures.
dados = dados.groupby(["Ano_Mes_Solicitacao", "Regra"], as_index=False).sum()
# OP code, from simulated dataframe
performance_mes_CBK = px.bar(
dados,
x="Ano_Mes_Solicitacao",
y=["Prop_CBK", "Prop_Estorno"],
color="Regra",
barmode="group",
height=600,
title="Performance Regras",
)
performance_mes_CBK.show()
# Restructure the dataframe so that the data is stacked: one row per
# (month, rule, measure) instead of one column per measure.
d2 = (
dados.set_index(["Ano_Mes_Solicitacao", "Regra"])
# stack() moves the measure columns into an extra, unnamed index level.
.stack()
.to_frame()
.reset_index()
# "column" holds the measure name and "value" holds its number.
.rename(columns={"level_2": "column", 0: "value"})
)
# utility function to set transparency based on which measure is being displayed
def color_array(t):
    """Return one ``rgba(...)`` colour string per bar of trace ``t``.

    The trace's own ``marker.color`` (expected to be a hex colour string) is
    kept as the base colour. Bars whose measure, read from the first
    ``customdata`` column, equals the measure of the trace's first bar are
    fully opaque; every other bar gets an alpha of 0.6.
    """
    r, g, b = plotly.colors.hex_to_rgb(t.marker.color)
    measures = t.customdata.T[0]
    # Compare scalar with scalar. Comparing each value with the whole first
    # customdata row only has an unambiguous truth value when customdata has
    # exactly one column.
    reference = measures[0]
    return [
        f"rgba({r},{g},{b},{1 if v == reference else .6})" for v in measures
    ]
# use hover_data to create custom data so that measures are identifiable
# update marker_color to use transparency function
fig = px.bar(
d2,
x="Ano_Mes_Solicitacao",
y="value",
color="Regra",
barmode="group",
height=600,
# Including "column" in hover_data makes the measure name available in each
# trace's customdata, which color_array() reads.
hover_data=["column"],
title="Performance Regras",
).for_each_trace(lambda t: t.update(marker_color=color_array(t)))
# Bare expression; presumably meant to display the figure as a notebook cell result.
fig

How to format plotly legend when using marker color?

I want to follow up on this post: Plotly: How to colorcode plotly graph objects bar chart using Python?.
When using plotly express, and specifying 'color', the legend is correctly produced as seen in the post by vestland.
This is my plotly express code:
# Question code: five random x values, five category labels and one random
# integer per bar that drives the colour.
# NOTE(review): `np`, `pd` and `px` are imported outside this snippet.
data = {'x_data': np.random.random_sample((5,)),
'y_data': ['A', 'B', 'C', 'D', 'E'],
'c_data': np.random.randint(1, 100, size=5)
}
df = pd.DataFrame(data=data)
# Horizontal bars coloured by the numeric c_data column on the 'YlOrRd'
# continuous colour scale.
fig = px.bar(df,
x='x_data',
y='y_data',
orientation='h',
color='c_data',
color_continuous_scale='YlOrRd'
)
fig.show()
But when using go.Bar, the legend is incorrectly displayed as illustrated here:
This is my code using graph objects:
# Question code: a single go.Bar trace whose per-bar colours come from c_data
# mapped through the 'YlOrRd' colorscale.
# NOTE(review): `df` and `go` are defined outside this snippet.
bar_trace = go.Bar(name='bar_trace',
x=df['x_data'],
y=df['y_data'],
marker={'color': df['c_data'], 'colorscale': 'YlOrRd'},
orientation='h'
)
# Explicitly request a legend for the figure.
layout = go.Layout(showlegend=True)
fig = go.FigureWidget(data=[bar_trace], layout=layout)
fig.show()
I'm learning how to use FigureWidget, and it seems it can't use plotly express, so I have to learn how to use graph objects to plot. How do I link the legend to the data such that it works like the plotly express example in vestland's post?
This really comes down to understanding what a high-level API (plotly express) does. When you specify color in px and it is categorical, it creates one trace per value of the categorical. Hence the two ways of creating a figure below are mostly equivalent. The legend shows an item for each trace, not for each color.
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
# Ten evenly spaced points, each randomly assigned to one of four categories.
df = pd.DataFrame(
    {
        "x": np.linspace(0, 10, 10),
        "y": np.linspace(5, 15, 10),
        "color": np.random.choice(list("ABCD"), 10),
    }
)

# High-level API: plotly express builds the traces from the "color" column.
px.bar(df, x="x", y="y", color="color", orientation="h").show()

# Low-level equivalent: add one bar trace per category by hand.
fig = go.Figure()
for category, rows in df.groupby("color"):
    fig.add_trace(
        go.Bar(x=rows["x"], y=rows["y"], name=category, orientation="h")
    )
fig
Supplementary, based on comments:
You do not have to use graph objects if you are using FigureWidget(). As demonstrated by the second figure, create the figure with plotly express and then generate a FigureWidget() from it.
For continuous data, the normal pattern is to use a single trace and a colorbar (also demonstrated in the second figure). However, if you want a discrete legend, create a trace per value in c_data and use sample_colorscale() (https://plotly.com/python-api-reference/generated/plotly.colors.html).
import plotly.express as px
import plotly.colors
import plotly.graph_objects as go
import numpy as np
import pandas as pd
# simulate data frame...
df = pd.DataFrame(
    {
        "x_data": np.linspace(0, 10, 10),
        "y_data": np.linspace(5, 15, 10),
        "c_data": np.random.randint(0, 4, 10),
    }
)

# Figure 1: discrete legend. One bar trace per distinct c_data value, coloured
# by sampling the continuous "YlOrRd" scale at that value's position relative
# to the largest c_data value. Each trace is named after its c_data value so
# the legend entries can be told apart (previously every trace was named
# "bar_trace").
c_max = df["c_data"].max()
bar_traces = [
    go.Bar(
        name=str(c),
        x=d["x_data"],
        y=d["y_data"],
        marker={
            "color": plotly.colors.sample_colorscale(
                "YlOrRd",
                d["c_data"] / c_max,
            )
        },
        orientation="h",
    )
    for c, d in df.groupby("c_data")
]
layout = go.Layout(showlegend=True)
fig = go.FigureWidget(data=bar_traces, layout=layout)
fig.show()

# Figure 2: continuous colour. Build the figure with plotly express, then wrap
# its data and layout in a FigureWidget.
fig = px.bar(
    df,
    x="x_data",
    y="y_data",
    color="c_data",
    orientation="h",
    color_continuous_scale="YlOrRd",
)
fig = go.FigureWidget(data=fig.data, layout=fig.layout)
fig.show()

Adding counts to Plotly boxplots

I have a relatively simple issue, but cannot find any answer online that addresses it. Starting from a simple boxplot:
import plotly.express as px
# Iris sample data set provided by plotly express.
df = px.data.iris()
# One box per species showing the distribution of sepal_length.
fig = px.box(
df, x='species', y='sepal_length'
)
# Number of rows per species; this is the count the question wants displayed.
val_counts = df['species'].value_counts()
I would now like to add val_counts (in this dataset, 50 for each species) to the plots, preferably on either of the following places:
On top of the median line
On top of the max/min line
Inside the hoverbox
How can I achieve this?
The snippet below will set count = 50 for all unique values of df['species'] on top of the max line using fig.add_annotation like this:
# Label each box with its number of observations, shifted 10 px above the
# largest sepal_length of that species.
for species in df['species'].unique():
    sepal_lengths = df.loc[df['species'] == species, 'sepal_length']
    fig.add_annotation(
        x=species,
        y=sepal_lengths.max(),
        text=str(len(sepal_lengths)),
        yshift=10,
        showarrow=False,
    )
Plot:
Complete code:
import plotly.express as px
df = px.data.iris()
fig = px.box(df, x='species', y='sepal_length')

# Label each box with its number of observations, shifted 10 px above the
# largest sepal_length of that species.
for species in df['species'].unique():
    sepal_lengths = df.loc[df['species'] == species, 'sepal_length']
    fig.add_annotation(
        x=species,
        y=sepal_lengths.max(),
        text=str(len(sepal_lengths)),
        yshift=10,
        showarrow=False,
    )

f = fig.full_figure_for_development(warn=False)
fig.show()
Using the same approach that I presented in this answer: Change Plotly Boxplot Hover Data
Calculate all the measures a box plot calculates, plus the additional measure you want: count.
Overlay bar traces over the box plot traces so that the hover has all the required measures.
import plotly.express as px
df = px.data.iris()

# Summarise sepal_length per species with the measures a box plot shows, plus
# the observation count; "y" is the min-to-max span used as the bar height.
df2 = (
    df.groupby("species")
    .agg(
        max=("sepal_length", "max"),
        q75=("sepal_length", lambda s: s.quantile(0.75)),
        median=("sepal_length", "median"),
        q25=("sepal_length", lambda s: s.quantile(0.25)),
        min=("sepal_length", "min"),
        count=("sepal_length", "count"),
    )
    .reset_index()
    .assign(y=lambda d: d["max"] - d["min"])
)

# Overlay: nearly transparent bars spanning min..max carry the hover with all
# summary columns (except the helper "y" and "species"); the real box traces
# are added on top.
px.bar(
    df2,
    x="species",
    y="y",
    base="min",
    hover_data={c: c not in ["y", "species"] for c in df2.columns},
    hover_name="species",
).update_traces(opacity=0.1).add_traces(
    px.box(df, x="species", y="sepal_length").data
)

Set up multiple subplots with moving averages using cufflinks and plotly offline

I'm trying to select 4 different product prices from my dataframe and plot their moving averages as subplots in a 2x2 grid using plotly cufflinks. I would appreciate it if anyone could guide me on this.
I tried plotting the price as below.
I came across cufflinks technical analysis, which allows me to plot moving averages in a cleaner way; however, I'm not too sure how to apply it yet.
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
from plotly import tools
import plotly.graph_objs as go
# Question code: one line trace per product, all using the Date column as x.
# NOTE(review): `df` is defined outside this snippet; the first trace's name
# is ',milk' (with a leading comma), which looks like a typo.
trace1= go.Scatter(name=',milk', x=df.Date, y=df['milk'])
trace2= go.Scatter(name='soap', x=df.Date, y=df['soap'])
trace3= go.Scatter(name='rice', x=df.Date, y=df['rice'])
trace4= go.Scatter(name='water', x=df.Date, y=df['water'])
# 2x2 grid of subplots with one title per product.
fig = tools.make_subplots(rows=2, cols=2, subplot_titles=('milk', 'soap',
'rice', 'water'))
# Place each trace in its own (row, column) cell.
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig.append_trace(trace3, 2, 1)
fig.append_trace(trace4, 2, 2)
fig['layout'].update(height=1000, width=1800, title='supermarket')
# Render the figure to the given HTML file.
plot(fig, filename='supermarket.html')
I would appreciate if someone could teach me how to use plotly cufflinks to plot four moving averages from the selected columns from a dataframe using plotly offline.
Insert the code section below in a Jupyter Notebook to produce the following plot using cufflinks and plotly offline:
Plot:
Code:
# imports
import plotly
from plotly import tools
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import pandas as pd
import numpy as np
from IPython.core.display import display, HTML
import copy
import plotly.graph_objs as go
####### PART 1 - SETUP AND SAMPLE DATA #######
# setup
# Inject CSS into the notebook page (container width and select-widget background).
display(HTML("<style>.container { width:55% !important; } .widget-select > select {background-color: gainsboro;}</style>"))
init_notebook_mode(connected=True)
# Fixed seed; presumably so that the generated sample data is reproducible.
np.random.seed(123)
cf.set_config_file(theme='pearl')
# Random data using cufflinks
# Keep the first four generated columns and rename them.
df = cf.datagen.lines().iloc[:,0:4]
df.columns = ['StockA', 'StockB', 'StockC', 'StockD']
####### PART 2 - FUNCTION FOR MOVING AVERAGES #######
# Function for moving averages
def movingAvg(df, win, keepSource):
    """Add a simple moving average for every column of a dataframe.

    Arguments:
    df -- pandas dataframe; it is not modified
    win -- length of the moving-average estimation window, in rows
    keepSource -- True to keep the source columns in the output dataframe,
                  False to return only the moving-average columns

    Returns a new dataframe that holds, for each source column ``name``, a
    column ``name_MA`` with its rolling mean. The leading rows whose window is
    not yet full (and whose average is therefore NaN) are removed.
    """
    df_temp = df.copy()
    # Snapshot the source names first: the loop below adds new columns.
    sourceNames = list(df_temp.columns)
    for col in sourceNames:
        df_temp[col + '_MA'] = df[col].rolling(window=win).mean()
    # A window of `win` rows is first complete at row position win - 1, so
    # only the win - 1 rows before it lack an estimate. (Slicing with
    # iloc[win:] also threw away the first valid estimate.)
    df_temp = df_temp.iloc[max(win - 1, 0):]
    if not keepSource:
        # Use the `columns=` keyword: the positional axis argument,
        # drop(names, 1), is rejected by pandas >= 2.0.
        df_temp = df_temp.drop(columns=sourceNames)
    return df_temp
# Add moving averages to df
windowLength = 10
df = movingAvg(df=df, win=windowLength, keepSource=True)

####### PART 3 - PLOTLY RULES #######
# One figure per stock, each holding the price line and its moving average.
# Built in a loop instead of four hand-copied trace/figure definitions.
stockNames = ['StockA', 'StockB', 'StockC', 'StockD']
stockFigures = []
for stock in stockNames:
    priceTrace = go.Scatter(x=df.index, y=df[stock], name=stock)
    averageTrace = go.Scatter(
        x=df.index,
        y=df[stock + '_MA'],
        name=stock + '_MA'
    )
    stockFigures.append(go.Figure(data=[priceTrace, averageTrace]))

# Subplots setup and layout: arrange the four figures in a 2x2 grid.
figs = cf.subplots(stockFigures, shape=(2, 2))
figs['layout'].update(height=800, width=1200,
                      title='Stocks with moving averages = ' + str(windowLength))
iplot(figs)

python plotly: box plot using column in dataframe

I am enjoying using plotly and wanted to plot boxplots for my data.
From their website, I do the following:
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np
# Question code: two samples of 50 standard-normal draws; the second one is
# shifted up by 1.
y0 = np.random.randn(50)
y1 = np.random.randn(50)+1
# One box trace per sample, each with its own name and marker colour.
trace0 = go.Box(
y=y0,
name = 'Sample A',
marker = dict(
color = 'rgb(214, 12, 140)',
)
)
trace1 = go.Box(
y=y1,
name = 'Sample B',
marker = dict(
color = 'rgb(0, 128, 128)',
)
)
# The traces are listed by hand here, which is what the question wants to avoid.
data = [trace0, trace1]
py.iplot(data)
The challenge I have is that the total number of traces is unknown. For example:
titanic = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv")
I would like to plot a boxplot of the 'fare' column, grouped by the 'embarked' column. Since the total number of unique values in 'embarked' is unknown, I do not want to hardcode that in.
Does anyone know how I can do this properly in plotly?
Thank you!
You could loop over your unique values in embarked and add a trace for each one. In this case there is also nan which needs separate treatment.
for embarked in titanic.embarked.unique():
import plotly
import pandas as pd
import numpy as np

plotly.offline.init_notebook_mode()

titanic = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv")

# One box trace of 'fare' per distinct 'embarked' value, including one for the
# rows where 'embarked' is missing.
traces = []
for embarked in titanic.embarked.unique():
    if pd.isna(embarked):
        # NaN never compares equal to itself, so missing values have to be
        # selected with isna() rather than ==.
        selected = titanic.embarked.isna()
    else:
        selected = titanic.embarked == embarked
    traces.append(
        plotly.graph_objs.Box(y=titanic.loc[selected, "fare"], name=str(embarked))
    )
plotly.offline.iplot(traces)

Categories