Python plot category on a day axis - python

Hello everyone. I have a dataset with Date, Category, and Quantity columns. I want to plot both date and category on the x-axis and quantity on the y-axis, that is, a plot of Quantity vs. Category for each day in the data frame.

The question is tagged plotly, hence a Plotly answer.
It uses this documented approach: https://plotly.com/python/categorical-axes/#multicategorical-axes
I have simulated a dataframe that has the same structure as the image in your question.
I have deliberately used strings for the dates and for the interval index.
import pandas as pd
import numpy as np
import plotly.graph_objects as go
# Simulate the dataframe shown in the question: 11 days x 50 numbered items,
# each with a random quantity.
days = pd.date_range("7-feb-2022", "17-feb-2022")
item_ids = range(1, 51, 1)
day_item_index = pd.MultiIndex.from_product(
    [days, item_ids], names=["Date", "Category"]
)
df = pd.DataFrame(
    np.random.uniform(1, 25, 550), index=day_item_index, columns=["Quantity"]
).reset_index()
# Bucket the numeric item ids into five interval labels, kept as plain strings.
bucket_edges = [0, 10, 20, 30, 40, 50]
df["Category"] = pd.cut(df["Category"], bins=bucket_edges).astype(str)
# One row per (day, bucket) holding the summed quantity.
df = df.groupby(["Date", "Category"]).sum()
# Multicategory x axis: the outer level is the day, the inner level the bucket.
# https://plotly.com/python/categorical-axes/#multicategorical-axes
day_labels = df.index.get_level_values("Date").strftime("%Y-%m-%d").tolist()
bucket_labels = df.index.get_level_values("Category").tolist()
go.Figure(go.Bar(x=[day_labels, bucket_labels], y=df["Quantity"]))

Related

How plot points based on categorical variable in plotly

I am using Plotly for visualization. I want to make plot, and give the points colors based on categorical variable.
# Asker's attempt: two Scatter traces that both pass `colors='Category'`.
# NOTE(review): go.Scatter does not appear to accept a `colors` keyword; marker
# colour is normally set through `marker=dict(color=...)` - confirm against the
# plotly reference.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.Predicted, y=df.Predicted,colors='Category',mode='markers',
))
fig.add_trace(go.Scatter(x=df.Predicted, y=df.real , colors='Category'
))
fig.show()
where Category is column in my dataframe. How can I do this kind of graph
You have implied a data frame structure, which I have simulated.
It's simpler to use the higher-level Plotly Express API than graph objects.
I have used two Plotly Express calls (px.scatter() and px.line()) to generate the traces defined in your question. In the second call I have renamed the traces so the legend is clear, and drawn them as lines.
import numpy as np
import pandas as pd
import plotly.express as px
# Simulated frame implied by the question: two sorted numeric columns and a
# categorical label drawn from A-D.
n_points = 100
predicted = np.sort(np.random.uniform(3, 15, n_points))
actual = np.sort(np.random.uniform(3, 15, n_points))
labels = np.random.choice(list("ABCD"), n_points)
df = pd.DataFrame({"Predicted": predicted, "real": actual, "Category": labels})
# Second set of traces: "real" against "Predicted", drawn as lines.
real_lines = px.line(df, x="Predicted", y="real", color="Category")
# Prefix the names so the legend distinguishes them from the scatter traces.
real_lines.for_each_trace(lambda t: t.update(name="real " + t.name))
# Overlay the lines on the per-category scatter of "Predicted" against itself.
px.scatter(df, x="Predicted", y="Predicted", color="Category").add_traces(
    real_lines.data
)

Adding outliers to plotly boxplot properly

I am building a series of boxplots from pre-calculated data using plotly graph_objects. My problem comes when I need to send the list of outliers for each plot. I did not find a proper way of sending them.
My code looks like this:
from plotly import graph_objects as go
fig = go.Figure()
# Asker's attempt: a single Box trace given the raw values (y) together with
# the pre-computed mean, quartile, median and fence columns.
fig.add_trace(go.Box(x = df.mes, y = df.json_agg, mean = df.media, q1 = df.p25, median = df.mediana, q3 = df.p75, lowerfence = df.li, upperfence = df.ls))
# Monthly ticks on the x axis, formatted as month-year.
fig.update_xaxes(
dtick="M1",
tickformat="%m-%Y",
ticklabelmode="period")
fig.show()
And my final plot:
What I need is for the outliers to be shown properly above or below each boxplot, not side by side.
Thanks, you all help a lot.
have simulated data to make your code sample work
whenever I try passing q3 plot fails to build
the parameter you need to show outliers is boxpoints https://plotly.com/python/box-plots/#styling-outliers
import plotly.graph_objects as go
import pandas as pd
import numpy as np
S = 1000  # number of simulated observations

# Month-end dates derived from monthly periods. This avoids the "M" offset
# alias of date_range, which newer pandas releases deprecate in favour of "ME",
# while producing the same ten month-end timestamps (Jan-Oct 2021).
month_ends = pd.period_range("2021-01", periods=10, freq="M").end_time.normalize()
df = pd.DataFrame(
    {
        "mes": np.random.choice(month_ends, S),
        "json_agg": np.random.uniform(-0.4, 0.5, S) * np.random.uniform(0.1, 1, S),
    }
)

# Per-month statistics broadcast back onto every row of that month.
# transform() works on the value column only, so it does not rely on
# groupby().apply() seeing the grouping column (deprecated in newer pandas).
by_month = df.groupby("mes")["json_agg"]
df = df.assign(
    media=by_month.transform("mean"),
    p25=by_month.transform(lambda v: np.percentile(v, 25)),
    p75=by_month.transform(lambda v: np.percentile(v, 75)),
    mediana=by_month.transform(lambda v: np.percentile(v, 50)),
    # The fences are simulated as the 20th / 80th percentiles.
    li=by_month.transform(lambda v: np.percentile(v, 20)),
    ls=by_month.transform(lambda v: np.percentile(v, 80)),
).sort_values("mes")
# One Box trace built from the raw values plus the pre-computed statistics.
box_trace = go.Box(
    x=df["mes"],
    y=df["json_agg"],
    mean=df["media"],
    q1=df["p25"],
    # q3=df["p75"] is left out on purpose: the plot was reported to fail to
    # build whenever it is passed.
    median=df["mediana"],
    lowerfence=df["li"],
    upperfence=df["ls"],
    boxpoints="outliers",  # the setting that makes outlier points show up
)
fig = go.Figure()
fig.add_trace(box_trace)
# fig.update_xaxes(dtick="M1", tickformat="%m-%Y", ticklabelmode="period")

How to create "Weekly Boxplots"?

I have dataset which looks like this. I have a data for month of two categories, 62 rows, 31 for each category. I would like to create a weekly boxplots with week number and month on the y-axis [like 01-12, 02-12, 03-12 and so on].
So far I have come up with the following code.
import seaborn as sns
import matplotlib.pyplot as plt
# Asker's attempt: apply seaborn's default theme and create one wide axes.
sns.set()
fig, ax = plt.subplots(figsize=(18,6))
# Index the frame by its parsed Timestamp column.
df.index = pd.to_datetime(df.Timestamp)
# Boxes are grouped by df.index.week, i.e. the week number of the year,
# which is the behaviour the question is asking to change.
sns.boxplot(x=df.index.week, y='Values', data=df, hue='Category', ax=ax)
By Using df.index.week, I am not getting the expected week value, instead it is giving me the week number of year like this.
Guidance please?
You can create a grouping column in your df by formatting values from the Date column:
# Every day of December 2013, repeated once per category (31 + 31 = 62 rows).
date_range = pd.date_range(start='2013-12-01', end='2013-12-31').to_list()
both_months = date_range + date_range
values = np.random.randint(1000, 20000, 62)
categories = ["anti"] * 31 + ["pro"] * 31
df = pd.DataFrame({"Date": both_months, "Values": values, "Category": categories})
Use pandas.Series.dt.strftime to get the week of the year (%U) and month (%m) joined by a -:
# %U is the week number of the year (weeks start on Sunday) and %m the
# zero-padded month, so December dates give labels such as "48-12".
df["week_month"] = df["Date"].dt.strftime("%U-%m")
(Thanks for the better method, @Cameron Riddell)
Then plot:
# One box per week_month label, split into side-by-side boxes by Category.
sns.boxplot(x="week_month", y="Values", data=df, hue="Category")

python plotly: box plot using column in dataframe

I am enjoying using plotly and wanted to plot boxplots for my data.
From their website, I do the following:
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np
# Asker's example: two hard-coded samples of 50 normal draws, the second
# shifted up by 1.
y0 = np.random.randn(50)
y1 = np.random.randn(50)+1
# One Box trace per sample, each with its own name and marker colour.
trace0 = go.Box(
y=y0,
name = 'Sample A',
marker = dict(
color = 'rgb(214, 12, 140)',
)
)
trace1 = go.Box(
y=y1,
name = 'Sample B',
marker = dict(
color = 'rgb(0, 128, 128)',
)
)
# The traces are listed by hand, which is what the question wants to avoid
# when the number of groups is not known in advance.
data = [trace0, trace1]
# NOTE(review): `py` is plotly.plotly, the online plotting module; newer
# plotly releases ship it separately as chart_studio - confirm before running.
py.iplot(data)
The challenge I have is that the total number of traces is unknown. For example:
titanic = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv")
I would like to plot a boxplot, by column 'embarked', a boxplot of the 'fare' column. Since the total number of unique values in 'embarked' is unknown, I do not want to hardcode that in.
Does anyone know how I can do this properly in plotly?
Thank you!
You could loop over your unique values in embarked and add a trace for each one. In this case there is also nan which needs separate treatment.
for embarked in titanic.embarked.unique():
import plotly
plotly.offline.init_notebook_mode()
import pandas as pd
import numpy as np
titanic = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv")
# One Box trace of fares per embarkation value. unique() also yields NaN for
# passengers with no recorded port, and NaN never compares equal to itself,
# so those rows have to be selected with isna() rather than ==.
traces = []
for embarked in titanic.embarked.unique():
    if pd.isna(embarked):
        in_group = titanic.embarked.isna()
    else:
        in_group = titanic.embarked == embarked
    traces.append(
        # str() leaves the port codes unchanged and labels the NaN group "nan".
        plotly.graph_objs.Box(y=titanic.loc[in_group, "fare"], name=str(embarked))
    )
plotly.offline.iplot(traces)

How to change bar size plotly time series

I'm working on an analysis of a player's games over time for League of Legends. I'm trying to create a histogram using plotly, with the date range on the x-axis and the number of games on the y-axis. This works, but I can't get individual bars for each day, only for each month. I've tried setting 'size' in xbins, but this doesn't change anything, I guess because the x-axis is in date form.
So, the question: in Plotly, how do I change the size of the bars on the histogram from a monthly bin size to a daily bin size?
Here's an example of the code:
from datetime import date, timedelta
import random
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np
from plotly import tools
from plotly.offline import *#plotly.offline.iplot()
init_notebook_mode(connected=True)
############## create date ranges #################
# Builds `dates`: every calendar day from d1 to d2 inclusive (93 entries).
d1 = date(2014, 3, 22) # start date
d2 = date(2014, 6, 22) # end date
delta = d2 - d1 # timedelta
dates = []
for i in range(delta.days + 1):
dates.append((d1 + timedelta(days=i)))
#################################################
# Asker's attempt: build three Histogram traces over `dates` and plot the
# first one on a date x axis with a range selector and a range slider.
def games_p_day():
# NOTE(review): on a date axis plotly appears to read xbins.size in
# milliseconds (or "M<n>" for months), so 1 would not mean one day - confirm
# against the Histogram reference.
sizeo = 1
# NOTE(review): each y list holds 99 random values while `dates` holds 93
# entries, so x and y differ in length.
trace_total = go.Histogram(
y=[random.randint(1, 10) for y in range(1, 100)],
x=dates,
name = 'total games',
xbins=dict(
size=sizeo
)
)
trace_wins = go.Histogram(
y=[random.randint(1, 10) for y in range(1, 100)],
x=dates,
name = 'won games',
xbins=dict(
size=sizeo
)
)
trace_losses = go.Histogram(
y=[random.randint(1, 10) for y in range(1, 100)],
x=dates,
name = 'lost games',
xbins=dict(
size=sizeo
)
)
# Layout: title, date-typed x axis with 1m / 6m / all selector buttons and a
# range slider, plus gaps between bars and bar groups.
layout = dict(
title = "Wins and losses over time",
xaxis=dict(
rangeselector=dict(
buttons=list([
dict(count=1,
label='1m',
step='month',
stepmode='backward'),
dict(count=6,
label='6m',
step='month',
stepmode='backward'),
dict(step='all')
])
),
rangeslider=dict(),
type='date',
),
bargap=0.2,
bargroupgap=0.1)
# Only the total-games trace is plotted; trace_wins and trace_losses are
# built above but not used.
data=[trace_total]
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Wins and losses over time")
games_p_day()
Any help massively appreciated.
Oh and if you see anything else that could help me (ie. bad code structure) please let me know!
A histogram is the representation of the distribution of numerical data. It seems to me that what you're aiming to do here is an aggregation of data from daily to weekly data. That is, as long as you'd like to have a time dimension and not count, average or any other aggregation function on your x-axis. If this is the case then the key to your challenge lies not in plotly itself but in aggregation and time functions such as resample('W-Mon', on='index').sum(). Here are some examples:
Plot for sampled raw data:
Code for raw data:
import pandas as pd
import numpy as np
import datetime
# data
np.random.seed(12)
numdays = 100
dates = pd.date_range('1/1/2020', periods=numdays)
games = np.random.randint(low=100, high=200, size=numdays).tolist()
losses = np.random.randint(low=0, high=100, size=numdays).tolist()
# Every game is either won or lost, so wins follow from games and losses.
# (Subtracting `wins` here, before it exists, raises a NameError.)
wins = (np.array(games) - np.array(losses)).tolist()
df = pd.DataFrame({'games': games, 'wins': wins, 'losses': losses}, index=dates)

# resample daily data to weekly sums; 'W-MON' is the canonical spelling of
# the weekly frequency anchored on Monday
df2 = df.reset_index().resample('W-MON', on='index').sum()
# All derived columns are built from df2 itself; there is no `df3`.
df2['formatted_date'] = pd.to_datetime(df2.index)
df2['year'] = df2['formatted_date'].dt.year
df2['week_of_year'] = df2['formatted_date'].dt.isocalendar().week
df2['year_week'] = df2['year'].map(str) + '_' + df2['week_of_year'].map(str)
# build and show plotly plot for daily games
# plotly is imported here because this snippet's import list omits it.
import plotly.graph_objects as go

fig = go.Figure(data=[go.Bar(name='games', x=df.index, y=df['games'])])
fig.show()
Plot for weekly aggregated data. Date as index:
Code for weekly aggregated data. Date as index:
# Weekly game totals as bars, with the weekly date index on the x axis.
weekly_games = go.Bar(name='games', x=df2.index, y=df2['games'])
fig = go.Figure(data=[weekly_games])
fig.show()
Plot for weekly aggregated data. Year and week number as index:
Code for weekly aggregated data. Year and week number as index:
# Weekly game totals as bars, labelled "<year>_<week number>" on the x axis.
weekly_games = go.Bar(name='games', x=df2['year_week'], y=df2['games'])
fig = go.Figure(data=[weekly_games])
fig.show()
Plot for weekly aggregated data, split on wins and losses:
Code for weekly aggregated data, split on wins and losses:
import pandas as pd
import numpy as np
import datetime
# data
np.random.seed(12)
numdays = 100
dates = pd.date_range('1/1/2020', periods=numdays)
games = np.random.randint(low=100, high=200, size=numdays).tolist()
losses = np.random.randint(low=0, high=100, size=numdays).tolist()
# Every game is either won or lost, so wins follow from games and losses.
# (Subtracting `wins` here, before it exists, raises a NameError.)
wins = (np.array(games) - np.array(losses)).tolist()
df = pd.DataFrame({'games': games, 'wins': wins, 'losses': losses}, index=dates)

# resample daily data to weekly sums; 'W-MON' is the canonical spelling of
# the weekly frequency anchored on Monday
df2 = df.reset_index().resample('W-MON', on='index').sum()
# All derived columns are built from df2 itself; there is no `df3`.
df2['formatted_date'] = pd.to_datetime(df2.index)
df2['year'] = df2['formatted_date'].dt.year
df2['week_of_year'] = df2['formatted_date'].dt.isocalendar().week
df2['year_week'] = df2['year'].map(str) + '_' + df2['week_of_year'].map(str)
# plotly is imported here because this snippet's import list omits it.
import plotly.graph_objects as go

# Weekly wins and losses as two bar series per "<year>_<week number>" label.
fig = go.Figure(
    data=[
        go.Bar(name='victory', x=df2['year_week'], y=df2['wins']),
        go.Bar(name='defeat', x=df2['year_week'], y=df2['losses']),
    ]
)
# Draw the two series side by side within each week.
fig.update_layout(barmode='group')
fig.show()

Categories