I am trying to distinguish weekends from weekdays by either 1) shading the region 2) coloring points with different colors or 3) setting x-axis label marked different for weekend.
Here I am trying the 2nd option — coloring data points for weekends differently. I first created an additional column (IsWeekend) to distinguish weekends from weekdays. However, the data is not drawn as a single line; instead, two separate lines with different colors are drawn. I would like a single line, with a different color for the values that fall on weekends.
Here’s my code for reproducible data:
import numpy as np
import pandas as pd
from datetime import datetime
import plotly.express as px

# Reproducible sample data: 21 consecutive days of normally distributed values.
np.random.seed(42)
rng = pd.date_range('2022-04-10', periods=21, freq='D')
practice_df = pd.DataFrame({'Date': rng, 'Val': np.random.randn(len(rng))})
practice_df = practice_df.set_index('Date')

# weekday() numbers Monday as 0 and Sunday as 6, so anything above 4 is a weekend.
weekend_list = [day.weekday() > 4 for day in practice_df.index]
practice_df['IsWeekend'] = weekend_list

# Colouring by 'IsWeekend' is what yields two separate lines (one per value)
# instead of a single line with differently coloured weekend points.
fig = px.line(practice_df,
              x=practice_df.index, y='Val',
              color='IsWeekend',
              markers=True)
fig.show()
What I want to do would look something like this but coloring data points/line for weekends differently.
Edit:
Thanks so much to @Derek_O, I was able to color the weekends with my original dataset. However, I would also like the Friday–Saturday line segment to be colored with the weekend legend color, so I set practice_df.index[i].weekday() >= 4 instead of practice_df.index[i].weekday() > 4.
But would it be possible to have the Friday point to be the same as weekdays.
Also, is it possible to have a straight line connecting the points, not like stairs?
Otherwise, it'd also work if we could shade weekend region like the image at the bottom.
Borrowing from @Rob Raymond's answer here, we can loop through practice_df two elements at a time, adding a trace to the fig on each iteration of the loop.
We also only want to show the legend category the first time it occurs (so that the legend entries only show each category like True or False once), which is why I've created a new column called "showlegend" that determines whether the legend is shown or not.
import numpy as np
import pandas as pd
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
def add_weekend_columns(df):
    """Return a copy of *df* with the helper columns used to style each trace.

    Args:
        df: DataFrame indexed by a ``DatetimeIndex``.

    Returns:
        A new DataFrame with four extra columns:
        ``IsWeekend`` (True for Saturday/Sunday), ``color`` (0 for weekend,
        1 for weekday; used as an index into a colour sequence), ``name``
        (``"True"``/``"False"`` legend label) and ``showlegend`` (True only
        on the first row of each category, so each legend entry appears once).
    """
    out = df.copy()
    # weekday numbers Monday as 0 and Sunday as 6, so anything above 4 is a weekend.
    out['IsWeekend'] = out.index.weekday > 4
    out['color'] = out['IsWeekend'].map({True: 0, False: 1})
    out['name'] = out['IsWeekend'].map({True: "True", False: "False"})
    # Flag the first occurrence of each category by position. This works for
    # any start date, not only for data whose first two rows happen to be one
    # weekend day followed by one weekday.
    first_of_category = ~out['IsWeekend'].duplicated()
    # object dtype so the individual values are plain Python bools.
    out['showlegend'] = first_of_category.astype(object)
    return out


if __name__ == "__main__":
    np.random.seed(42)
    rng = pd.date_range('2022-04-10', periods=21, freq='D')
    practice_df = pd.DataFrame({'Date': rng, 'Val': np.random.randn(len(rng))})
    practice_df = practice_df.set_index('Date')
    practice_df = add_weekend_columns(practice_df)

    # One trace per pair of consecutive points; each segment takes the colour
    # and legend group of its starting point. Positional access goes through
    # .iloc because the frame is indexed by dates, not integers.
    fig = go.Figure(
        [
            go.Scatter(
                x=practice_df.index[tn:tn + 2],
                y=practice_df['Val'].iloc[tn:tn + 2],
                mode='lines+markers',
                # line_shape="hv",  # uncomment for a stepped ("stairs") line
                line_color=px.colors.qualitative.Plotly[practice_df['color'].iloc[tn]],
                name=practice_df['name'].iloc[tn],
                legendgroup=practice_df['name'].iloc[tn],
                showlegend=practice_df['showlegend'].iloc[tn],
            )
            for tn in range(len(practice_df))
        ]
    )
    fig.update_layout(legend_title_text='Is Weekend')
    fig.show()
Related
I am trying to create sunburst chart using Plotly. My data consists of several different types of journeys of varying steps. Some journeys are 10 steps others are 100. But for the purposes of simplicity, let us consider only 3 steps.
Here is the data -
import pandas as pd
import plotly.express as px
import numpy as np
# One row per journey; None marks a step the journey never reached.
data = dict(
    step0=['home', 'home', 'product2', 'product1', 'home'],
    step1=['product1', 'product1', None, 'product2', None],
    step2=['product2', 'checkout', None, None, None],
    total_sales=[50, 20, 10, 0, 7],
)
data_df = pd.DataFrame.from_dict(data)
data_df.head()
I now try to plot these steps in sunburst chart. Because some journeys can be short, the subsequent steps are marked as None in those cases.
# Replace every missing step with the placeholder label 'end'.
data_df = data_df.fillna(value='end')
plotting code -
# Hierarchy levels of the sunburst, from the centre outwards.
journey_steps = ['step0', 'step1', 'step2']
fig = px.sunburst(
    data_df,
    path=journey_steps,
    values='total_sales',
    height=400,
)
fig.show()
As you can see above, the None have been filled by end because Plotly does not like NAs. But then I do not want to show the end in the sunburst chart.
I want to re-create something like this -
https://bl.ocks.org/kerryrodden/7090426
How can I make this work in Plotly?
One workaround that uses what you already have would be to instead fillna with an empty string like " " so the word "end" doesn't show on the chart. Then you can loop through the marker colors and marker labels in the fig.data[0] object, changing the marker color to transparent "rgba(0,0,0,0)" for every label that matches the empty string.
The only thing is that the hovertemplate will still show information for the part of the sunburst chart we have used our workaround to hide, but the static image will look correct.
For example:
import pandas as pd
import plotly.express as px
import numpy as np
# One row per journey; None marks a step the journey never reached.
data = dict(
    step0=['home', 'home', 'product2', 'product1', 'home'],
    step1=['product1', 'product1', None, 'product2', None],
    step2=['product2', 'checkout', None, None, None],
    total_sales=[50, 20, 10, 0, 7],
)
data_df = pd.DataFrame(data)
# A single space stands in for the missing steps so no visible word is drawn.
data_df = data_df.fillna(" ")

fig = px.sunburst(
    data_df,
    path=['step0', 'step1', 'step2'],
    values='total_sales',
    color=["red", "orange", "yellow", "green", "blue"],
    height=400,
)

## set marker colors whose labels are " " to transparent
sunburst_trace = fig.data[0]
marker_labels = list(sunburst_trace['labels'])
marker_colors = [
    "rgba(0,0,0,0)" if label == " " else color
    for color, label in zip(list(sunburst_trace.marker['colors']), marker_labels)
]
sunburst_trace.marker['colors'] = marker_colors
fig.show()
My problem is similar to the one encountered on this topic: Change heatmap's yticks for multi-index dataframe
I would like to have yticks every 6 months, with them being the index of my dataframe. But I can't manage to make it work.
The issue is that my dataframe is 13500*290 and the answer given in the link takes a long time and doesn't really work (see image below).
This is an example of my code without the solution from the link, this part works fine for me:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Daily index from 1984-06-10 to 2021-01-14 with 290 columns ('Pt0'..'Pt289')
# of uniform random values. The frame is built in one call rather than by
# inserting 290 columns one at a time.
dates = pd.date_range(datetime(1984, 6, 10), datetime(2021, 1, 14), freq='1D')
df = pd.DataFrame(
    np.random.random(size=(len(dates), 290)),
    index=dates,
    columns=['Pt{0}'.format(i) for i in range(290)],
)

f, ax = plt.subplots(figsize=(20, 20))
# Colour limits are the global minimum/maximum over all cells.
sns.heatmap(df, cmap='PuOr', vmin=df.values.min(), vmax=df.values.max(),
            cbar_kws={"label": "Ice Velocity (m/yr)"})
This part does not work for me and produces the figure below, which shouldn't have the stack of ylabels on the yaxis:
f, ax = plt.subplots(figsize=(20, 20))
years = df.index.get_level_values(0)
# Keep a label only at row positions 2, 7 and 12; every other row gets ''.
labelled_rows = (2, 7, 12)
ytickvalues = []
for index, year in enumerate(years):
    ytickvalues.append(year if index in labelled_rows else '')
sns.heatmap(
    df,
    cmap='PuOr',
    vmin=np.min(np.min(df)),
    vmax=np.max(np.max(df)),
    cbar_kws={"label": "Ice Velocity (m/yr)"},
    yticklabels=ytickvalues,
)
Here are a couple ways to adapt that link for your use case (1 label per 6 months):
Either: Show an empty string except on Jan 1 and Jul 1 (i.e., when %m%d evals to 0101 or 0701)
# Month-day stamps that should keep their tick label (Jan 1 and Jul 1).
half_year_starts = ['0101', '0701']
labels = [
    day if day.strftime('%m%d') in half_year_starts else ''
    for day in df.index.date
]
Or: Show an empty string except every ~365/2 days (i.e., when row % 183 == 0)
# Keep a label on every 183rd row (roughly half a year of daily data).
labels = [
    '' if row % 183 else day
    for row, day in enumerate(df.index.date)
]
Note that you don't have a MultiIndex, so you can just use df.index.date (no need for get_level_values).
Here is the output with a minimized version of your df:
# Colour limits span the full range of the data.
value_floor = df.values.min()
value_ceiling = df.values.max()
sns.heatmap(
    df,
    cmap='PuOr',
    vmin=value_floor,
    vmax=value_ceiling,
    yticklabels=labels,
    cbar_kws={'label': 'Ice Velocity (m/yr)'},
)
I'm trying to create a plot using Plotly that allows you to select from dropdown menus what features are being plotted on the x and y axis. My approach works, but there's a set of actions that remove the coloring of the points being plotted.
Here's a Colab with the steps to reproduce this written out, and done with minimal code (Plotly plays nice with Colab):
https://colab.research.google.com/drive/19PCS8QH9n6VVN9UBOKMay99VuSXq1QGG?usp=sharing
If you want to use your own environment, the following code will reproduce the issue after you've done the following 2 steps:
Pick one of the two dropdown menus and change the selected value at least one time
Change the selected value on the dropdown menu you have not changed yet
You should then see that the original coloring of the points is lost.
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_dark"
def get_correlation_figure_please(merged_df):
    """Build a scatter plot whose x and y features are chosen from dropdowns.

    Points are coloured by the ``'serial_number_id'`` column. Every column of
    *merged_df* whose dtype is not ``object`` is offered in both dropdowns;
    both start on the first such column.

    Args:
        merged_df: DataFrame holding the feature columns and a
            ``'serial_number_id'`` column.

    Returns:
        The plotly figure with the two dropdown menus attached.
    """
    cols = [col for col, t in zip(merged_df.columns, merged_df.dtypes) if t != object]
    start_dropdown_indices = [0, 0]

    # Create the scatter plot of the initially selected variables
    fig = px.scatter(
        merged_df,
        x=cols[start_dropdown_indices[0]],
        y=cols[start_dropdown_indices[1]],
        color='serial_number_id',
    )

    # With a discrete colour column the figure holds one trace per distinct
    # 'serial_number_id', named after that value. A dropdown update must hand
    # every trace only its own rows; sending the whole column to each trace
    # makes all traces draw all points and the colouring is lost.
    if len(fig.data) == 1:
        # Single trace (one category, or a continuous colour scale): it owns every row.
        trace_masks = [pd.Series(True, index=merged_df.index)]
    else:
        serial_labels = merged_df['serial_number_id'].astype(str)
        trace_masks = [serial_labels == str(trace.name) for trace in fig.data]

    # Create the drop-down menus which will be used to choose the desired file characteristics for comparison
    drop_downs = []
    for axis in ['x', 'y']:
        drop_downs.append([
            dict(
                method='update',
                args=[
                    # One array per trace, in trace order.
                    {axis: [merged_df.loc[mask, col].tolist() for mask in trace_masks]},
                    {'%saxis.title.text' % axis: col},
                ],
                label=col,
            )
            for col in cols
        ])

    # Sets up various aspects of the Plotly figure that is currently being produced. This ranges from
    # aesthetic things, to setting the dropdown menus as part of the figure
    fig.update_layout(
        title_x=0.4,
        showlegend=False,
        updatemenus=[
            {
                'active': start_j,
                'buttons': drop_down,
                'x': 1.125,
                'y': y_height,
                'xanchor': 'left',
                'yanchor': 'top',
            }
            for drop_down, start_j, y_height in zip(drop_downs, start_dropdown_indices, [1, .85])
        ],
    )
    return fig
def _serial_for(value):
    """Bucket a value into serial '0', '1' or '2' using thresholds 1/3 and 2/3."""
    if value < 1/3:
        return '0'
    if value < 2/3:
        return '1'
    return '2'


# Dummy dataframe: 20 points, each with 5 random features named '0'..'4'.
feature_columns = {str(j): np.random.rand(20) for j in range(5)}
df = pd.DataFrame(feature_columns)
# Dummy serial numbers derived from feature '1'; they decide each point's colour.
df['serial_number_id'] = df['1'].map(_serial_for)
fig = get_correlation_figure_please(df)
fig.show()
Financial time series are often fraught with missing data. And out of the box, plotly handles a series with missing timestamps visually by just displaying a line like below. But the challenge here is that plotly interprets the timestamps as a value, and inserts all missing dates in the figure.
Most of the time, I find that the plot would look better by just completely leaving those dates out.
An example from the plotly docs under https://plotly.com/python/time-series/#hiding-weekends-and-holidays shows how to handle missing dates for some date categories like weekends or holidays using:
weekend_break = dict(bounds=["sat", "mon"])  # hide weekends
holiday_break = dict(values=["2015-12-25", "2016-01-01"])  # hide Christmas and New Year's
fig.update_xaxes(rangebreaks=[weekend_break, holiday_break])
The downside here is that your dataset may just as well be missing some data for any other weekday. And of course you would have to specify given dates for holidays for different countries, so are there any other approaches?
Reproducible code:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
# data: 15 daily observations forming a cumulative series that starts at 100
np.random.seed(1234)
n_obs = 15
frequency = 'D'
daterange = pd.date_range('2020', freq=frequency, periods=n_obs)
values = np.random.randint(low=-5, high=6, size=n_obs).tolist()
df = pd.DataFrame({'time': daterange, 'value': values}).set_index('time')
df.iloc[0] = 100
df['value'] = df.value.cumsum()

# Missing timestamps: blank out two stretches of rows, then remove them
df.iloc[2:5] = np.nan
df.iloc[8:13] = np.nan
df.dropna(inplace=True)

# plotly figure
scatter_trace = go.Scatter(x=df.index, y=df['value'])
fig = go.Figure(scatter_trace)
fig.update_layout(template='plotly_dark')
fig.show()
The key here is still to use the rangebreaks attribute. If you were to follow the approach explained in the linked example, you would have to include each missing date manually. But the solution to missing data in this case is actually more missing data, and this is why:
1. You can retrieve the timestamps from the beginning and the end of your series, and then
2. build a complete timeline within that period (with possibly more missing dates) using:
# Complete daily timeline spanning the first to the last observed timestamp.
first_stamp, last_stamp = df.index[0], df.index[-1]
dt_all = pd.date_range(start=first_stamp, end=last_stamp, freq='D')
3. Next you can isolate the timestamps in that complete timeline that are missing from df.index using:
# Timeline entries that have no matching observation.
dt_breaks = [stamp for stamp in dt_all_py if stamp not in dt_obs_py]
4. And finally you can include those timestamps in rangebreaks like so:
# Hide every timestamp listed in dt_breaks from the x-axis.
missing_break = {'values': dt_breaks}
fig.update_xaxes(rangebreaks=[missing_break])
Plot:
Complete code:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
def find_missing_timestamps(index, freq='D'):
    """Return the timestamps of a regular timeline that are absent from *index*.

    The timeline runs from the earliest to the latest value of *index* at
    frequency *freq*.

    Args:
        index: ``DatetimeIndex`` of the observed timestamps.
        freq: pandas frequency string of the expected spacing.

    Returns:
        List of ``datetime.datetime`` objects, in chronological order, for
        every timeline entry without an observation. Empty when *index* is
        empty or has no gaps.
    """
    if len(index) == 0:
        return []
    dt_all = pd.date_range(start=index.min(), end=index.max(), freq=freq)
    # A set gives constant-time membership tests instead of scanning a list
    # of observations once per timeline entry.
    observed = set(index)
    return [stamp.to_pydatetime() for stamp in dt_all if stamp not in observed]


if __name__ == "__main__":
    # data
    np.random.seed(1234)
    n_obs = 15
    frequency = 'D'
    daterange = pd.date_range('2020', freq=frequency, periods=n_obs)
    values = np.random.randint(low=-5, high=6, size=n_obs).tolist()
    df = pd.DataFrame({'time': daterange, 'value': values})
    df = df.set_index('time')
    df.iloc[0] = 100
    df['value'] = df.value.cumsum()

    # Missing timestamps: remove rows 2-4 and 8-12 directly rather than
    # writing NaN into the integer column and calling dropna().
    removed_rows = list(range(2, 5)) + list(range(8, 13))
    df = df.drop(df.index[removed_rows])

    # plotly figure
    fig = go.Figure(go.Scatter(x=df.index, y=df['value']))
    fig.update_layout(template='plotly_dark')

    # find which timestamps are missing in the complete timeline
    dt_breaks = find_missing_timestamps(df.index, frequency)

    # remove missing timestamps from visualization
    fig.update_xaxes(
        rangebreaks=[dict(values=dt_breaks)]  # hide timestamps with no values
    )

    fig.update_layout(title=dict(text="Missing dates are excluded by rangebreaks"))
    fig.update_xaxes(showgrid=False)
    fig.show()
When handling large datasets, the 'rangebreaks' method works but performs poorly; changing the x-axis type to 'category' also works:
# Alternative to rangebreaks: set the x-axis type to 'category'.
fig.update_xaxes(type='category')
You can use the dtick property: set the tick interval to one day, which must be given in milliseconds (86400000). See the following code:
# dtick is given in milliseconds: one day = 24 h * 60 min * 60 s * 1000 ms.
ONE_DAY_MS = 24 * 60 * 60 * 1000
fig.update_xaxes(dtick=ONE_DAY_MS)
I want to plot machine observation data by days separately,
so changes between Current, Temperature etc. can be seen by hour.
Basically I want one plot for each day. Thing is when I make too many of these Jupyter Notebook can't display each one of them and plotly gives error.
f_day --> first day
n_day --> next day
I think of using sub_plots with a shared y-axis but then I don't know how I can put different dates in x-axis
How can I make these with graph objects and subplots, so that only one figure object is used and the plots don't crash?
Data looks like this
,ID,IOT_ID,DATE,Voltage,Current,Temperature,Noise,Humidity,Vibration,Open,Close
0,9466,5d36edfe125b874a36c6a210,2020-08-06 09:02:00,228.893,4.17,39.9817,73.1167,33.3133,2.05,T,F
1,9467,5d36edfe125b874a36c6a210,2020-08-06 09:03:00,228.168,4.13167,40.0317,69.65,33.265,2.03333,T,F
2,9468,5d36edfe125b874a36c6a210,2020-08-06 09:04:00,228.535,4.13,40.11,71.7,33.1717,2.08333,T,F
3,9469,5d36edfe125b874a36c6a210,2020-08-06 09:05:00,228.597,4.14,40.1683,71.95,33.0417,2.0666700000000002,T,F
4,9470,5d36edfe125b874a36c6a210,2020-08-06 09:06:00,228.405,4.13333,40.2317,71.2167,32.9933,2.0,T,F
Code with display error is this
# Draw one scatter figure per day, starting at the first day.
f_day = pd.Timestamp('2020-08-06 00:00:00')
for _ in range(days_between.days):
    n_day = f_day + pd.Timedelta('1 days')
    # Half-open window [f_day, n_day): a reading taken exactly at midnight
    # belongs to the day it starts, not to both adjacent days.
    in_day = (df["DATE"] >= f_day) & (df["DATE"] < n_day)
    fig_df = df[in_day & (df["IOT_ID"] == iot_id)]
    fig_cn = px.scatter(
        fig_df, x="DATE", y="Current", color="Noise", color_continuous_scale="Sunset",
        title=("IoT " + iot_id + " " + str(f_day.date())),
        # Same colour range on every figure so the days are comparable.
        range_color=(min_noise, max_noise)
    )
    fig_cn.show()
    f_day = n_day
updated
The question was with respect to plotly not matplotlib. Same approach works. Clearly axis and titles need some beautification
import pandas as pd
import plotly.subplots
import plotly.express as px
import datetime as dt
import random
# Simulated readings: one row every 15 minutes over four days.
df = pd.DataFrame([
    {"DATE": d, "IOT_ID": random.randint(1, 5), "Noise": random.uniform(0, 1), "Current": random.uniform(15, 25)}
    for d in pd.date_range(dt.datetime(2020, 9, 1), dt.datetime(2020, 9, 4, 23, 59), freq="15min")
])
# get days to plot
days = df["DATE"].dt.floor("D").unique()
iot_id = 3
# A title set on the per-day express figure is not carried over when only its
# trace is copied, so the titles are attached to the subplots instead.
day_titles = [f"IoT ({iot_id}) Date:{pd.to_datetime(d).strftime('%d %b')}" for d in days]
# create axis for each day
fig = plotly.subplots.make_subplots(rows=len(days), cols=1, subplot_titles=day_titles)
for i, d in enumerate(days):
    # filter data and plot ....
    mask = (df["DATE"].dt.floor("D") == d) & (df["IOT_ID"] == iot_id)
    splt = px.scatter(df.loc[mask], x="DATE", y="Current", color="Noise",
                      color_continuous_scale="Sunset")
    # select_traces() returns a generator; take its first trace
    fig.add_trace(next(splt.select_traces()), row=i + 1, col=1)
fig.show()
It's simple - create the axis that you want to plot on first. Then plot. I've simulated your data as you didn't provide in your question.
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import random
# Simulated readings: one row every 15 minutes over four days.
df = pd.DataFrame([
    {"DATE": d, "IOT_ID": random.randint(1, 5), "Noise": random.uniform(0, 1), "Current": random.uniform(15, 25)}
    for d in pd.date_range(dt.datetime(2020, 9, 1), dt.datetime(2020, 9, 4, 23, 59), freq="15min")
])
# get days to plot
days = df["DATE"].dt.floor("D").unique()
# create axis for each day; squeeze=False always returns a 2-D array of axes,
# so indexing by day also works when there is only a single day.
fig, ax = plt.subplots(len(days), figsize=[20, 10],
                       sharey=True, sharex=False, gridspec_kw={"hspace": 0.4},
                       squeeze=False)
ax = ax[:, 0]
iot_id = 3
for i, d in enumerate(days):
    # filter data and plot ....
    mask = (df["DATE"].dt.floor("D") == d) & (df["IOT_ID"] == iot_id)
    df.loc[mask].plot(kind="scatter", ax=ax[i], x="DATE", y="Current", c="Noise",
                      colormap="turbo",
                      title=f"IoT ({iot_id}) Date:{pd.to_datetime(d).strftime('%d %b')}")
    ax[i].set_xlabel("")  # it's in the titles...
output